From e39cab6256706e23d1e16177913dae190fe32e4f Mon Sep 17 00:00:00 2001
From: Fabian Thorand <fabian@channable.com>
Date: Fri, 14 Jan 2022 14:53:03 +0100
Subject: [PATCH] Defer freeing of mega block groups

Solves the quadratic worst-case performance of freeing megablocks that
was described in issue #19897.

During GC runs, we now keep a secondary free list for megablocks that is
neither sorted nor coalesced. That way, freeing becomes an O(1) operation
at the expense of not being able to reuse memory for larger allocations.
At the end of a GC run, the secondary free list is sorted and then
merged into the actual free list in a single pass.

This bounds the worst-case performance at O(n log n) rather than O(n^2).
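
As an illustration, a minimal self-contained sketch of the idea follows
(simplified, made-up types and names, not the actual RTS structures; it
omits NUMA nodes and coalescing). Frees become O(1) pushes onto an
unsorted deferred list; at commit time the deferred list is merge-sorted
by address and then merged into the address-sorted free list in one pass:

    #include <stddef.h>
    #include <stdint.h>

    typedef struct Node {
        void        *start;  /* address, the sort key */
        struct Node *link;
    } Node;

    static Node *free_list;          /* kept sorted by address */
    static Node *deferred_free_list; /* unsorted, built during a GC run */

    /* O(1): defer the free by pushing onto the deferred list. */
    static void defer_free(Node *n)
    {
        n->link = deferred_free_list;
        deferred_free_list = n;
    }

    /* Merge two address-sorted lists in a single pass; used at commit
     * time, after the deferred list has been merge-sorted by address. */
    static Node *merge_sorted(Node *a, Node *b)
    {
        Node dummy = { NULL, NULL };
        Node *cur = &dummy;
        while (a != NULL && b != NULL) {
            if ((uintptr_t)a->start < (uintptr_t)b->start) {
                cur->link = a;
                a = a->link;
            } else {
                cur->link = b;
                b = b->link;
            }
            cur = cur->link;
        }
        cur->link = (a != NULL) ? a : b;
        return dummy.link;
    }

The real implementation below additionally coalesces adjacent megablock
groups while it walks the free list during the merge.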

We postulate that temporarily losing coalescence during a single GC run
won't have any adverse effects in practice because:

- We would need to release enough memory during the GC, and then after
  that (but within the same GC run) allocate a megablock group of more
  than one megablock. This seems unlikely, as large objects are not
  copied during GC, and so we shouldn't need such large allocations
  during a GC run.
- Allocations of megablock groups of more than one megablock are rare.
  They only happen when a single heap object is large enough to require
  that amount of space. Any allocation areas that are supposed to hold
  more than one heap object cannot use megablock groups, because only
  the first megablock of a megablock group has valid `bdescr`s. Thus,
  heap objects can only start in the first megablock of a group, not in
  later ones.
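
The GC.c changes below wire this up; roughly, the intended call order is
(a sketch of the sequence, not the literal call sites):

    deferMBlockFreeing();     /* start of GC: free_mega_group() now only
                               * pushes onto the deferred list, in O(1) */
    /* ... collection runs, possibly freeing many megablock groups ... */
    commitMBlockFreeing();    /* end of GC: sort the deferred list and
                               * merge it into the free list in one pass */
    /* returnMemoryToOS() runs after the commit, so it sees the merged,
     * coalesced free list. */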
---
 rts/sm/BlockAlloc.c | 267 ++++++++++++++++++++++++++++++++++++++------
 rts/sm/BlockAlloc.h |   3 +
 rts/sm/GC.c         |  10 ++
 3 files changed, 245 insertions(+), 35 deletions(-)

diff --git a/rts/sm/BlockAlloc.c b/rts/sm/BlockAlloc.c
index 180a06f33ebc..d5651db6d8e8 100644
--- a/rts/sm/BlockAlloc.c
+++ b/rts/sm/BlockAlloc.c
@@ -161,11 +161,23 @@ static void  initMBlock(void *mblock, uint32_t node);
 static bdescr *free_list[MAX_NUMA_NODES][NUM_FREE_LISTS];
 static bdescr *free_mblock_list[MAX_NUMA_NODES];
 
+// To avoid quadratic runtime when freeing a large number of mblocks during a
+// single GC run, frees are deferred to a separate free list that is neither
+// sorted nor coalesced. As the final step of a GC run we then sort the
+// deferred list once and merge it with the actual free list in a single
+// pass.
+static bool defer_mblock_frees;
+static bdescr *deferred_free_mblock_list[MAX_NUMA_NODES];
+
 W_ n_alloc_blocks;   // currently allocated blocks
 W_ hw_alloc_blocks;  // high-water allocated blocks
 
 W_ n_alloc_blocks_by_node[MAX_NUMA_NODES];
 
+
+static bdescr* splitDeferredList(bdescr* head);
+static void sortDeferredList(bdescr** head);
+
 /* -----------------------------------------------------------------------------
    Initialisation
    -------------------------------------------------------------------------- */
@@ -371,41 +383,79 @@ split_block_low (bdescr *bd, W_ n)
     return bd;
 }
 
-/* Only initializes the start pointers on the first megablock and the
- * blocks field of the first bdescr; callers are responsible for calling
- * initGroup afterwards.
+
+/* Find a block that fits the allocation request in the given free list.
+   Returns:
+     - non-NULL: an exact match was found and removed from the free list.
+     - NULL: no exact match was found. In this case, the out parameter
+       `best` holds the best-fitting larger block from the free list, if any.
  */
 static bdescr *
-alloc_mega_group (uint32_t node, StgWord mblocks)
+alloc_mega_group_from_free_list (bdescr** free_list_head, StgWord n, bdescr** best)
 {
-    bdescr *best, *bd, *prev;
-    StgWord n;
-
-    n = MBLOCK_GROUP_BLOCKS(mblocks);
-
-    best = NULL;
+    bdescr *bd, *prev;
+    *best = NULL;
     prev = NULL;
-    for (bd = free_mblock_list[node]; bd != NULL; prev = bd, bd = bd->link)
+    for (bd = *free_list_head; bd != NULL; prev = bd, bd = bd->link)
     {
         if (bd->blocks == n)
         {
             if (prev) {
                 prev->link = bd->link;
             } else {
-                free_mblock_list[node] = bd->link;
+                *free_list_head = bd->link;
             }
             return bd;
         }
         else if (bd->blocks > n)
         {
-            if (!best || bd->blocks < best->blocks)
+            if (!*best || bd->blocks < (*best)->blocks)
             {
-                best = bd;
+                *best = bd;
             }
         }
     }
+    return NULL;
+}
 
-    if (best)
+/* Only initializes the start pointers on the first megablock and the
+ * blocks field of the first bdescr; callers are responsible for calling
+ * initGroup afterwards.
+ */
+static bdescr *
+alloc_mega_group (uint32_t node, StgWord mblocks)
+{
+    bdescr *best, *bd;
+    StgWord n;
+
+    n = MBLOCK_GROUP_BLOCKS(mblocks);
+
+    if(defer_mblock_frees)
+    {
+        // Freeing mega groups is currently deferred. Try to serve new requests
+        // preferentially from our deferred free list.
+        bd = alloc_mega_group_from_free_list(&deferred_free_mblock_list[node], n, &best);
+        if(bd)
+        {
+            return bd;
+        }
+        else if(!best)
+        {
+            // If there was neither an exact nor a best match, try the regular free list.
+            bd = alloc_mega_group_from_free_list(&free_mblock_list[node], n, &best);
+        }
+    }
+    else
+    {
+        // Otherwise, just always use the regular free list
+        bd = alloc_mega_group_from_free_list(&free_mblock_list[node], n, &best);
+    }
+
+    if (bd)
+    {
+        return bd;
+    }
+    else if (best)
     {
         // we take our chunk off the end here.
         StgWord best_mblocks  = BLOCKS_TO_MBLOCKS(best->blocks);
@@ -758,30 +808,90 @@ free_mega_group (bdescr *mg)
     bdescr *bd, *prev;
     uint32_t node;
 
-    // Find the right place in the free list.  free_mblock_list is
-    // sorted by *address*, not by size as the free_list is.
-    prev = NULL;
     node = mg->node;
-    bd = free_mblock_list[node];
-    while (bd && bd->start < mg->start) {
-        prev = bd;
-        bd = bd->link;
-    }
 
-    // coalesce backwards
-    if (prev)
-    {
-        mg->link = prev->link;
-        prev->link = mg;
-        mg = coalesce_mblocks(prev);
+    if(defer_mblock_frees) {
+        // Put the block on the deferred free list without coalescing.
+        mg->link = deferred_free_mblock_list[node];
+        deferred_free_mblock_list[node] = mg;
+    } else {
+        // Find the right place in the free list.  free_mblock_list is
+        // sorted by *address*, not by size as the free_list is.
+        prev = NULL;
+        bd = free_mblock_list[node];
+        while (bd && bd->start < mg->start) {
+            prev = bd;
+            bd = bd->link;
+        }
+
+        // coalesce backwards
+        if (prev)
+        {
+            mg->link = prev->link;
+            prev->link = mg;
+            mg = coalesce_mblocks(prev);
+        }
+        else
+        {
+            mg->link = free_mblock_list[node];
+            free_mblock_list[node] = mg;
+        }
+        // coalesce forwards
+        coalesce_mblocks(mg);
+
+        IF_DEBUG(sanity, checkFreeListSanity());
     }
-    else
-    {
-        mg->link = free_mblock_list[node];
-        free_mblock_list[node] = mg;
+}
+
+static void
+free_deferred_mega_groups (uint32_t node)
+{
+    bdescr *mg, *bd, *prev, *new_head;
+
+    sortDeferredList(&deferred_free_mblock_list[node]);
+
+    new_head = deferred_free_mblock_list[node];
+    deferred_free_mblock_list[node] = NULL;
+
+    // Keep track of our current position in the free list
+    prev = NULL;
+    bd = free_mblock_list[node];
+
+    while(new_head != NULL) {
+        // Obtain mblock to free
+        mg = new_head;
+        new_head = new_head->link;
+
+        // Find the right place in the free list. This is similar to the process
+        // in `free_mega_group`, but we can exploit that the deferred list is
+        // sorted: the search starts out where the previous mblock was inserted.
+        // This means we only need to traverse the free list once to free all
+        // the mblocks, rather than once per mblock.
+        while (bd && bd->start < mg->start) {
+            prev = bd;
+            bd = bd->link;
+        }
+
+        // coalesce backwards
+        if (prev)
+        {
+            mg->link = prev->link;
+            prev->link = mg;
+            mg = coalesce_mblocks(prev);
+        }
+        else
+        {
+            mg->link = free_mblock_list[node];
+            free_mblock_list[node] = mg;
+        }
+
+        // coalesce forwards
+        coalesce_mblocks(mg);
+
+        // Start the next search from the block we just inserted
+        prev = mg;
+        bd = prev->link;
     }
-    // coalesce forwards
-    coalesce_mblocks(mg);
 
     IF_DEBUG(sanity, checkFreeListSanity());
 }
@@ -949,6 +1059,93 @@ initMBlock(void *mblock, uint32_t node)
     }
 }
 
+// Split the linked list at its midpoint and return the second half.
+static bdescr* splitDeferredList(bdescr* head) {
+    bdescr *fast, *slow, *second_half;
+
+    slow = head;
+    fast = slow->link;
+
+    while(fast != NULL) {
+        fast = fast->link;
+        if(fast) {
+            fast = fast->link;
+            slow = slow->link;
+        }
+    }
+
+    second_half = slow->link;
+    // Cap first list at half
+    slow->link = NULL;
+    return second_half;
+}
+
+static void sortDeferredList(bdescr** head) {
+    bdescr *first_half, *second_half, *cur;
+
+    if(*head == NULL || (*head)->link == NULL) {
+        // 0 or 1 elements, done
+        return;
+    }
+
+    first_half = *head;
+    second_half = splitDeferredList(*head);
+
+    sortDeferredList(&first_half);
+    sortDeferredList(&second_half);
+
+    // Merge the two sorted halves by address
+    if(first_half->start < second_half->start) {
+        *head = first_half;
+        first_half = first_half->link;
+    } else {
+        *head = second_half;
+        second_half = second_half->link;
+    }
+    cur = *head;
+
+    while(first_half != NULL && second_half != NULL) {
+        if(first_half->start < second_half->start) {
+            cur->link = first_half;
+            first_half = first_half->link;
+        } else {
+            cur->link = second_half;
+            second_half = second_half->link;
+        }
+        cur = cur->link;
+    }
+
+    // One of the halves is exhausted; append the already-sorted remainder
+    while(first_half != NULL) {
+        cur->link = first_half;
+        first_half = first_half->link;
+        cur = cur->link;
+    }
+    while(second_half != NULL) {
+        cur->link = second_half;
+        second_half = second_half->link;
+        cur = cur->link;
+    }
+}
+
+void deferMBlockFreeing(void) {
+    if(defer_mblock_frees) {
+        barf("MBlock freeing is already deferred");
+    }
+    defer_mblock_frees = true;
+}
+
+void commitMBlockFreeing(void) {
+    if(! defer_mblock_frees) {
+        barf("MBlock freeing was never deferred");
+    }
+    defer_mblock_frees = false;
+
+    for(uint32_t node = 0; node < n_numa_nodes; node++) {
+        free_deferred_mega_groups(node);
+    }
+}
+
 /* -----------------------------------------------------------------------------
    Stats / metrics
    -------------------------------------------------------------------------- */
diff --git a/rts/sm/BlockAlloc.h b/rts/sm/BlockAlloc.h
index f28e35fc87a6..addee6cb8341 100644
--- a/rts/sm/BlockAlloc.h
+++ b/rts/sm/BlockAlloc.h
@@ -13,6 +13,9 @@
 bdescr *allocLargeChunk (W_ min, W_ max);
 bdescr *allocLargeChunkOnNode (uint32_t node, W_ min, W_ max);
 
+void deferMBlockFreeing(void);
+void commitMBlockFreeing(void);
+
 /* Debugging  -------------------------------------------------------------- */
 
 extern W_ countBlocks       (bdescr *bd);
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 15aef3a9fc54..71c1ecbfeb90 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -442,6 +442,11 @@ GarbageCollect (uint32_t collect_gen,
   memInventory(DEBUG_gc);
 #endif
 
+  // Defer all free calls for the megablock allocator to avoid quadratic runtime
+  // explosion when freeing a lot of memory in a single GC
+  // (https://gitlab.haskell.org/ghc/ghc/-/issues/19897).
+  deferMBlockFreeing();
+
   // do this *before* we start scavenging
   collectFreshWeakPtrs();
 
@@ -977,6 +982,11 @@ GarbageCollect (uint32_t collect_gen,
   resurrectThreads(resurrected_threads);
   ACQUIRE_SM_LOCK;
 
+  // Finally free the deferred mblocks by sorting the deferred free list and
+  // merging it into the actual sorted free list. This needs to happen here so
+  // that the `returnMemoryToOS` call down below can successfully free memory.
+  commitMBlockFreeing();
+
   if (major_gc) {
       W_ need_prealloc, need_live, need, got;
       uint32_t i;
-- 
GitLab