From 47d6acd3be1fadc0c59b7b4d4e105242c0ae0b90 Mon Sep 17 00:00:00 2001
From: Matthew Pickering <matthewtpickering@gmail.com>
Date: Thu, 4 Mar 2021 10:16:07 +0000
Subject: [PATCH] rts: Use a separate free block list for allocatePinned

The way in which allocatePinned took blocks out of the nursery was
leading to horrible fragmentation in some workloads.

The strategy now is that a separate free block list is reserved for
each capability and blocks are taken from there. When it's empty, the
global SM lock is taken and a fresh chunk of PINNED_EMPTY_SIZE blocks
is allocated.

Fixes #19481
---
 rts/Capability.c              |   1 +
 rts/Capability.h              |   2 +
 rts/sm/Sanity.c               |   7 +-
 rts/sm/Storage.c              | 162 +++++++++++++++++++++++++++++++---
 testsuite/tests/rts/T19481.hs |  56 ++++++++++++
 testsuite/tests/rts/all.T     |   2 +
 6 files changed, 215 insertions(+), 15 deletions(-)
 create mode 100644 testsuite/tests/rts/T19481.hs

diff --git a/rts/Capability.c b/rts/Capability.c
index 136a62a71e24..7a83821e0026 100644
--- a/rts/Capability.c
+++ b/rts/Capability.c
@@ -314,6 +314,7 @@ initCapability (Capability *cap, uint32_t i)
     cap->interrupt = 0;
     cap->pinned_object_block = NULL;
     cap->pinned_object_blocks = NULL;
+    cap->pinned_object_empty = NULL;
 
 #if defined(PROFILING)
     cap->r.rCCCS = CCS_SYSTEM;
diff --git a/rts/Capability.h b/rts/Capability.h
index df486829ea7b..14ba9ef2d7c7 100644
--- a/rts/Capability.h
+++ b/rts/Capability.h
@@ -93,6 +93,8 @@ struct Capability_ {
     bdescr *pinned_object_block;
     // full pinned object blocks allocated since the last GC
    bdescr *pinned_object_blocks;
+    // empty pinned object blocks, to be allocated into
+    bdescr *pinned_object_empty;
 
     // per-capability weak pointer list associated with nursery (older
     // lists stored in generation object)
diff --git a/rts/sm/Sanity.c b/rts/sm/Sanity.c
index 193a1a884c7c..b39559a653f8 100644
--- a/rts/sm/Sanity.c
+++ b/rts/sm/Sanity.c
@@ -1185,7 +1185,7 @@ memInventory (bool show)
 {
     uint32_t g, i;
     W_ gen_blocks[RtsFlags.GcFlags.generations];
-    W_ nursery_blocks = 0, retainer_blocks = 0,
+    W_ nursery_blocks = 0, free_pinned_blocks = 0, retainer_blocks = 0,
        arena_blocks = 0, exec_blocks = 0, gc_free_blocks = 0,
        upd_rem_set_blocks = 0;
     W_ live_blocks = 0, free_blocks = 0;
@@ -1223,6 +1223,7 @@ memInventory (bool show)
            nursery_blocks += capabilities[i]->pinned_object_block->blocks;
        }
        nursery_blocks += countBlocks(capabilities[i]->pinned_object_blocks);
+       free_pinned_blocks += countBlocks(capabilities[i]->pinned_object_empty);
   }
 
 #if defined(PROFILING)
@@ -1252,7 +1253,7 @@ memInventory (bool show)
   }
 
   live_blocks += nursery_blocks +
     + retainer_blocks + arena_blocks + exec_blocks + gc_free_blocks
-    + upd_rem_set_blocks;
+    + upd_rem_set_blocks + free_pinned_blocks;
 
 #define MB(n) (((double)(n) * BLOCK_SIZE_W) / ((1024*1024)/sizeof(W_)))
 
@@ -1271,6 +1272,8 @@ memInventory (bool show)
   }
   debugBelch("  nursery      : %5" FMT_Word " blocks (%6.1lf MB)\n",
              nursery_blocks, MB(nursery_blocks));
+  debugBelch("  empty pinned : %5" FMT_Word " blocks (%6.1lf MB)\n",
+             free_pinned_blocks, MB(free_pinned_blocks));
   debugBelch("  retainer     : %5" FMT_Word " blocks (%6.1lf MB)\n",
              retainer_blocks, MB(retainer_blocks));
   debugBelch("  arena blocks : %5" FMT_Word " blocks (%6.1lf MB)\n",
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index 2bab2d6432f1..82e959e8d290 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -67,6 +67,16 @@ generation *oldest_gen = NULL; /* oldest generation, for convenience */
 nursery *nurseries = NULL;
 uint32_t n_nurseries;
+/* Pinned Nursery Size: the number of blocks that we reserve for
+ * pinned data. The number chosen here decides whether pinned blocks
+ * are taken from the free list (if n < BLOCKS_PER_MBLOCK) or whether
+ * a fresh mblock is allocated each time.
+ * See Note [Sources of Block Level Fragmentation]
+ */
+
+#define PINNED_EMPTY_SIZE BLOCKS_PER_MBLOCK
+
 
 /*
  * When we are using nursery chunks, we need a separate next_nursery
  * pointer for each NUMA node.
@@ -353,6 +363,7 @@ void listAllBlocks (ListBlocksCb cb, void *user)
             cb(user, capabilities[i]->pinned_object_block);
         }
         cb(user, capabilities[i]->pinned_object_blocks);
+        cb(user, capabilities[i]->pinned_object_empty);
     }
 }
 
@@ -1257,25 +1268,47 @@ allocatePinned (Capability *cap, W_ n /*words*/, W_ alignment /*bytes*/, W_ alig
         // avoid that (benchmarks that allocate a lot of pinned
         // objects scale really badly if we do this).
         //
-        // So first, we try taking the next block from the nursery, in
-        // the same way as allocate().
-        bd = cap->r.rCurrentNursery->link;
+        // See Note [Sources of Block Level Fragmentation]
+        // for a more complete history of this section.
+        bd = cap->pinned_object_empty;
         if (bd == NULL) {
-            // The nursery is empty: allocate a fresh block (we can't fail
+            // The pinned block list is empty: allocate a fresh block (we can't fail
             // here).
             ACQUIRE_SM_LOCK;
-            bd = allocBlockOnNode(cap->node);
+            bd = allocNursery(cap->node, NULL, PINNED_EMPTY_SIZE);
             RELEASE_SM_LOCK;
-            initBdescr(bd, g0, g0);
-        } else {
-            newNurseryBlock(bd);
-            // we have a block in the nursery: steal it
-            cap->r.rCurrentNursery->link = bd->link;
-            if (bd->link != NULL) {
-                bd->link->u.back = cap->r.rCurrentNursery;
+        }
+
+        // Bump up the nursery pointer to avoid the pathological situation
+        // where a program is *only* allocating pinned objects.
+        // T4018 fails without this safety.
+        // This has the effect of counting a full pinned block in the same way
+        // as a full nursery block, so GCs will be triggered at the same interval
+        // if you are only allocating pinned data compared to normal allocations
+        // via allocate().
+        bdescr *nbd;
+        nbd = cap->r.rCurrentNursery->link;
+        if (nbd != NULL) {
+            newNurseryBlock(nbd);
+            cap->r.rCurrentNursery->link = nbd->link;
+            if (nbd->link != NULL) {
+                nbd->link->u.back = cap->r.rCurrentNursery;
             }
-            cap->r.rNursery->n_blocks -= bd->blocks;
+            dbl_link_onto(nbd, &cap->r.rNursery->blocks);
+            // Important for accounting purposes
+            if (cap->r.rCurrentAlloc) {
+                finishedNurseryBlock(cap, cap->r.rCurrentAlloc);
+            }
+            cap->r.rCurrentAlloc = nbd;
+        }
+
+        // Take the head of the reserved pinned block list.
+        cap->pinned_object_empty = bd->link;
+        newNurseryBlock(bd);
+        if (bd->link != NULL) {
+            bd->link->u.back = cap->pinned_object_empty;
        }
 
+        initBdescr(bd, g0, g0);
        cap->pinned_object_block = bd;
        bd->flags = BF_PINNED | BF_LARGE | BF_EVACUATED;
@@ -1912,3 +1945,106 @@ _bdescr (StgPtr p)
 }
 
 #endif
+
+/*
+Note [Sources of Block Level Fragmentation]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Block level fragmentation is when there is unused space in megablocks.
+The amount of fragmentation can be calculated as the difference between
+the total size of allocated blocks and the total size of allocated
+megablocks.
+
+The act of copying collection naturally reduces fragmentation by moving
+data between megablocks. Over time most megablocks end up quite full,
+because data is copied out of fragmented megablocks. The destination
+block for a copy is chosen from the free list, aiming for a gap of
+approximately the right size, so the copied data tends to end up in a
+less fragmented megablock.
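+
+To make the arithmetic above concrete, here is a small sketch in Haskell
+(the function and its arguments are illustrative, not an RTS API; it
+assumes the usual 4KB blocks and 1MB megablocks and ignores the space
+reserved for block descriptors at the start of each megablock):
+
+    -- Bytes covered by megablocks but not by any allocated block.
+    fragmentationBytes :: Word -> Word -> Word
+    fragmentationBytes nMegablocks nBlocks =
+        nMegablocks * 2^20 - nBlocks * 2^12
+
+For example, 500 megablocks holding only 1000 allocated blocks (about
+4 MB of data) works out to roughly 496 MiB of wasted address space.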
+
+There are two situations where we end up with block fragmentation:
+
+1. Fragmentation from pinned data
+2. Fragmentation from nursery allocated blocks
+
+# Pinned Data Fragmentation
+
+There are two sources of pinned data: large objects and pinned
+bytearrays. Once an object of one of these types has been allocated, it
+is never moved by the collector, so if all the other blocks around it
+are collected you can end up with a megablock containing one pinned
+block and no other blocks. No special effort is taken in the compiler
+to ensure that this kind of fragmentation doesn't happen in the first
+place, and once the heap is fragmented in this way, there's nothing you
+can do about it beyond hoping that the pinned data is eventually freed.
+
+# Nursery Fragmentation
+
+The other reason that a block may never be moved or emptied is if it
+forms part of the nursery. When the nursery is first allocated it is
+made up of megablock-sized chunks, so a 4 megabyte nursery will consist
+of blocks from about 4 megablocks.
+
+Over time, the nursery is resized (by resizeNurseries) under various
+conditions: it gets bigger when we are allocating more and smaller when
+we are allocating less. When the nursery is resized, blocks are added
+to or removed from it in potentially smaller units than a complete
+megablock. For example, if the nursery needs to grow by 1 block, the
+free list is consulted for a block of size 1 (from whichever megablock
+happens to have one free) and that block is added to the nursery.
+
+Over time the make-up of the nursery therefore changes from 4
+contiguous megablocks to a hodge-podge of blocks from different
+megablocks. In some programs (see #19481), the fragmentation is so bad
+that a program with only 4 MB of live data can retain over 500
+megablocks because each of these megablocks contributes a small number
+of blocks to the nursery.
+
+In particular, and confusingly, this second form of fragmentation was
+caused by the act of allocating pinned objects. `allocatePinned` was
+the primary reason that the nursery size decreased by small amounts.
+When `allocatePinned` needed a block, it took one permanently out of
+the nursery, which shrank the nursery by 1 block. The next time the
+nursery size was checked, `resizeNurseries` found that the existing
+nursery was smaller than the desired size and a new block needed to be
+added. That allocation was serviced from an arbitrary megablock which
+had some free space. The effect over time, as more allocation happened,
+was that the nursery became made up of blocks from many different
+megablocks.
+
+Instead we now maintain a separate small list of blocks for each
+capability in `pinned_object_empty`, from which fresh blocks are taken
+when we need a new one for a pinned allocation, rather than threatening
+the continuity of the nursery. The size of this list is controlled by
+the PINNED_EMPTY_SIZE macro.
+
+In theory, this kind of fragmentation due to the nursery could still
+happen, but in practice removing the primary cause (allocatePinned) was
+sufficient to greatly improve the situation. Another way to "fix"
+fragmentation of the nursery would be to periodically reallocate it
+when it is fragmented across many megablocks.
+
+Ticket: #19481
+
+# When can fragmentation be observed?
+
+Fragmentation is observed when the live data in a program is low
+compared to the overall resident size of the heap. The block allocator
+can reuse unused space within a megablock, so as residency increases
+again the fragmented megablocks get filled up. Having a block-level
+fragmented heap means your program never drops below a certain memory
+threshold, but it doesn't "use" more memory during periods of high
+residency. To clarify: say you observe 100 MB of fragmentation when
+your live data is 4 MB. If your live data rises to 200 MB, you will
+probably no longer observe 100 MB of fragmentation, because the block
+allocator will use the space in the fragmented megablocks. One way to
+watch for fragmentation from inside a program is sketched below.
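+
+A minimal sketch of such a check in Haskell, using the same GHC.Stats
+API as the T19481 test added below. The program must be run with
+the +RTS -T option; the 10x ratio is an arbitrary illustrative
+threshold, not an RTS constant:
+
+    import GHC.Stats (RTSStats (..), GCDetails (..), getRTSStats)
+
+    -- True when memory in use greatly exceeds live data after a GC,
+    -- which suggests (but does not prove) block-level fragmentation.
+    looksFragmented :: IO Bool
+    looksFragmented = do
+        d <- gc <$> getRTSStats
+        pure (gcdetails_mem_in_use_bytes d > 10 * gcdetails_live_bytes d)
+
+The T19481 test bounds the number of megablocks in use directly, which
+amounts to the same idea.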
+
+# How to observe fragmentation
+
+Your heap is probably fragmented when:
+
+* Live bytes is low
+* Memory in use (number of megablocks) is comparatively high
+* The size of the free list dominates residency (this can be observed
+  using the debug RTS and the memory inventory produced by -Dg)
+
+# Compacting Collector
+
+The compacting collector does nothing to improve megablock-level
+fragmentation. The role of the compacting GC is to remove object-level
+fragmentation and to use less memory when collecting; see #19248.
+*/
diff --git a/testsuite/tests/rts/T19481.hs b/testsuite/tests/rts/T19481.hs
new file mode 100644
index 000000000000..bd3ed6895fe6
--- /dev/null
+++ b/testsuite/tests/rts/T19481.hs
@@ -0,0 +1,56 @@
+{-# LANGUAGE UnboxedTuples #-}
+{-# LANGUAGE MagicHash #-}
+module Main where
+
+import GHC.Exts
+import GHC.IO
+import GHC.Stats
+import System.Mem
+import Control.Monad
+
+data BA = BA ByteArray#
+
+-- TODO: This shouldn't be hardcoded but MBLOCK_SIZE isn't exported by
+-- any RTS header I could find.
+mblockSize = 2 ^ 20
+
+main = do
+  -- Increasing this number increases the amount of fragmentation (but not
+  -- linearly)
+  ba <- replicateM 500 one
+  replicateM 100 performMajorGC
+  s <- getRTSStats
+  let mblocks = (gcdetails_mem_in_use_bytes (gc s) `div` mblockSize)
+  if mblocks < 15
+    then return ()
+    else error ("Heap is fragmented: " ++ show mblocks)
+  return ()
+
+one = do
+  ba <- mkBlock
+  bs <- mapM isP ba
+  return ()
+
+isP (BA ba) = IO $ \s0 -> (# s0, isTrue# (isByteArrayPinned# ba) #)
+
+mkN 0 = return []
+mkN k = (:) <$> mkBA <*> mkN (k - 1)
+
+-- Mixture of pinned and unpinned allocation so that allocatePinned takes
+-- some pinned blocks from the nursery.
+mkBlock = (++) <$> replicateM 100 mkBAP <*> replicateM 10000 mkBA
+
+mkBAP =
+  IO $ \s0 ->
+    -- 1024 is below the large object threshold but fills up a block quickly
+    case newPinnedByteArray# 1024# s0 of
+      (# s1, mba #) -> case unsafeFreezeByteArray# mba s1 of
+        (# s2, ba #) -> (# s2, BA ba #)
+
+mkBA =
+  IO $ \s0 ->
+    -- 1024 is below the large object threshold but fills up a block quickly
+    case newByteArray# 1024# s0 of
+      (# s1, mba #) -> case unsafeFreezeByteArray# mba s1 of
+        (# s2, ba #) -> (# s2, BA ba #)
diff --git a/testsuite/tests/rts/all.T b/testsuite/tests/rts/all.T
index e74834d2a154..9f2a54cd0f26 100644
--- a/testsuite/tests/rts/all.T
+++ b/testsuite/tests/rts/all.T
@@ -423,3 +423,5 @@ test('T17088', compile_and_run, ['-rtsopts -O2'])
 
 test('T15427', normal, compile_and_run, [''])
+
+test('T19481', extra_run_opts('+RTS -T -RTS'), compile_and_run, [''])