diff --git a/docs/users_guide/9.2.1-notes.rst b/docs/users_guide/9.2.1-notes.rst index de4a98300180ccdb715918e1542de38bbb83ff5b..3b0022fb8a8477e6e711c9e3419c8fa9daa2b981 100644 --- a/docs/users_guide/9.2.1-notes.rst +++ b/docs/users_guide/9.2.1-notes.rst @@ -150,8 +150,6 @@ Runtime system Moreover, we now correctly account for the size of the array, meaning that space lost to fragmentation is no longer counted as live data. - - - The ``-xt`` RTS flag has been removed. Now STACK and TSO closures are always included in heap profiles. Tooling can choose to filter out these closure types ` if necessary. @@ -162,6 +160,11 @@ Runtime system be consumed with ``eventlog2html``. This profiling mode does not require a profiling build. +- The RTS will now gradually return unused memory back to the OS rather than + retaining a large amount (up to 4 * live) indefinitely. The rate at which memory + is returned is controlled by the :rts-flag:`-Fd ⟨factor⟩`. Memory return + is triggered by consecutive idle collections. + ``ghc-prim`` library ~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/users_guide/runtime_control.rst b/docs/users_guide/runtime_control.rst index b8da4aee01421a510fbc69ff2d705bee41daad26..25b27fdd1b378cb7cf506747d99687ace3ceab87 100644 --- a/docs/users_guide/runtime_control.rst +++ b/docs/users_guide/runtime_control.rst @@ -577,6 +577,25 @@ performance. The :rts-flag:`-F ⟨factor⟩` setting will be automatically reduced by the garbage collector when the maximum heap size (the :rts-flag:`-M ⟨size⟩` setting) is approaching. +.. rts-flag:: -Fd ⟨factor⟩ + + :default: 4 + + .. index:: + single: heap size, factor + + The inverse rate at which unused memory is returned to the OS when it is no longer + needed. After a large amount of allocation the RTS will start by retaining + a lot of allocated blocks in case it will need them again shortly but then + it will gradually release them based on the :rts-flag:`-Fd ⟨factor⟩`. On + each subsequent major collection which is not caused by a heap overflow a little + more memory will attempt to be returned until the amount retained is similar to + the amount of live bytes. + + Increasing this factor will make the rate memory is returned slower, decreasing + it will make memory be returned more eagerly. Setting it to 0 will disable the + memory return (which will emulate the behaviour in releases prior to 9.2). + .. rts-flag:: -G ⟨generations⟩ :default: 2 diff --git a/includes/rts/Flags.h b/includes/rts/Flags.h index eda961656db48f2179e098973b64fb66c418fb15..ff05426e8a881fb160fd41c5580b29e3ce13006a 100644 --- a/includes/rts/Flags.h +++ b/includes/rts/Flags.h @@ -50,6 +50,7 @@ typedef struct _GC_FLAGS { uint32_t heapSizeSuggestion; /* in *blocks* */ bool heapSizeSuggestionAuto; double oldGenFactor; + double returnDecayFactor; double pcFreeHeap; bool useNonmoving; // default = false diff --git a/libraries/base/GHC/RTS/Flags.hsc b/libraries/base/GHC/RTS/Flags.hsc index 2abe5d7d85479a67f057da8296f0f7256ebb34f6..138033758bea514f5bbc334ad5dc77157dbe828a 100644 --- a/libraries/base/GHC/RTS/Flags.hsc +++ b/libraries/base/GHC/RTS/Flags.hsc @@ -131,6 +131,7 @@ data GCFlags = GCFlags , heapSizeSuggestion :: Word32 , heapSizeSuggestionAuto :: Bool , oldGenFactor :: Double + , returnDecayFactor :: Double , pcFreeHeap :: Double , generations :: Word32 , squeezeUpdFrames :: Bool @@ -435,6 +436,7 @@ getGCFlags = do <*> (toBool <$> (#{peek GC_FLAGS, heapSizeSuggestionAuto} ptr :: IO CBool)) <*> #{peek GC_FLAGS, oldGenFactor} ptr + <*> #{peek GC_FLAGS, returnDecayFactor} ptr <*> #{peek GC_FLAGS, pcFreeHeap} ptr <*> #{peek GC_FLAGS, generations} ptr <*> (toBool <$> diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c index fa4af7f7a692e8fb24008ab28ede14930fbfa3b8..9bf3f692ab10d34f55c6e0cfe0a304784b340ae8 100644 --- a/rts/RtsFlags.c +++ b/rts/RtsFlags.c @@ -164,6 +164,7 @@ void initRtsFlagsDefaults(void) RtsFlags.GcFlags.heapSizeSuggestionAuto = false; RtsFlags.GcFlags.pcFreeHeap = 3; /* 3% */ RtsFlags.GcFlags.oldGenFactor = 2; + RtsFlags.GcFlags.returnDecayFactor = 4; RtsFlags.GcFlags.useNonmoving = false; RtsFlags.GcFlags.nonmovingSelectorOpt = false; RtsFlags.GcFlags.generations = 2; @@ -324,6 +325,12 @@ usage_text[] = { " -F<n> Sets the collecting threshold for old generations as a factor of", " the live data in that generation the last time it was collected", " (default: 2.0)", +" -Fd<n> Sets the inverse rate which memory is returned to the OS after being", +" optimistically retained after being allocated. Subsequent major", +" collections not caused by heap overflow will return an amount of", +" memory controlled by this factor (higher is slower). Setting the factor", +" to 0 means memory is not returned.", +" (default 4.0)", " -n<size> Allocation area chunk size (0 = disabled, default: 0)", " -O<size> Sets the minimum size of the old generation (default 1M)", " -M<size> Sets the maximum heap size (default unlimited) Egs: -M256k -M1G", @@ -1153,10 +1160,19 @@ error = true; case 'F': OPTION_UNSAFE; - RtsFlags.GcFlags.oldGenFactor = atof(rts_argv[arg]+2); + switch(rts_argv[arg][2]) { + case 'd': + RtsFlags.GcFlags.returnDecayFactor = atof(rts_argv[arg]+3); + if (RtsFlags.GcFlags.returnDecayFactor < 0) + bad_option( rts_argv[arg] ); + break; + default: + RtsFlags.GcFlags.oldGenFactor = atof(rts_argv[arg]+2); - if (RtsFlags.GcFlags.oldGenFactor < 0) - bad_option( rts_argv[arg] ); + if (RtsFlags.GcFlags.oldGenFactor < 0) + bad_option( rts_argv[arg] ); + break; + }; break; case 'D': diff --git a/rts/Schedule.c b/rts/Schedule.c index d9d5c9a74a99c12dfa1e05813076af265cd1b9aa..e0631482c91c2bceb6f9bf4d35f9e22410ce8a24 100644 --- a/rts/Schedule.c +++ b/rts/Schedule.c @@ -166,7 +166,7 @@ static bool scheduleHandleThreadFinished( Capability *cap, Task *task, StgTSO *t ); static bool scheduleNeedHeapProfile(bool ready_to_gc); static void scheduleDoGC( Capability **pcap, Task *task, - bool force_major, bool deadlock_detect ); + bool force_major, bool is_overflow_gc, bool deadlock_detect ); static void deleteThread (StgTSO *tso); static void deleteAllThreads (void); @@ -267,7 +267,7 @@ schedule (Capability *initialCapability, Task *task) case SCHED_INTERRUPTING: debugTrace(DEBUG_sched, "SCHED_INTERRUPTING"); /* scheduleDoGC() deletes all the threads */ - scheduleDoGC(&cap,task,true,false); + scheduleDoGC(&cap,task,true,false,false); // after scheduleDoGC(), we must be shutting down. Either some // other Capability did the final GC, or we did it above, @@ -576,7 +576,7 @@ run_thread: } if (ready_to_gc || scheduleNeedHeapProfile(ready_to_gc)) { - scheduleDoGC(&cap,task,false,false); + scheduleDoGC(&cap,task,false,ready_to_gc,false); } } /* end of while() */ } @@ -951,7 +951,7 @@ scheduleDetectDeadlock (Capability **pcap, Task *task) // they are unreachable and will therefore be sent an // exception. Any threads thus released will be immediately // runnable. - scheduleDoGC (pcap, task, true/*force major GC*/, true/*deadlock detection*/); + scheduleDoGC (pcap, task, true/*force major GC*/, false /* Whether it is an overflow GC */, true/*deadlock detection*/); cap = *pcap; // when force_major == true. scheduleDoGC sets // recent_activity to ACTIVITY_DONE_GC and turns off the timer @@ -1025,7 +1025,7 @@ scheduleProcessInbox (Capability **pcap USED_IF_THREADS) while (!emptyInbox(cap)) { // Executing messages might use heap, so we should check for GC. if (doYouWantToGC(cap)) { - scheduleDoGC(pcap, cap->running_task, false, false); + scheduleDoGC(pcap, cap->running_task, false, false, false); cap = *pcap; } @@ -1590,7 +1590,7 @@ void releaseAllCapabilities(uint32_t n, Capability *keep_cap, Task *task) // behind deadlock_detect argument. static void scheduleDoGC (Capability **pcap, Task *task USED_IF_THREADS, - bool force_major, bool deadlock_detect) + bool force_major, bool is_overflow_gc, bool deadlock_detect) { Capability *cap = *pcap; bool heap_census; @@ -1883,9 +1883,9 @@ delete_threads_and_gc: // emerge they don't immediately re-enter the GC. pending_sync = 0; signalCondition(&sync_finished_cond); - GarbageCollect(collect_gen, heap_census, deadlock_detect, gc_type, cap, idle_cap); + GarbageCollect(collect_gen, heap_census, is_overflow_gc, deadlock_detect, gc_type, cap, idle_cap); #else - GarbageCollect(collect_gen, heap_census, deadlock_detect, 0, cap, NULL); + GarbageCollect(collect_gen, heap_census, is_overflow_gc, deadlock_detect, 0, cap, NULL); #endif // If we're shutting down, don't leave any idle GC work to do. @@ -2773,7 +2773,7 @@ exitScheduler (bool wait_foreign USED_IF_THREADS) nonmovingStop(); Capability *cap = task->cap; waitForCapability(&cap,task); - scheduleDoGC(&cap,task,true,false); + scheduleDoGC(&cap,task,true,false,false); ASSERT(task->incall->tso == NULL); releaseCapability(cap); } @@ -2841,7 +2841,7 @@ performGC_(bool force_major) // TODO: do we need to traceTask*() here? waitForCapability(&cap,task); - scheduleDoGC(&cap,task,force_major,false); + scheduleDoGC(&cap,task,force_major,false,false); releaseCapability(cap); exitMyTask(); } diff --git a/rts/sm/GC.c b/rts/sm/GC.c index 2911aad7a0ff52cbcbafb2a0bcf498c1287386ae..1a71bd7bf09a59dc9c4ba9301b1467f3f0f42fcb 100644 --- a/rts/sm/GC.c +++ b/rts/sm/GC.c @@ -120,6 +120,8 @@ bool unload_mark_needed; */ static W_ g0_pcnt_kept = 30; // percentage of g0 live at last minor GC +static int consec_idle_gcs = 0; + /* Mut-list stats */ #if defined(DEBUG) // For lack of a better option we protect mutlist_scav_stats with oldest_gen->sync @@ -261,6 +263,7 @@ addMutListScavStats(const MutListScavStats *src, void GarbageCollect (uint32_t collect_gen, const bool do_heap_census, + const bool is_overflow_gc, const bool deadlock_detect, uint32_t gc_type USED_IF_THREADS, Capability *cap, @@ -981,11 +984,26 @@ GarbageCollect (uint32_t collect_gen, } #endif - /* If the amount of data remains constant, next major GC we'll - * require (F+1)*live + prealloc. We leave (F+2)*live + prealloc - * in order to reduce repeated deallocation and reallocation. #14702 - */ - need = need_prealloc + (RtsFlags.GcFlags.oldGenFactor + 2) * need_live; + // Reset the counter if the major GC was caused by a heap overflow + consec_idle_gcs = is_overflow_gc ? 0 : consec_idle_gcs + 1; + + // See Note [Scaling retained memory] + double scaled_factor = + RtsFlags.GcFlags.returnDecayFactor > 0 + ? RtsFlags.GcFlags.oldGenFactor / pow(2, (float) consec_idle_gcs / RtsFlags.GcFlags.returnDecayFactor) + : RtsFlags.GcFlags.oldGenFactor; + + debugTrace(DEBUG_gc, "factors: %f %d %f", RtsFlags.GcFlags.oldGenFactor, consec_idle_gcs, scaled_factor ); + + // Unavoidable need depends on GC strategy + // * Copying need 2 * live + // * Compacting need 1.x * live (we choose 1.2) + // * Nonmoving needs ~ 1.x * live + double unavoidable_need_factor = (oldest_gen->compact || RtsFlags.GcFlags.useNonmoving) + ? 1.2 : 2; + W_ scaled_needed = (scaled_factor + unavoidable_need_factor) * need_live; + debugTrace(DEBUG_gc, "factors_2: %f %d", unavoidable_need_factor, scaled_needed); + need = need_prealloc + scaled_needed; /* Also, if user set heap size, do not drop below it. */ @@ -1003,6 +1021,7 @@ GarbageCollect (uint32_t collect_gen, need = BLOCKS_TO_MBLOCKS(need); got = mblocks_allocated; + debugTrace(DEBUG_gc,"Returning: %d %d", got, need); uint32_t returned = 0; if (got > need) { @@ -2208,3 +2227,53 @@ bool doIdleGCWork(Capability *cap STG_UNUSED, bool all) * work_stealing is "mostly immutable". We set it to false when we begin the * final sequential collections, for the benefit of notifyTodoBlock. * */ + +/* Note [Scaling retained memory] + * Tickets: #19381 #19359 #14702 + * + * After a spike in memory usage we have been conservative about returning + * allocated blocks to the OS in case we are still allocating a lot and would + * end up just reallocating them. The result of this was that up to 4 * live_bytes + * of blocks would be retained once they were allocated even if memory usage ended up + * a lot lower. + * + * For a heap of size ~1.5G, this would result in OS memory reporting 6G which is + * both misleading and worrying for users. + * In long-lived server applications this results in consistent high memory + * usage when the live data size is much more reasonable (for example ghcide) + * + * Therefore we have a new (2021) strategy which starts by retaining up to 4 * live_bytes + * of blocks before gradually returning uneeded memory back to the OS on subsequent + * major GCs which are NOT caused by a heap overflow. + * + * Each major GC which is NOT caused by heap overflow increases the consec_idle_gcs + * counter and the amount of memory which is retained is inversely proportional to this number. + * By default the excess memory retained is + * oldGenFactor (controlled by -F) / 2 ^ (consec_idle_gcs * returnDecayFactor) + * + * On a major GC caused by a heap overflow, the `consec_idle_gcs` variable is reset to 0 + * (as we could continue to allocate more, so retaining all the memory might make sense). + * + * Therefore setting bigger values for `-Fd` makes the rate at which memory is returned slower. + * Smaller values make it get returned faster. Setting `-Fd0` means no additional memory + * is retained. + * + * The default is `-Fd4` which results in the following scaling: + * + * > mapM print [(x, 1/ (2**(x / 4))) | x <- [1 :: Double ..20]] + * (1.0,0.8408964152537146) + * ... + * (4.0,0.5) + * ... + * (12.0,0.125) + * ... + * (20.0,3.125e-2) + * + * So after 12 consecutive GCs only 0.1 of the maximum memory used will be retained. + * + * Further to this decay factor, the amount of memory we attempt to retain is + * also influenced by the GC strategy for the oldest generation. If we are using + * a copying strategy then we will need at least 2 * live_bytes for copying to take + * place, so we always keep that much. If using compacting or nonmoving then we need a lower number, + * so we just retain at least `1.2 * live_bytes` for some protection. + */ diff --git a/rts/sm/GC.h b/rts/sm/GC.h index 239f281910c0c543b0d5521ab43b2d541b41f378..da90c613024bb6f186f4c4c061ca862f3b98b81f 100644 --- a/rts/sm/GC.h +++ b/rts/sm/GC.h @@ -19,6 +19,7 @@ void GarbageCollect (uint32_t collect_gen, bool do_heap_census, + bool is_overflow_gc, bool deadlock_detect, uint32_t gc_type, Capability *cap, diff --git a/testsuite/tests/rts/T19381.hs b/testsuite/tests/rts/T19381.hs new file mode 100644 index 0000000000000000000000000000000000000000..1b65e06ac9aec7352b5bfe07fe336497deef6628 --- /dev/null +++ b/testsuite/tests/rts/T19381.hs @@ -0,0 +1,40 @@ +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE UnboxedTuples #-} +{-# LANGUAGE BangPatterns #-} +module Main where + +import GHC.Exts +import GHC.IO +import GHC.Stats +import System.Mem +import Control.Monad + +data BA = BA ByteArray# + +mblockSize = 2 ^ 20 + +main = do + -- Allocate 1000 byte arrays, to get a high watermark before only keeping + -- 100 of them. + ba <- take 100 <$> replicateM 1000 mkBA + let !n = (length ba) + -- Each major GC should free some amount of memory, 100 is just a large + -- number + replicateM 100 performMajorGC + s <- getRTSStats + let mblocks = (gcdetails_mem_in_use_bytes (gc s) `div` mblockSize) + live = (gcdetails_live_bytes (gc s) `div` mblockSize) + if fromIntegral mblocks < (2.2 * fromIntegral live) + then return () + else error ("Additional memory is retained: " + ++ show live ++ "/" + ++ show mblocks) + -- Here to retain the ba + (length ba) `seq` return () + +mkBA = + let (I# siz) = 2^19 -- ~0.1MB + in IO $ \s0 -> + case newByteArray# siz s0 of + (# s1, mba #) -> case unsafeFreezeByteArray# mba s1 of + (# s2, ba #) -> (# s2, BA ba #) diff --git a/testsuite/tests/rts/all.T b/testsuite/tests/rts/all.T index 9f2a54cd0f269d130c6b7c3841c080e95749daaa..7100aaf3d79d3bc7925cd08c6eafcd677480c1a4 100644 --- a/testsuite/tests/rts/all.T +++ b/testsuite/tests/rts/all.T @@ -425,3 +425,4 @@ test('T17088', test('T15427', normal, compile_and_run, ['']) test('T19481', extra_run_opts('+RTS -T -RTS'), compile_and_run, ['']) +test('T19381', extra_run_opts('+RTS -T -RTS'), compile_and_run, [''])