Commit f7bbc343 authored by Simon Marlow's avatar Simon Marlow Committed by Ben Gamari

Run C finalizers incrementally during mutation

With a large heap it's possible to build up a lot of finalizers
between GCs.  We've observed GC spending up to 50% of its time running
finalizers.  But there's no reason we have to run finalizers during
GC, and especially no reason we have to block *all* the mutator
threads while *one* GC thread runs finalizers one by one.

I thought about a bunch of alternative ways to handle this, which are
documented along with runSomeFinalizers() in Weak.c.  The approach I
settled on is to have a capability run finalizers if it is idle.  So
running finalizers is like a low-priority background thread. This
requires some minor scheduler changes, but not much.  In the future we
might be able to move more GC work into here (I have my eye on freeing
large blocks, for example).

Test Plan:
* validate
* tested on our system and saw reductions in GC pauses of 40-50%.

Reviewers: bgamari, niteria, osa1, erikd

Reviewed By: bgamari, osa1

Subscribers: rwbarton, thomie, carter

Differential Revision: https://phabricator.haskell.org/D4521
parent cf809950
......@@ -679,7 +679,11 @@ scheduleYield (Capability **pcap, Task *task)
// otherwise yield (sleep), and keep yielding if necessary.
do {
didGcLast = yieldCapability(&cap,task, !didGcLast);
if (doIdleGCWork(cap, false)) {
didGcLast = false;
} else {
didGcLast = yieldCapability(&cap,task, !didGcLast);
}
}
while (shouldYieldCapability(cap,task,didGcLast));
......@@ -1798,6 +1802,9 @@ delete_threads_and_gc:
}
#endif
// Do any remaining idle GC work from the previous GC
doIdleGCWork(cap, true /* all of it */);
#if defined(THREADED_RTS)
// reset pending_sync *before* GC, so that when the GC threads
// emerge they don't immediately re-enter the GC.
......@@ -1807,6 +1814,11 @@ delete_threads_and_gc:
GarbageCollect(collect_gen, heap_census, 0, cap, NULL);
#endif
// If we're shutting down, don't leave any idle GC work to do.
if (sched_state == SCHED_SHUTTING_DOWN) {
doIdleGCWork(cap, true /* all of it */);
}
traceSparkCounters(cap);
switch (recent_activity) {
......
......@@ -17,6 +17,12 @@
#include "ThreadLabels.h"
#include "Trace.h"
// List of dead weak pointers collected by the last GC
static StgWeak *finalizer_list = NULL;
// Count of the above list.
static uint32_t n_finalizers = 0;
void
runCFinalizers(StgCFinalizerList *list)
{
......@@ -84,15 +90,16 @@ scheduleFinalizers(Capability *cap, StgWeak *list)
StgMutArrPtrs *arr;
StgWord size;
uint32_t n, i;
Task *task;
task = myTask();
if (task != NULL) {
task->running_finalizers = true;
}
ASSERT(n_finalizers == 0);
finalizer_list = list;
// count number of finalizers, and kill all the weak pointers first...
// Traverse the list and
// * count the number of Haskell finalizers
// * overwrite all the weak pointers with DEAD_WEAK
n = 0;
i = 0;
for (w = list; w; w = w->link) {
// Better not be a DEAD_WEAK at this stage; the garbage
// collector removes DEAD_WEAKs from the weak pointer list.
......@@ -102,7 +109,8 @@ scheduleFinalizers(Capability *cap, StgWeak *list)
n++;
}
runCFinalizers((StgCFinalizerList *)w->cfinalizers);
// Remember the length of the list, for runSomeFinalizers() below
i++;
#if defined(PROFILING)
// A weak pointer is inherently used, so we do not need to call
......@@ -113,14 +121,16 @@ scheduleFinalizers(Capability *cap, StgWeak *list)
// no need to fill the slop, either. See stg_DEAD_WEAK_info
// in StgMiscClosures.cmm.
#endif
// We must overwrite the header with DEAD_WEAK, so that if
// there's a later call to finalizeWeak# on this weak pointer,
// we don't run the finalizer again.
SET_HDR(w, &stg_DEAD_WEAK_info, w->header.prof.ccs);
}
if (task != NULL) {
task->running_finalizers = false;
}
n_finalizers = i;
// No finalizers to run?
// No Haskell finalizers to run?
if (n == 0) return;
debugTrace(DEBUG_weak, "weak: batching %d finalizers", n);
......@@ -156,3 +166,90 @@ scheduleFinalizers(Capability *cap, StgWeak *list)
scheduleThread(cap,t);
labelThread(cap, t, "weak finalizer thread");
}
/* -----------------------------------------------------------------------------
Incrementally running C finalizers
The GC detects all the dead finalizers, but we don't want to run
them during the GC because that increases the time that the runtime
is paused.
What options are there?
1. Parallelise running the C finalizers across the GC threads
- doesn't solve the pause problem, just reduces it (maybe by a lot)
2. Make a Haskell thread to run the C finalizers, like we do for
Haskell finalizers.
+ scheduling is handled for us
- no guarantee that we'll process finalizers in a timely manner
3. Run finalizers when any capability is idle.
+ reduces pause to 0
- requires scheduler modifications
- if the runtime is busy, finalizers wait until the next GC
4. like (3), but also run finalizers incrementally between GCs.
- reduces the delay to run finalizers compared with (3)
For now we do (3). It would be easy to do (4) later by adding a
call to doIdleGCWork() in the scheduler loop, but I haven't found
that necessary so far.
-------------------------------------------------------------------------- */
// Run this many finalizers before returning from
// runSomeFinalizers(). This is so that we only tie up the capability
// for a short time, and respond quickly if new work becomes
// available.
static const int32_t finalizer_chunk = 100;
// non-zero if a thread is already in runSomeFinalizers(). This
// protects the globals finalizer_list and n_finalizers.
static volatile StgWord finalizer_lock = 0;
//
// Run some C finalizers. Returns true if there's more work to do.
//
bool runSomeFinalizers(bool all)
{
if (n_finalizers == 0)
return false;
if (cas(&finalizer_lock, 0, 1) != 0) {
// another capability is doing the work, it's safe to say
// there's nothing to do, because the thread already in
// runSomeFinalizers() will call in again.
return false;
}
debugTrace(DEBUG_sched, "running C finalizers, %d remaining", n_finalizers);
Task *task = myTask();
if (task != NULL) {
task->running_finalizers = true;
}
StgWeak *w = finalizer_list;
int32_t count = 0;
while (w != NULL) {
runCFinalizers((StgCFinalizerList *)w->cfinalizers);
w = w->link;
++count;
if (!all && count >= finalizer_chunk) break;
}
finalizer_list = w;
n_finalizers -= count;
if (task != NULL) {
task->running_finalizers = false;
}
debugTrace(DEBUG_sched, "ran %d C finalizers", count);
write_barrier();
finalizer_lock = 0;
return n_finalizers != 0;
}
......@@ -19,5 +19,6 @@ void runCFinalizers(StgCFinalizerList *list);
void runAllCFinalizers(StgWeak *w);
void scheduleFinalizers(Capability *cap, StgWeak *w);
void markWeakList(void);
bool runSomeFinalizers(bool all);
#include "EndPrivate.h"
......@@ -1875,3 +1875,28 @@ static void gcCAFs(void)
debugTrace(DEBUG_gccafs, "%d CAFs live", i);
}
#endif
/* -----------------------------------------------------------------------------
The GC can leave some work for the mutator to do before the next
GC, provided the work can be safely overlapped with mutation. This
can help reduce the GC pause time.
The mutator can call doIdleGCWork() any time it likes, but
preferably when it is idle. It's safe for multiple capabilities to
call doIdleGCWork().
When 'all' is
* false: doIdleGCWork() should only take a short, bounded, amount
of time.
* true: doIdleGCWork() will complete all the outstanding GC work.
The return value is
* true if there's more to do (only if 'all' is false).
* false otherwise.
-------------------------------------------------------------------------- */
bool doIdleGCWork(Capability *cap STG_UNUSED, bool all)
{
return runSomeFinalizers(all);
}
......@@ -26,6 +26,8 @@ typedef void (*evac_fn)(void *user, StgClosure **root);
StgClosure * isAlive ( StgClosure *p );
void markCAFs ( evac_fn evac, void *user );
bool doIdleGCWork(Capability *cap, bool all);
extern uint32_t N;
extern bool major_gc;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment