Commit a02eb298 authored by Simon Marlow

New flag +RTS -qi<n>: avoid waking up idle Capabilities to do parallel GC

This is an experimental tweak to the parallel GC that avoids waking up
a Capability to do parallel GC if we know that the capability has been
idle for a (tunable) number of GC cycles.  The idea is that if you're
only using a few Capabilities, there's no point waking up the ones
that aren't busy.

e.g. +RTS -qi3

says "A Capability will participate in parallel GC if it was running
at all since the last 3 GC cycles."

Results are a bit hit-and-miss, and I don't completely understand why
yet.  Hence it is turned off by default for now, and documented only
in the +RTS -? output.
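
To make the tunable's behaviour concrete, here is a minimal standalone
sketch (illustrative only, not RTS code; the struct and names simply
mirror the fields this patch adds): every GC increments each
Capability's idle counter, running Haskell code resets it to zero, and
a Capability is woken for a parallel GC only while the counter is below
the -qi threshold.  A program built with -threaded and run as
./prog +RTS -N4 -qi3 would use a threshold of 3, as simulated here:

    /* sim-qi.c: simulate the -qi<n> participation rule (illustrative) */
    #include <stdio.h>

    #define N_CAPS 4

    typedef struct { unsigned int idle; } Cap;

    static unsigned int parGcNoSyncWithIdle = 3;   /* as set by +RTS -qi3 */

    /* A Capability joins the parallel GC if it initiated the GC or was
     * active within the last parGcNoSyncWithIdle GC cycles. */
    static int participates(Cap *caps, int i, int initiator)
    {
        return i == initiator || caps[i].idle < parGcNoSyncWithIdle;
    }

    int main(void)
    {
        Cap caps[N_CAPS] = {{0}, {0}, {0}, {0}};
        int gc, i;

        for (gc = 1; gc <= 5; gc++) {
            caps[0].idle = 0;                /* only cap 0 ran any code */
            printf("GC %d:", gc);
            for (i = 0; i < N_CAPS; i++)
                printf("  cap%d=%s", i,
                       participates(caps, i, 0) ? "gc" : "skip");
            printf("\n");
            for (i = 0; i < N_CAPS; i++)     /* once per GC, as in scheduleDoGC */
                caps[i].idle++;
        }
        return 0;
    }

With -qi3, caps 1-3 take part in the first three GCs after their last
activity and are left alone from the fourth consecutive idle GC onwards.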
Parent: 55d7931d

--- a/includes/rts/Flags.h
+++ b/includes/rts/Flags.h
@@ -170,6 +170,14 @@ struct PAR_FLAGS {
     unsigned int parGcLoadBalancingGen;
                                 /* do load-balancing in this
                                  * generation and higher only */
+
+    unsigned int parGcNoSyncWithIdle;
+                                /* if a Capability has been idle for
+                                 * this many GCs, do not try to wake
+                                 * it up when doing a
+                                 * non-load-balancing parallel GC.
+                                 * (zero disables) */
+
     rtsBool setAffinity;        /* force thread affinity with CPUs */
 };
 #endif /* THREADED_RTS */

--- a/rts/Capability.c
+++ b/rts/Capability.c
@@ -226,6 +226,7 @@ initCapability( Capability *cap, nat i )
     cap->no = i;
     cap->in_haskell = rtsFalse;
+    cap->idle = 0;
     cap->run_queue_hd = END_TSO_QUEUE;
     cap->run_queue_tl = END_TSO_QUEUE;

--- a/rts/Capability.h
+++ b/rts/Capability.h
@@ -46,6 +46,9 @@ struct Capability_ {
     // catching unsafe call-ins.
     rtsBool in_haskell;
 
+    // Has there been any activity on this Capability since the last GC?
+    nat idle;
+
     // The run queue.  The Task owning this Capability has exclusive
     // access to its run queue, so can wake up threads without
     // taking a lock, and the common path through the scheduler is

--- a/rts/RtsFlags.c
+++ b/rts/RtsFlags.c
@@ -195,6 +195,7 @@ void initRtsFlagsDefaults(void)
     RtsFlags.ParFlags.parGcGen = 0;
     RtsFlags.ParFlags.parGcLoadBalancingEnabled = rtsTrue;
     RtsFlags.ParFlags.parGcLoadBalancingGen = 1;
+    RtsFlags.ParFlags.parGcNoSyncWithIdle = 0;
     RtsFlags.ParFlags.setAffinity = 0;
 #endif
@@ -367,6 +368,9 @@ usage_text[] = {
 "            (default: 1, -qb alone turns off load-balancing)",
 "  -qa       Use the OS to set thread affinity (experimental)",
 "  -qm       Don't automatically migrate threads between CPUs",
+"  -qi<n>    If a processor has been idle for the last <n> GCs, do not",
+"            wake it up for a non-load-balancing parallel GC.",
+"            (0 disables, default: 0)",
 #endif
 "  --install-signal-handlers=<yes|no>",
 "            Install signal handlers (default: yes)",
@@ -1193,7 +1197,11 @@ error = rtsTrue;
                       = strtol(rts_argv[arg]+3, (char **) NULL, 10);
                 }
                 break;
-            case 'a':
+            case 'i':
+                RtsFlags.ParFlags.parGcNoSyncWithIdle
+                    = strtol(rts_argv[arg]+3, (char **) NULL, 10);
+                break;
+            case 'a':
                 RtsFlags.ParFlags.setAffinity = rtsTrue;
                 break;
             case 'm':
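
A note on the parsing above: entries in rts_argv keep their leading
dash, so rts_argv[arg]+3 points just past "-qi" and strtol picks up the
numeric suffix ("-qi" alone yields 0, which disables the feature, in
line with the usage text).  A tiny standalone illustration:

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        const char *arg = "-qi3";              /* one entry of rts_argv */
        long n = strtol(arg + 3, NULL, 10);    /* skip "-qi", read "3" */
        printf("parGcNoSyncWithIdle = %ld\n", n);   /* prints 3 */
        return 0;
    }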

--- a/rts/Schedule.c
+++ b/rts/Schedule.c
@@ -27,6 +27,7 @@
 #include "ProfHeap.h"
 #include "Weak.h"
 #include "sm/GC.h" // waitForGcThreads, releaseGCThreads, N
+#include "sm/GCThread.h"
 #include "Sparks.h"
 #include "Capability.h"
 #include "Task.h"
@@ -115,6 +116,11 @@ Mutex sched_mutex;
 #define FORKPROCESS_PRIMOP_SUPPORTED
 #endif
 
+// Local stats
+#ifdef THREADED_RTS
+static nat n_failed_trygrab_idles = 0, n_idle_caps = 0;
+#endif
+
 /* -----------------------------------------------------------------------------
  * static function prototypes
  * -------------------------------------------------------------------------- */
@@ -426,6 +432,7 @@ run_thread:
     cap->interrupt = 0;
     cap->in_haskell = rtsTrue;
+    cap->idle = 0;
 
     dirty_TSO(cap,t);
     dirty_STACK(cap,t->stackobj);
@@ -1413,6 +1420,7 @@ scheduleDoGC (Capability *cap, Task *task USED_IF_THREADS, rtsBool force_major)
 {
     rtsBool heap_census;
 #ifdef THREADED_RTS
+    rtsBool idle_cap[n_capabilities];
     rtsBool gc_type;
     nat i, sync;
 #endif
@@ -1482,8 +1490,51 @@ scheduleDoGC (Capability *cap, Task *task USED_IF_THREADS, rtsBool force_major)
     }
     else
     {
-        // multi-threaded GC: make sure all the Capabilities donate one
-        // GC thread each.
+        // If we are load-balancing collections in this
+        // generation, then we require all GC threads to participate
+        // in the collection.  Otherwise, we only require active
+        // threads to participate, and we set gc_threads[i]->idle for
+        // any idle capabilities.  The rationale here is that waking
+        // up an idle Capability takes much longer than just doing any
+        // GC work on its behalf.
+
+        if (RtsFlags.ParFlags.parGcNoSyncWithIdle == 0
+            || (RtsFlags.ParFlags.parGcLoadBalancingEnabled &&
+                N >= RtsFlags.ParFlags.parGcLoadBalancingGen)) {
+            for (i=0; i < n_capabilities; i++) {
+                idle_cap[i] = rtsFalse;
+            }
+        } else {
+            for (i=0; i < n_capabilities; i++) {
+                if (i == cap->no || capabilities[i].idle < RtsFlags.ParFlags.parGcNoSyncWithIdle) {
+                    idle_cap[i] = rtsFalse;
+                } else {
+                    idle_cap[i] = tryGrabCapability(&capabilities[i], task);
+                    if (!idle_cap[i]) {
+                        n_failed_trygrab_idles++;
+                    } else {
+                        n_idle_caps++;
+                    }
+                }
+            }
+        }
+
+        // We set the gc_thread[i]->idle flag if that
+        // capability/thread is not participating in this collection.
+        // We also keep a local record of which capabilities are idle
+        // in idle_cap[], because scheduleDoGC() is re-entrant:
+        // another thread might start a GC as soon as we've finished
+        // this one, and thus the gc_thread[]->idle flags are invalid
+        // as soon as we release any threads after GC.  Getting this
+        // wrong leads to a rare and hard to debug deadlock!
+
+        for (i=0; i < n_capabilities; i++) {
+            gc_threads[i]->idle = idle_cap[i];
+            capabilities[i].idle++;
+        }
+
+        // For all capabilities participating in this GC, wait until
+        // they have stopped mutating and are standing by for GC.
         waitForGcThreads(cap);
 
 #if defined(THREADED_RTS)
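
The re-entrancy warning in the comment above is the crux of this hunk:
the shared gc_threads[i]->idle flags only stay valid until the first
worker is released, so the cleanup that runs after the GC must consult
a per-invocation snapshot instead.  A minimal sketch of that pattern
(illustrative only, not RTS code):

    #define N_CAPS 4

    struct gc_thread_sim { int idle; };            /* shared, reused per GC */
    static struct gc_thread_sim gc_threads_sim[N_CAPS];

    static void gc_with_snapshot(const int decision[N_CAPS])
    {
        int idle_cap[N_CAPS];                      /* this call's private copy */
        int i;

        for (i = 0; i < N_CAPS; i++) {
            idle_cap[i] = decision[i];
            gc_threads_sim[i].idle = idle_cap[i];  /* publish for the GC */
        }

        /* ... GC runs; once threads are released, another invocation may
         * overwrite gc_threads_sim[i].idle at any moment ... */

        for (i = 0; i < N_CAPS; i++) {
            if (idle_cap[i]) {                     /* safe: local snapshot */
                /* hand back the Capability grabbed before the sync */
            }
        }
    }

    int main(void)
    {
        const int decision[N_CAPS] = {0, 1, 1, 0};
        gc_with_snapshot(decision);
        return 0;
    }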
@@ -1565,6 +1616,18 @@ delete_threads_and_gc:
     if (gc_type == SYNC_GC_PAR)
     {
         releaseGCThreads(cap);
+        for (i = 0; i < n_capabilities; i++) {
+            if (i != cap->no) {
+                if (idle_cap[i]) {
+                    ASSERT(capabilities[i].running_task == task);
+                    task->cap = &capabilities[i];
+                    releaseCapability(&capabilities[i]);
+                } else {
+                    ASSERT(capabilities[i].running_task != task);
+                }
+            }
+        }
+        task->cap = cap;
     }
 #endif
@@ -2278,6 +2341,9 @@ exitScheduler (rtsBool wait_foreign USED_IF_THREADS)
     shutdownCapabilities(task, wait_foreign);
+
+    // debugBelch("n_failed_trygrab_idles = %d, n_idle_caps = %d\n",
+    //            n_failed_trygrab_idles, n_idle_caps);
     boundTaskExiting(task);
 }

--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -338,6 +338,13 @@ GarbageCollect (rtsBool force_major_gc,
         }
     } else {
         scavenge_capability_mut_lists(gct->cap);
+        for (n = 0; n < n_capabilities; n++) {
+            if (gc_threads[n]->idle) {
+                markCapability(mark_root, gct, &capabilities[n],
+                               rtsTrue/*don't mark sparks*/);
+                scavenge_capability_mut_lists(&capabilities[n]);
+            }
+        }
     }
 
     // follow roots from the CAF list (used by GHCi)
@@ -401,7 +408,11 @@ GarbageCollect (rtsBool force_major_gc,
             pruneSparkQueue(&capabilities[n]);
         }
     } else {
-        pruneSparkQueue(gct->cap);
+        for (n = 0; n < n_capabilities; n++) {
+            if (n == cap->no || gc_threads[n]->idle) {
+                pruneSparkQueue(&capabilities[n]);
+            }
+        }
     }
 #endif
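
Both GC.c hunks above follow the same shape: whatever per-Capability
work a worker thread would normally contribute (marking roots,
scavenging mut lists, pruning sparks), a thread that is present now
performs for every Capability that is sitting this cycle out.
Schematically (illustrative names only, not the RTS API):

    #define N_CAPS 4

    static int idle[N_CAPS];            /* stands in for gc_threads[n]->idle */

    static void per_cap_work(int n)
    {
        /* mark roots / scavenge mut lists / prune sparks for cap n */
        (void)n;
    }

    static void cover_for_idle_caps(int me)
    {
        int n;
        per_cap_work(me);               /* my own Capability */
        for (n = 0; n < N_CAPS; n++)
            if (n != me && idle[n])     /* plus each idle one */
                per_cap_work(n);
    }

    int main(void)
    {
        idle[1] = idle[2] = 1;
        cover_for_idle_caps(0);
        return 0;
    }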
@@ -808,6 +819,7 @@ new_gc_thread (nat n, gc_thread *t)
 #endif
     t->thread_index = n;
+    t->idle = rtsFalse;
     t->free_blocks = NULL;
     t->gc_count = 0;
@@ -1114,7 +1126,7 @@ waitForGcThreads (Capability *cap USED_IF_THREADS)
     while(retry) {
         for (i=0; i < n_threads; i++) {
-            if (i == me) continue;
+            if (i == me || gc_threads[i]->idle) continue;
             if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) {
                 prodCapability(&capabilities[i], cap->running_task);
             }
@@ -1122,7 +1134,7 @@ waitForGcThreads (Capability *cap USED_IF_THREADS)
         for (j=0; j < 10; j++) {
             retry = rtsFalse;
             for (i=0; i < n_threads; i++) {
-                if (i == me) continue;
+                if (i == me || gc_threads[i]->idle) continue;
                 write_barrier();
                 interruptCapability(&capabilities[i]);
                 if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) {
@@ -1154,8 +1166,8 @@ wakeup_gc_threads (nat me USED_IF_THREADS)
     if (n_gc_threads == 1) return;
 
     for (i=0; i < n_gc_threads; i++) {
-        if (i == me) continue;
-        inc_running();
+        if (i == me || gc_threads[i]->idle) continue;
+        inc_running();
         debugTrace(DEBUG_gc, "waking up gc thread %d", i);
         if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) barf("wakeup_gc_threads");
@@ -1178,7 +1190,7 @@ shutdown_gc_threads (nat me USED_IF_THREADS)
     if (n_gc_threads == 1) return;
 
     for (i=0; i < n_gc_threads; i++) {
-        if (i == me) continue;
+        if (i == me || gc_threads[i]->idle) continue;
         while (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE) { write_barrier(); }
     }
 #endif
@@ -1192,8 +1204,8 @@ releaseGCThreads (Capability *cap USED_IF_THREADS)
     const nat me = cap->no;
     nat i;
     for (i=0; i < n_threads; i++) {
-        if (i == me) continue;
-        if (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE)
+        if (i == me || gc_threads[i]->idle) continue;
+        if (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE)
             barf("releaseGCThreads");
         gc_threads[i]->wakeup = GC_THREAD_INACTIVE;

--- a/rts/sm/GCThread.h
+++ b/rts/sm/GCThread.h
@@ -125,6 +125,7 @@ typedef struct gc_thread_ {
     volatile rtsBool wakeup;
 #endif
     nat thread_index;           // a zero based index identifying the thread
+    rtsBool idle;               // sitting out of this GC cycle
     bdescr * free_blocks;       // a buffer of free blocks for this thread
                                 // during GC without accessing the block