Commit 6a405b1e authored by Simon Marlow

Keep the remembered sets local to each thread during parallel GC

This turns out to be quite vital for parallel programs:

  - The way we discover which threads to traverse is by finding
    dirty threads via the remembered sets (aka mutable lists).

  - A dirty thread will be on the remembered set of the capability
    that was running it, and we really want to traverse that thread's
    stack using the GC thread for the capability, because it is in
    that CPU's cache.  If we get this wrong, we get penalised badly by
    the memory system.

Previously we had per-capability mutable lists but they were
aggregated before GC and traversed by just one of the GC threads.
This resulted in very poor performance, particularly for parallel
programs with deep stacks.

Now we keep per-capability remembered sets throughout GC, which also
removes a lock (recordMutableGen_sync).
parent 192c7d55
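
To make the shape of the change easier to see outside the RTS, here is a
minimal, self-contained C sketch of a block-chained remembered set that is
owned by a single thread, so appending to it needs no lock. The names
(Block, RememberedSet, rs_push, rs_count) are invented stand-ins, not the
real structures in the diff below; the real counterparts are bdescr, the
cap->mut_lists[] / gct->mut_lists[] arrays, and recordMutableGen_GC().

#include <stdio.h>
#include <stdlib.h>

#define BLOCK_WORDS 4   /* tiny blocks so the overflow path is exercised */

typedef struct Block_ {
    void          *slots[BLOCK_WORDS];
    int            free;           /* index of the next free slot         */
    struct Block_ *link;           /* chain of older, already-full blocks */
} Block;

/* One remembered set per (thread, generation): just a chain of blocks. */
typedef struct { Block *head; } RememberedSet;

static Block *alloc_block(void)
{
    Block *b = calloc(1, sizeof(Block));
    if (b == NULL) abort();
    return b;
}

/* Analogue of recordMutableGen_GC() after this commit: the caller owns
 * 'rs' exclusively (it is per-capability / per-GC-thread), so the append
 * needs no lock and no atomic operation. */
static void rs_push(RememberedSet *rs, void *closure)
{
    Block *b = rs->head;
    if (b == NULL || b->free == BLOCK_WORDS) { /* block full: chain a new one */
        Block *nb = alloc_block();
        nb->link = b;
        rs->head = b = nb;
    }
    b->slots[b->free++] = closure;
}

/* Stand-in for "scavenge this thread's own remembered set": here we only
 * count the recorded entries.  (A real GC would also free the chain
 * afterwards, as freeChain_sync() does in the diff.) */
static int rs_count(const RememberedSet *rs)
{
    int n = 0;
    for (Block *b = rs->head; b != NULL; b = b->link)
        n += b->free;
    return n;
}

int main(void)
{
    enum { N_THREADS = 4, PER_THREAD = 10 };
    RememberedSet rs[N_THREADS] = {{0}};
    static int dummy[N_THREADS * PER_THREAD];  /* pretend mutated closures */

    /* Each "thread" records mutated objects into its own set only. */
    for (int t = 0; t < N_THREADS; t++)
        for (int i = 0; i < PER_THREAD; i++)
            rs_push(&rs[t], &dummy[t * PER_THREAD + i]);

    for (int t = 0; t < N_THREADS; t++)
        printf("thread %d remembered %d objects\n", t, rs_count(&rs[t]));
    return 0;
}

The design point is that the spinlock (recordMutableGen_sync) disappears
because ownership is exclusive: a mutation is always recorded on the
capability that performed it, and during GC each thread scavenges only the
remembered sets of its own capability, which also keeps the traversed
objects in that CPU's cache.
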
......@@ -245,7 +245,6 @@ extern void GarbageCollect(rtsBool force_major_gc, nat gc_type, Capability *cap)
#if defined(THREADED_RTS)
extern Mutex sm_mutex;
extern Mutex atomic_modify_mutvar_mutex;
extern SpinLock recordMutableGen_sync;
#endif
#if defined(THREADED_RTS)
......@@ -258,63 +257,40 @@ extern SpinLock recordMutableGen_sync;
#define ASSERT_SM_LOCK()
#endif
#if !IN_STG_CODE
INLINE_HEADER void
recordMutableGen(StgClosure *p, generation *gen)
recordMutableGen(StgClosure *p, nat gen_no)
{
bdescr *bd;
bd = gen->mut_list;
bd = generations[gen_no].mut_list;
if (bd->free >= bd->start + BLOCK_SIZE_W) {
bdescr *new_bd;
new_bd = allocBlock();
new_bd->link = bd;
bd = new_bd;
gen->mut_list = bd;
generations[gen_no].mut_list = bd;
}
*bd->free++ = (StgWord)p;
}
INLINE_HEADER void
recordMutableGenLock(StgClosure *p, generation *gen)
recordMutableGenLock(StgClosure *p, nat gen_no)
{
ACQUIRE_SM_LOCK;
recordMutableGen(p,gen);
recordMutableGen(p,gen_no);
RELEASE_SM_LOCK;
}
extern bdescr *allocBlock_sync(void);
// Version of recordMutableGen() for use in parallel GC. The same as
// recordMutableGen(), except that we surround it with a spinlock and
// call the spinlock version of allocBlock().
INLINE_HEADER void
recordMutableGen_GC(StgClosure *p, generation *gen)
{
bdescr *bd;
ACQUIRE_SPIN_LOCK(&recordMutableGen_sync);
bd = gen->mut_list;
if (bd->free >= bd->start + BLOCK_SIZE_W) {
bdescr *new_bd;
new_bd = allocBlock_sync();
new_bd->link = bd;
bd = new_bd;
gen->mut_list = bd;
}
*bd->free++ = (StgWord)p;
RELEASE_SPIN_LOCK(&recordMutableGen_sync);
}
INLINE_HEADER void
recordMutable(StgClosure *p)
{
bdescr *bd;
ASSERT(closure_MUTABLE(p));
bd = Bdescr((P_)p);
if (bd->gen_no > 0) recordMutableGen(p, &RTS_DEREF(generations)[bd->gen_no]);
if (bd->gen_no > 0) recordMutableGen(p, bd->gen_no);
}
INLINE_HEADER void
......@@ -325,6 +301,8 @@ recordMutableLock(StgClosure *p)
RELEASE_SM_LOCK;
}
#endif // !IN_STG_CODE
/* -----------------------------------------------------------------------------
The CAF table - used to let us revert CAFs in GHCi
-------------------------------------------------------------------------- */
......
......@@ -217,6 +217,9 @@ initCapability( Capability *cap, nat i )
cap->mut_lists = stgMallocBytes(sizeof(bdescr *) *
RtsFlags.GcFlags.generations,
"initCapability");
cap->saved_mut_lists = stgMallocBytes(sizeof(bdescr *) *
RtsFlags.GcFlags.generations,
"initCapability");
for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
cap->mut_lists[g] = NULL;
......
......@@ -65,10 +65,12 @@ struct Capability_ {
Task *suspended_ccalling_tasks;
// One mutable list per generation, so we don't need to take any
// locks when updating an old-generation thunk. These
// mini-mut-lists are moved onto the respective gen->mut_list at
// each GC.
// locks when updating an old-generation thunk. This also lets us
// keep track of which closures this CPU has been mutating, so we
// can traverse them using the right thread during GC and avoid
// unnecessarily moving the data from one cache to another.
bdescr **mut_lists;
bdescr **saved_mut_lists; // tmp use during GC
// Context switch flag. We used to have one global flag, now one
// per capability. Locks required : none (conflicts are harmless)
......
......@@ -712,7 +712,6 @@ stat_exit(int alloc)
{
nat g, s;
statsPrintf("recordMutableGen_sync: %"FMT_Word64"\n", recordMutableGen_sync.spin);
statsPrintf("gc_alloc_block_sync: %"FMT_Word64"\n", gc_alloc_block_sync.spin);
statsPrintf("whitehole_spin: %"FMT_Word64"\n", whitehole_spin);
for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
......
......@@ -268,7 +268,7 @@ no_slop:
write_barrier(); \
bd = Bdescr((P_)p1); \
if (bd->gen_no != 0) { \
recordMutableGenLock(p1, &generations[bd->gen_no]); \
recordMutableGenLock(p1, bd->gen_no); \
SET_INFO(p1, &stg_IND_OLDGEN_info); \
TICK_UPD_OLD_IND(); \
and_then; \
......
......@@ -976,11 +976,20 @@ compact(StgClosure *static_objects)
for (g = 1; g < RtsFlags.GcFlags.generations; g++) {
bdescr *bd;
StgPtr p;
nat n;
for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) {
for (p = bd->start; p < bd->free; p++) {
thread((StgClosure **)p);
}
}
for (n = 0; n < n_capabilities; n++) {
for (bd = capabilities[n].mut_lists[g];
bd != NULL; bd = bd->link) {
for (p = bd->start; p < bd->free; p++) {
thread((StgClosure **)p);
}
}
}
}
// the global thread list
......
......@@ -125,10 +125,6 @@ nat n_gc_threads;
// For stats:
long copied; // *words* copied & scavenged during this GC
#ifdef THREADED_RTS
SpinLock recordMutableGen_sync;
#endif
DECLARE_GCT
/* -----------------------------------------------------------------------------
......@@ -314,17 +310,7 @@ GarbageCollect (rtsBool force_major_gc,
/* -----------------------------------------------------------------------
* follow all the roots that we know about:
* - mutable lists from each generation > N
* we want to *scavenge* these roots, not evacuate them: they're not
* going to move in this GC.
* Also do them in reverse generation order, for the usual reason:
* namely to reduce the likelihood of spurious old->new pointers.
*/
for (g = RtsFlags.GcFlags.generations-1; g > N; g--) {
generations[g].saved_mut_list = generations[g].mut_list;
generations[g].mut_list = allocBlock();
// mut_list always has at least one block.
}
// the main thread is running: this prevents any other threads from
// exiting prematurely, so we can start them now.
......@@ -333,8 +319,29 @@ GarbageCollect (rtsBool force_major_gc,
inc_running();
wakeup_gc_threads(n_gc_threads, gct->thread_index);
// Mutable lists from each generation > N
// we want to *scavenge* these roots, not evacuate them: they're not
// going to move in this GC.
// Also do them in reverse generation order, for the usual reason:
// namely to reduce the likelihood of spurious old->new pointers.
//
for (g = RtsFlags.GcFlags.generations-1; g > N; g--) {
scavenge_mutable_list(&generations[g]);
scavenge_mutable_list(generations[g].saved_mut_list, &generations[g]);
freeChain_sync(generations[g].saved_mut_list);
generations[g].saved_mut_list = NULL;
}
// scavenge the capability-private mutable lists. This isn't part
// of markSomeCapabilities() because markSomeCapabilities() can only
// call back into the GC via mark_root() (due to the gct register
// variable).
if (n_gc_threads == 1) {
for (n = 0; n < n_capabilities; n++) {
scavenge_capability_mut_lists(&capabilities[n]);
}
} else {
scavenge_capability_mut_lists(&capabilities[gct->thread_index]);
}
// follow roots from the CAF list (used by GHCi)
......@@ -545,6 +552,12 @@ GarbageCollect (rtsBool force_major_gc,
for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) {
mut_list_size += bd->free - bd->start;
}
for (n = 0; n < n_capabilities; n++) {
for (bd = capabilities[n].mut_lists[g];
bd != NULL; bd = bd->link) {
mut_list_size += bd->free - bd->start;
}
}
copied += mut_list_size;
debugTrace(DEBUG_gc,
......@@ -1037,6 +1050,7 @@ gcWorkerThread (Capability *cap)
gct->evac_step = 0;
markSomeCapabilities(mark_root, gct, gct->thread_index, n_gc_threads,
rtsTrue/*prune sparks*/);
scavenge_capability_mut_lists(&capabilities[gct->thread_index]);
scavenge_until_all_done();
......@@ -1287,11 +1301,21 @@ init_collected_gen (nat g, nat n_threads)
static void
init_uncollected_gen (nat g, nat threads)
{
nat s, t, i;
nat s, t, n;
step_workspace *ws;
step *stp;
bdescr *bd;
// save the current mutable lists for this generation, and
// allocate a fresh block for each one. We'll traverse these
// mutable lists as roots early on in the GC.
generations[g].saved_mut_list = generations[g].mut_list;
generations[g].mut_list = allocBlock();
for (n = 0; n < n_capabilities; n++) {
capabilities[n].saved_mut_lists[g] = capabilities[n].mut_lists[g];
capabilities[n].mut_lists[g] = allocBlock();
}
for (s = 0; s < generations[g].n_steps; s++) {
stp = &generations[g].steps[s];
stp->scavenged_large_objects = NULL;
......@@ -1352,19 +1376,6 @@ init_uncollected_gen (nat g, nat threads)
if (t == n_gc_threads) t = 0;
}
}
// Move the private mutable lists from each capability onto the
// main mutable list for the generation.
for (i = 0; i < n_capabilities; i++) {
for (bd = capabilities[i].mut_lists[g];
bd->link != NULL; bd = bd->link) {
/* nothing */
}
bd->link = generations[g].mut_list;
generations[g].mut_list = capabilities[i].mut_lists[g];
capabilities[i].mut_lists[g] = allocBlock();
}
}
/* -----------------------------------------------------------------------------
......@@ -1377,6 +1388,7 @@ init_gc_thread (gc_thread *t)
t->static_objects = END_OF_STATIC_LIST;
t->scavenged_static_objects = END_OF_STATIC_LIST;
t->scan_bd = NULL;
t->mut_lists = capabilities[t->thread_index].mut_lists;
t->evac_step = 0;
t->failed_to_evac = rtsFalse;
t->eager_promotion = rtsTrue;
......
......@@ -131,6 +131,14 @@ typedef struct gc_thread_ {
// block that is currently being scanned
bdescr * scan_bd;
// Remembered sets on this CPU. Each GC thread has its own
// private per-generation remembered sets, so it can add an item
// to the remembered set without taking a lock. The mut_lists
// array on a gc_thread is the same as the one on the
// corresponding Capability; we stash it here too for easy access
// during GC; see recordMutableGen_GC().
bdescr ** mut_lists;
// --------------------
// evacuate flags
......
......@@ -34,3 +34,22 @@ isPartiallyFull(bdescr *bd)
#if DEBUG
void printMutableList (generation *gen);
#endif
// Version of recordMutableGen for use during GC. This uses the
// mutable lists attached to the current gc_thread structure, which
// are the same as the mutable lists on the Capability.
INLINE_HEADER void
recordMutableGen_GC (StgClosure *p, nat gen_no)
{
bdescr *bd;
bd = gct->mut_lists[gen_no];
if (bd->free >= bd->start + BLOCK_SIZE_W) {
bdescr *new_bd;
new_bd = allocBlock_sync();
new_bd->link = bd;
bd = new_bd;
gct->mut_lists[gen_no] = bd;
}
*bd->free++ = (StgWord)p;
}
......@@ -25,6 +25,7 @@
#include "Trace.h"
#include "LdvProfile.h"
#include "Sanity.h"
#include "Capability.h"
static void scavenge_stack (StgPtr p, StgPtr stack_end);
......@@ -36,7 +37,8 @@ static void scavenge_large_bitmap (StgPtr p,
# define evacuate(a) evacuate1(a)
# define recordMutableGen_GC(a,b) recordMutableGen(a,b)
# define scavenge_loop(a) scavenge_loop1(a)
# define scavenge_mutable_list(g) scavenge_mutable_list1(g)
# define scavenge_mutable_list(bd,g) scavenge_mutable_list1(bd,g)
# define scavenge_capability_mut_lists(cap) scavenge_capability_mut_Lists1(cap)
#endif
/* -----------------------------------------------------------------------------
......@@ -67,6 +69,8 @@ scavengeTSO (StgTSO *tso)
return;
}
debugTrace(DEBUG_gc,"scavenging thread %d",tso->id);
saved_eager = gct->eager_promotion;
gct->eager_promotion = rtsFalse;
......@@ -692,7 +696,7 @@ scavenge_block (bdescr *bd)
if (gct->failed_to_evac) {
gct->failed_to_evac = rtsFalse;
if (bd->gen_no > 0) {
recordMutableGen_GC((StgClosure *)q, &generations[bd->gen_no]);
recordMutableGen_GC((StgClosure *)q, bd->gen_no);
}
}
}
......@@ -1047,7 +1051,7 @@ linear_scan:
if (gct->failed_to_evac) {
gct->failed_to_evac = rtsFalse;
if (gct->evac_step) {
recordMutableGen_GC((StgClosure *)q, gct->evac_step->gen);
recordMutableGen_GC((StgClosure *)q, gct->evac_step->gen_no);
}
}
......@@ -1419,13 +1423,10 @@ scavenge_one(StgPtr p)
-------------------------------------------------------------------------- */
void
scavenge_mutable_list(generation *gen)
scavenge_mutable_list(bdescr *bd, generation *gen)
{
bdescr *bd;
StgPtr p, q;
bd = gen->saved_mut_list;
gct->evac_step = &gen->steps[0];
for (; bd != NULL; bd = bd->link) {
for (q = bd->start; q < bd->free; q++) {
......@@ -1456,12 +1457,12 @@ scavenge_mutable_list(generation *gen)
// definitely doesn't point into a young generation.
// Clean objects don't need to be scavenged. Some clean
// objects (MUT_VAR_CLEAN) are not kept on the mutable
// list at all; others, such as MUT_ARR_PTRS_CLEAN and
// TSO, are always on the mutable list.
// list at all; others, such as MUT_ARR_PTRS_CLEAN
// are always on the mutable list.
//
switch (get_itbl((StgClosure *)p)->type) {
case MUT_ARR_PTRS_CLEAN:
recordMutableGen_GC((StgClosure *)p,gen);
recordMutableGen_GC((StgClosure *)p,gen->no);
continue;
case TSO: {
StgTSO *tso = (StgTSO *)p;
......@@ -1472,7 +1473,7 @@ scavenge_mutable_list(generation *gen)
scavenge_TSO_link(tso);
if (gct->failed_to_evac) {
recordMutableGen_GC((StgClosure *)p,gen);
recordMutableGen_GC((StgClosure *)p,gen->no);
gct->failed_to_evac = rtsFalse;
} else {
tso->flags &= ~TSO_LINK_DIRTY;
......@@ -1487,14 +1488,28 @@ scavenge_mutable_list(generation *gen)
if (scavenge_one(p)) {
// didn't manage to promote everything, so put the
// object back on the list.
recordMutableGen_GC((StgClosure *)p,gen);
recordMutableGen_GC((StgClosure *)p,gen->no);
}
}
}
}
// free the old mut_list
freeChain_sync(gen->saved_mut_list);
gen->saved_mut_list = NULL;
void
scavenge_capability_mut_lists (Capability *cap)
{
nat g;
/* Mutable lists from each generation > N
* we want to *scavenge* these roots, not evacuate them: they're not
* going to move in this GC.
* Also do them in reverse generation order, for the usual reason:
* namely to reduce the likelihood of spurious old->new pointers.
*/
for (g = RtsFlags.GcFlags.generations-1; g > N; g--) {
scavenge_mutable_list(cap->saved_mut_lists[g], &generations[g]);
freeChain_sync(cap->saved_mut_lists[g]);
cap->saved_mut_lists[g] = NULL;
}
}
/* -----------------------------------------------------------------------------
......@@ -1560,7 +1575,7 @@ scavenge_static(void)
*/
if (gct->failed_to_evac) {
gct->failed_to_evac = rtsFalse;
recordMutableGen_GC((StgClosure *)p,oldest_gen);
recordMutableGen_GC((StgClosure *)p,oldest_gen->no);
}
break;
}
......@@ -1834,7 +1849,7 @@ scavenge_large (step_workspace *ws)
p = bd->start;
if (scavenge_one(p)) {
if (ws->step->gen_no > 0) {
recordMutableGen_GC((StgClosure *)p, ws->step->gen);
recordMutableGen_GC((StgClosure *)p, ws->step->gen_no);
}
}
......
......@@ -12,9 +12,11 @@
* ---------------------------------------------------------------------------*/
void scavenge_loop (void);
void scavenge_mutable_list (generation *g);
void scavenge_mutable_list (bdescr *bd, generation *gen);
void scavenge_capability_mut_lists (Capability *cap);
#ifdef THREADED_RTS
void scavenge_loop1 (void);
void scavenge_mutable_list1 (generation *g);
void scavenge_mutable_list1 (bdescr *bd, generation *gen);
void scavenge_capability_mut_Lists1 (Capability *cap);
#endif
......@@ -272,7 +272,6 @@ initStorage( void )
#ifdef THREADED_RTS
initSpinLock(&gc_alloc_block_sync);
initSpinLock(&recordMutableGen_sync);
whitehole_spin = 0;
#endif
......@@ -376,7 +375,7 @@ newCAF(StgClosure* caf)
* any more and can use it as a STATIC_LINK.
*/
((StgIndStatic *)caf)->saved_info = NULL;
recordMutableGen(caf, oldest_gen);
recordMutableGen(caf, oldest_gen->no);
}
RELEASE_SM_LOCK;
......