diff --git a/rts/sm/GC.c b/rts/sm/GC.c index 4a940d2a6f611cc22b32d1b2dac64739cd8efdfe..64f3d98d45b3e1e83ea9c76c93c44b1b0b6eaaa6 100644 --- a/rts/sm/GC.c +++ b/rts/sm/GC.c @@ -130,11 +130,21 @@ MutListScavStats mutlist_scav_stats; */ gc_thread **gc_threads = NULL; -#if !defined(THREADED_RTS) + +#if defined(THREADED_RTS) +static Mutex gc_running_mutex; +static Condition gc_running_cv; +#if defined(PROF_SPIN) +// spin and yield counts for the quasi-SpinLock in waitForGcThreads +volatile StgWord64 waitForGcThreads_spin = 0; +volatile StgWord64 waitForGcThreads_yield = 0; +volatile StgWord64 whitehole_gc_spin = 0; +#endif // PROF_SPIN +#else // THREADED_RTS // Must be aligned to 64-bytes to meet stated 64-byte alignment of gen_workspace StgWord8 the_gc_thread[sizeof(gc_thread) + 64 * sizeof(gen_workspace)] ATTRIBUTE_ALIGNED(64); -#endif +#endif // THREADED_RTS // Number of threads running in *this* GC. Affects how many // step->todos[] lists we have to look in to find work. @@ -143,12 +153,6 @@ uint32_t n_gc_threads; // For stats: static long copied; // *words* copied & scavenged during this GC -#if defined(PROF_SPIN) && defined(THREADED_RTS) -// spin and yield counts for the quasi-SpinLock in waitForGcThreads -volatile StgWord64 waitForGcThreads_spin = 0; -volatile StgWord64 waitForGcThreads_yield = 0; -volatile StgWord64 whitehole_gc_spin = 0; -#endif bool work_stealing; @@ -473,29 +477,20 @@ GarbageCollect (uint32_t collect_gen, * Repeatedly scavenge all the areas we know about until there's no * more scavenging to be done. */ + scavenge_until_all_done(); + shutdown_gc_threads(gct->thread_index, idle_cap); + // The other threads are now stopped. + // must be last... invariant is that everything is fully + // scavenged at this point. StgWeak *dead_weak_ptr_list = NULL; StgTSO *resurrected_threads = END_TSO_QUEUE; - - for (;;) + while (traverseWeakPtrList(&dead_weak_ptr_list, &resurrected_threads)) { + inc_running(); scavenge_until_all_done(); - - // The other threads are now stopped. We might recurse back to - // here, but from now on this is the only thread. - - // must be last... invariant is that everything is fully - // scavenged at this point. - if (traverseWeakPtrList(&dead_weak_ptr_list, &resurrected_threads)) { // returns true if evaced something - inc_running(); - continue; - } - - // If we get to here, there's really nothing left to do. - break; } - shutdown_gc_threads(gct->thread_index, idle_cap); // Now see which stable names are still alive. 
gcStableNameTable(); @@ -1104,6 +1099,8 @@ initGcThreads (uint32_t from USED_IF_THREADS, uint32_t to USED_IF_THREADS) } else { gc_threads = stgMallocBytes (to * sizeof(gc_thread*), "initGcThreads"); + initMutex(&gc_running_mutex); + initCondition(&gc_running_cv); } for (i = from; i < to; i++) { @@ -1136,7 +1133,10 @@ freeGcThreads (void) } stgFree (gc_threads[i]); } + closeCondition(&gc_running_cv); + closeMutex(&gc_running_mutex); stgFree (gc_threads); + #else for (g = 0; g < RtsFlags.GcFlags.generations; g++) { @@ -1152,7 +1152,7 @@ Start GC threads ------------------------------------------------------------------------- */ -static volatile StgWord gc_running_threads; +static StgWord gc_running_threads; static StgWord inc_running (void) @@ -1167,100 +1167,95 @@ static StgWord dec_running (void) { ASSERT(RELAXED_LOAD(&gc_running_threads) != 0); - return atomic_dec(&gc_running_threads); -} - -static bool -any_work (void) -{ - int g; - gen_workspace *ws; - - NONATOMIC_ADD(&gct->any_work, 1); - - write_barrier(); - - // scavenge objects in compacted generation - if (mark_stack_bd != NULL && !mark_stack_empty()) { - return true; - } +#if defined(THREADED_RTS) + ACQUIRE_LOCK(&gc_running_mutex); +#endif - // Check for global work in any gen. We don't need to check for - // local work, because we have already exited scavenge_loop(), - // which means there is no local work for this thread. - for (g = 0; g < (int)RtsFlags.GcFlags.generations; g++) { - ws = &gct->gens[g]; - if (ws->todo_large_objects) return true; - if (!looksEmptyWSDeque(ws->todo_q)) return true; - if (ws->todo_overflow) return true; - } + StgWord r = atomic_dec(&gc_running_threads); #if defined(THREADED_RTS) - if (work_stealing) { - uint32_t n; - // look for work to steal - for (n = 0; n < n_gc_threads; n++) { - if (n == gct->thread_index) continue; - for (g = RtsFlags.GcFlags.generations-1; g >= 0; g--) { - ws = &gc_threads[n]->gens[g]; - if (!looksEmptyWSDeque(ws->todo_q)) return true; - } - } + if (r == 0) { + broadcastCondition(&gc_running_cv); } + RELEASE_LOCK(&gc_running_mutex); #endif - gct->no_work++; -#if defined(THREADED_RTS) - yieldThread(); -#endif + return r; +} - return false; +#if defined(THREADED_RTS) +void notify_todo_block(void) { + if (work_stealing && n_gc_threads != SEQ_CST_LOAD(&gc_running_threads)) { + signalCondition(&gc_running_cv); + } } +#endif + static void scavenge_until_all_done (void) { - DEBUG_ONLY( uint32_t r ); - +#if defined(THREADED_RTS) || defined(DEBUG) + uint32_t r; +#endif -loop: + for (;;) { #if defined(THREADED_RTS) - if (n_gc_threads > 1) { - scavenge_loop(); - } else { - scavenge_loop1(); - } + if (n_gc_threads > 1) { + scavenge_loop(); + } else { + scavenge_loop1(); + } #else - scavenge_loop(); + scavenge_loop(); #endif - collect_gct_blocks(); + collect_gct_blocks(); - // scavenge_loop() only exits when there's no work to do + // scavenge_loop() only exits when there's no work to do - // This atomic decrement also serves as a full barrier to ensure that any - // writes we made during scavenging are visible to other threads.
+#if defined(THREADED_RTS) || defined(DEBUG) + r = dec_running(); #else - dec_running(); #endif - traceEventGcIdle(gct->cap); + traceEventGcIdle(gct->cap); - debugTrace(DEBUG_gc, "%d GC threads still running", r); + debugTrace(DEBUG_gc, "%d GC threads still running", r); - while (SEQ_CST_LOAD(&gc_running_threads) != 0) { - // usleep(1); - if (any_work()) { - inc_running(); - traceEventGcWork(gct->cap); - goto loop; + // If there's no hope of stealing more work, then there's nowhere else + // work can come from and we are finished. +#if defined(THREADED_RTS) + if (n_gc_threads > 1 && work_stealing && r != 0) { + NONATOMIC_ADD(&gct->any_work, 1); + ACQUIRE_LOCK(&gc_running_mutex); + // SEQ_CST is conservative here; a weaker ordering might suffice, + // but that has not been verified + r = SEQ_CST_LOAD(&gc_running_threads); + if (r != 0) { + waitCondition(&gc_running_cv, &gc_running_mutex); + // SEQ_CST is conservative here; a weaker ordering might suffice, + // but that has not been verified + r = SEQ_CST_LOAD(&gc_running_threads); + } + // Here, if r is 0, then all threads are finished. + // If r > 0, then either: + // - waitCondition was subject to a spurious wakeup + // - a worker thread just pushed a block to its todo_q + // so we loop back, looking for more work. + RELEASE_LOCK(&gc_running_mutex); + if (r != 0) { + inc_running(); + traceEventGcWork(gct->cap); + continue; // go around the loop again + } + NONATOMIC_ADD(&gct->no_work, 1); } - // any_work() does not remove the work from the queue, it - // just checks for the presence of work. If we find any, - // then we increment gc_running_threads and go back to - // scavenge_loop() to perform any pending work. +#endif + break; } traceEventGcDone(gct->cap); @@ -1394,9 +1389,7 @@ waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[]) static void start_gc_threads (void) { -#if defined(THREADED_RTS) - gc_running_threads = 0; -#endif + SEQ_CST_STORE(&gc_running_threads, 0); } static void @@ -1424,7 +1417,7 @@ wakeup_gc_threads (uint32_t me USED_IF_THREADS, // After GC is complete, we must wait for all GC threads to enter the // standby state, otherwise they may still be executing inside -// any_work(), and may even remain awake until the next GC starts. +// scavenge_until_all_done(), and may even remain awake until the next GC starts. static void shutdown_gc_threads (uint32_t me USED_IF_THREADS, bool idle_cap[] USED_IF_THREADS) @@ -2076,3 +2069,22 @@ bool doIdleGCWork(Capability *cap STG_UNUSED, bool all) { return runSomeFinalizers(all); } + + +/* Note [gc_running_mutex] + * + * TODO finish and polish + * + * - gc_running_mutex guards gc_running_threads + * - gc_running_cv is the associated condition variable + * - scavenge_until_all_done works until there's no more work, then acquires + * gc_running_mutex and waits on gc_running_cv + * - on wakeup, if gc_running_threads > 0, it resumes looking for work + * - gc_running_cv is signalled when a worker thread pushes a block to its todo_q while (gc_running_threads < n_gc_threads) + * - this signal is fired while gc_running_mutex is NOT held, so it + * is a bit racy; that's OK, because the next block pushed will signal again + * - gc_running_cv is broadcast when dec_running drops gc_running_threads to 0 + * - the gc leader calls shutdown_gc_threads before it starts its serial + * collections, so no gc thread can observe gc_running_threads > 0 once + * dec_running broadcasts.
+ */ diff --git a/rts/sm/GC.h b/rts/sm/GC.h index 2c2d14a7d29798184e5401e1703594ff69af52ec..d3aab1d219e96bac70b5291fd4980a878d76e84d 100644 --- a/rts/sm/GC.h +++ b/rts/sm/GC.h @@ -76,6 +76,7 @@ void freeGcThreads (void); void resizeGenerations (void); #if defined(THREADED_RTS) +void notify_todo_block(void); void waitForGcThreads (Capability *cap, bool idle_cap[]); void releaseGCThreads (Capability *cap, bool idle_cap[]); #endif diff --git a/rts/sm/GCUtils.c b/rts/sm/GCUtils.c index d58fdc48ae04755a75f23722bb45347ac4828efe..fe278c9dfcfbe63abad753eaa789212b3c421d09 100644 --- a/rts/sm/GCUtils.c +++ b/rts/sm/GCUtils.c @@ -123,12 +123,22 @@ steal_todo_block (uint32_t g) { uint32_t n; bdescr *bd; + WSDeque* q; // look for work to steal for (n = 0; n < n_gc_threads; n++) { if (n == gct->thread_index) continue; - bd = stealWSDeque(gc_threads[n]->gens[g].todo_q); + q = gc_threads[n]->gens[g].todo_q; + bd = stealWSDeque(q); if (bd) { + // TODO: the commented-out block below may be a good idea. gc_running_cv + // is signalled when a block is pushed, so we could signal here as well, + // since we've stolen a block and there may be more left to steal. But + // given the way we use (gc_running_threads < n_gc_threads) to decide + // whether to signal, I believe the code below is unnecessary. + // if(!looksEmptyWSDeque(q)) { + // notify_todo_block(); + // } return bd; } } @@ -168,6 +178,19 @@ push_scanned_block (bdescr *bd, gen_workspace *ws) } } +void +push_todo_block(bdescr *bd, gen_workspace *ws) { + ASSERT(bd->link == NULL); + if (!pushWSDeque(ws->todo_q, bd)) { + bd->link = ws->todo_overflow; + ws->todo_overflow = bd; + ws->n_todo_overflow++; + } +#if defined(THREADED_RTS) + notify_todo_block(); +#endif +} + /* Note [big objects] We can get an ordinary object (CONSTR, FUN, THUNK etc.) that is @@ -277,17 +300,15 @@ todo_block_full (uint32_t size, gen_workspace *ws) // Otherwise, push this block out to the global list. else { + // TODO: move this debugTrace into push_todo_block? We would have to do + // something about refill_wsdeque spamming it. DEBUG_ONLY( generation *gen ); DEBUG_ONLY( gen = ws->gen ); debugTrace(DEBUG_gc, "push todo block %p (%ld words), step %d, todo_q: %ld", bd->start, (unsigned long)(bd->free - bd->u.scan), gen->no, dequeElements(ws->todo_q)); - if (!pushWSDeque(ws->todo_q, bd)) { - bd->link = ws->todo_overflow; - ws->todo_overflow = bd; - ws->n_todo_overflow++; - } + push_todo_block(bd, ws); } } diff --git a/rts/sm/GCUtils.h b/rts/sm/GCUtils.h index 798a795deb5d349dbc682be4e51b3daee295d01a..13f3a87521af2ded3c030712e3bf069ee9a67b02 100644 --- a/rts/sm/GCUtils.h +++ b/rts/sm/GCUtils.h @@ -34,6 +34,7 @@ void freeChain_sync(bdescr *bd); void freeGroup_sync(bdescr *bd); void push_scanned_block (bdescr *bd, gen_workspace *ws); +void push_todo_block (bdescr *bd, gen_workspace *ws); StgPtr todo_block_full (uint32_t size, gen_workspace *ws); StgPtr alloc_todo_block (gen_workspace *ws, uint32_t size); diff --git a/rts/sm/Scav.c b/rts/sm/Scav.c index c48fc83248bf1674159d38f633161a71f0a59907..0a00423b96f52b6f53a6738d0da651d22fb9df4f 100644 --- a/rts/sm/Scav.c +++ b/rts/sm/Scav.c @@ -2084,18 +2084,7 @@ static StgWord refill_wsdeque(gen_workspace *ws) bdescr* temp2 = temp; temp = temp2->link; temp2->link = NULL; - if(!pushWSDeque(ws->todo_q, temp2)) { - temp2->link = temp; - temp = temp2; - break; - } - } - while(temp != NULL) { - bdescr* temp2 = temp; - temp = temp2->link; - temp2->link = ws->todo_overflow; - ws->todo_overflow = temp2; - ws->n_todo_overflow++; + push_todo_block(temp2, ws); } return n_todo_overflow; }
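
For review purposes, here is a minimal self-contained sketch of the termination protocol the patch implements and Note [gc_running_mutex] describes. It uses plain pthreads and C11 atomics in place of the RTS's Mutex/Condition wrappers and SEQ_CST_LOAD; dec_running and notify_todo_block mirror the patch, while wait_for_work_or_termination is a hypothetical name for the idle path of scavenge_until_all_done, with all work-queue plumbing (and the patch's work_stealing gate) omitted.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static pthread_mutex_t gc_running_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  gc_running_cv    = PTHREAD_COND_INITIALIZER;
static _Atomic unsigned gc_running_threads; /* updates guarded by gc_running_mutex */
static unsigned n_gc_threads;

/* A thread has run out of local work. If it was the last running thread,
 * wake every waiter so all threads can observe termination. */
static unsigned dec_running(void)
{
    pthread_mutex_lock(&gc_running_mutex);
    unsigned r = atomic_fetch_sub(&gc_running_threads, 1) - 1;
    if (r == 0) {
        pthread_cond_broadcast(&gc_running_cv);
    }
    pthread_mutex_unlock(&gc_running_mutex);
    return r;
}

/* Called after publishing a stealable block. As in the patch, this fires
 * without holding gc_running_mutex: a missed signal is tolerated because
 * the next push will signal again. (The patch also checks work_stealing.) */
static void notify_todo_block(void)
{
    if (atomic_load(&gc_running_threads) != n_gc_threads) {
        /* someone may be idle and waiting */
        pthread_cond_signal(&gc_running_cv);
    }
}

/* The idle path: block until either new work may exist (count still
 * nonzero) or the count has hit zero, meaning the whole GC is finished. */
static bool wait_for_work_or_termination(void)
{
    pthread_mutex_lock(&gc_running_mutex);
    unsigned r = atomic_load(&gc_running_threads);
    if (r != 0) {
        pthread_cond_wait(&gc_running_cv, &gc_running_mutex);
        /* recheck: the wakeup may be spurious or a work signal */
        r = atomic_load(&gc_running_threads);
    }
    pthread_mutex_unlock(&gc_running_mutex);
    return r != 0; /* true: go try to steal again; false: all done */
}

A caller would loop the way scavenge_until_all_done does: if wait_for_work_or_termination returns true, it re-increments the running count and goes back to scavenging and stealing; if it returns false, the collection is over. The lost-wakeup argument is the one from the Note: a thread only blocks after observing gc_running_threads != 0 under gc_running_mutex, and the decrement to zero broadcasts under that same mutex, so the final wakeup cannot be missed; only the unlocked work signal is racy, which is tolerable.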