diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 4a940d2a6f611cc22b32d1b2dac64739cd8efdfe..64f3d98d45b3e1e83ea9c76c93c44b1b0b6eaaa6 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -130,11 +130,21 @@ MutListScavStats mutlist_scav_stats;
  */
 gc_thread **gc_threads = NULL;
 
-#if !defined(THREADED_RTS)
+
+#if defined(THREADED_RTS)
+static Mutex gc_running_mutex;
+static Condition gc_running_cv;
+#if defined(PROF_SPIN)
+// spin and yield counts for the quasi-SpinLock in waitForGcThreads
+volatile StgWord64 waitForGcThreads_spin = 0;
+volatile StgWord64 waitForGcThreads_yield = 0;
+volatile StgWord64 whitehole_gc_spin = 0;
+#endif // PROF_SPIN
+#else // THREADED_RTS
 // Must be aligned to 64-bytes to meet stated 64-byte alignment of gen_workspace
 StgWord8 the_gc_thread[sizeof(gc_thread) + 64 * sizeof(gen_workspace)]
     ATTRIBUTE_ALIGNED(64);
-#endif
+#endif // THREADED_RTS
 
 // Number of threads running in *this* GC.  Affects how many
 // step->todos[] lists we have to look in to find work.
@@ -143,12 +153,6 @@ uint32_t n_gc_threads;
 // For stats:
 static long copied;        // *words* copied & scavenged during this GC
 
-#if defined(PROF_SPIN) && defined(THREADED_RTS)
-// spin and yield counts for the quasi-SpinLock in waitForGcThreads
-volatile StgWord64 waitForGcThreads_spin = 0;
-volatile StgWord64 waitForGcThreads_yield = 0;
-volatile StgWord64 whitehole_gc_spin = 0;
-#endif
 
 bool work_stealing;
 
@@ -473,29 +477,20 @@ GarbageCollect (uint32_t collect_gen,
    * Repeatedly scavenge all the areas we know about until there's no
    * more scavenging to be done.
    */
+  scavenge_until_all_done();
+  shutdown_gc_threads(gct->thread_index, idle_cap);
+  // The other threads are now stopped.
 
+  // must be last...  invariant is that everything is fully
+  // scavenged at this point.
   StgWeak *dead_weak_ptr_list = NULL;
   StgTSO *resurrected_threads = END_TSO_QUEUE;
-
-  for (;;)
+  while (traverseWeakPtrList(&dead_weak_ptr_list, &resurrected_threads))
   {
+      inc_running();
       scavenge_until_all_done();
-
-      // The other threads are now stopped.  We might recurse back to
-      // here, but from now on this is the only thread.
-
-      // must be last...  invariant is that everything is fully
-      // scavenged at this point.
-      if (traverseWeakPtrList(&dead_weak_ptr_list, &resurrected_threads)) { // returns true if evaced something
-          inc_running();
-          continue;
-      }
-
-      // If we get to here, there's really nothing left to do.
-      break;
   }
 
-  shutdown_gc_threads(gct->thread_index, idle_cap);
 
   // Now see which stable names are still alive.
   gcStableNameTable();
@@ -1104,6 +1099,8 @@ initGcThreads (uint32_t from USED_IF_THREADS, uint32_t to USED_IF_THREADS)
     } else {
         gc_threads = stgMallocBytes (to * sizeof(gc_thread*),
                                      "initGcThreads");
+        initMutex(&gc_running_mutex);
+        initCondition(&gc_running_cv);
     }
 
     for (i = from; i < to; i++) {
@@ -1136,7 +1133,10 @@ freeGcThreads (void)
             }
             stgFree (gc_threads[i]);
         }
+        closeCondition(&gc_running_cv);
+        closeMutex(&gc_running_mutex);
         stgFree (gc_threads);
+
 #else
         for (g = 0; g < RtsFlags.GcFlags.generations; g++)
         {
@@ -1152,7 +1152,7 @@ freeGcThreads (void)
    Start GC threads
    ------------------------------------------------------------------------- */
 
-static volatile StgWord gc_running_threads;
+static StgWord gc_running_threads;
 
 static StgWord
 inc_running (void)
@@ -1167,100 +1167,95 @@ static StgWord
 dec_running (void)
 {
     ASSERT(RELAXED_LOAD(&gc_running_threads) != 0);
-    return atomic_dec(&gc_running_threads);
-}
-
-static bool
-any_work (void)
-{
-    int g;
-    gen_workspace *ws;
-
-    NONATOMIC_ADD(&gct->any_work, 1);
-
-    write_barrier();
-
-    // scavenge objects in compacted generation
-    if (mark_stack_bd != NULL && !mark_stack_empty()) {
-        return true;
-    }
+#if defined(THREADED_RTS)
+    ACQUIRE_LOCK(&gc_running_mutex);
+#endif
 
-    // Check for global work in any gen.  We don't need to check for
-    // local work, because we have already exited scavenge_loop(),
-    // which means there is no local work for this thread.
-    for (g = 0; g < (int)RtsFlags.GcFlags.generations; g++) {
-        ws = &gct->gens[g];
-        if (ws->todo_large_objects) return true;
-        if (!looksEmptyWSDeque(ws->todo_q)) return true;
-        if (ws->todo_overflow) return true;
-    }
+    StgWord r = atomic_dec(&gc_running_threads);
 
 #if defined(THREADED_RTS)
-    if (work_stealing) {
-        uint32_t n;
-        // look for work to steal
-        for (n = 0; n < n_gc_threads; n++) {
-            if (n == gct->thread_index) continue;
-            for (g = RtsFlags.GcFlags.generations-1; g >= 0; g--) {
-                ws = &gc_threads[n]->gens[g];
-                if (!looksEmptyWSDeque(ws->todo_q)) return true;
-            }
-        }
+    if (r == 0) {
+        broadcastCondition(&gc_running_cv);
     }
+    RELEASE_LOCK(&gc_running_mutex);
 #endif
 
-    gct->no_work++;
-#if defined(THREADED_RTS)
-    yieldThread();
-#endif
+    return r;
+}
 
-    return false;
+#if defined(THREADED_RTS)
+void
+notify_todo_block (void)
+{
+    if (work_stealing && n_gc_threads != SEQ_CST_LOAD(&gc_running_threads)) {
+        signalCondition(&gc_running_cv);
+    }
+        signalCondition(&gc_running_cv);
+    }
 }
+#endif
+
 
 static void
 scavenge_until_all_done (void)
 {
-    DEBUG_ONLY( uint32_t r );
-
+#if defined(THREADED_RTS) || defined(DEBUG)
+    uint32_t r;
+#endif
 
-loop:
+    for (;;) {
 #if defined(THREADED_RTS)
-    if (n_gc_threads > 1) {
-        scavenge_loop();
-    } else {
-        scavenge_loop1();
-    }
+        if (n_gc_threads > 1) {
+            scavenge_loop();
+        } else {
+            scavenge_loop1();
+        }
 #else
-    scavenge_loop();
+        scavenge_loop();
 #endif
 
-    collect_gct_blocks();
+        collect_gct_blocks();
 
-    // scavenge_loop() only exits when there's no work to do
+        // scavenge_loop() only exits when there's no work to do
 
-    // This atomic decrement also serves as a full barrier to ensure that any
-    // writes we made during scavenging are visible to other threads.
-#if defined(DEBUG)
-    r = dec_running();
+        // This atomic decrement also serves as a full barrier to ensure that any
+        // writes we made during scavenging are visible to other threads.
+#if defined(THREADED_RTS) || defined(DEBUG)
+        r = dec_running();
 #else
-    dec_running();
+        dec_running();
 #endif
 
-    traceEventGcIdle(gct->cap);
+        traceEventGcIdle(gct->cap);
 
-    debugTrace(DEBUG_gc, "%d GC threads still running", r);
+        debugTrace(DEBUG_gc, "%d GC threads still running", r);
 
-    while (SEQ_CST_LOAD(&gc_running_threads) != 0) {
-        // usleep(1);
-        if (any_work()) {
-            inc_running();
-            traceEventGcWork(gct->cap);
-            goto loop;
+        // If there's no hope of stealing more work, then there's nowhere
+        // else work can come from, and we are finished.
+#if defined(THREADED_RTS)
+        if (n_gc_threads > 1 && work_stealing && r != 0) {
+            NONATOMIC_ADD(&gct->any_work, 1);
+            ACQUIRE_LOCK(&gc_running_mutex);
+            // This load is SEQ_CST because it has not been established
+            // whether a weaker ordering would suffice.
+            r = SEQ_CST_LOAD(&gc_running_threads);
+            if (r != 0) {
+                waitCondition(&gc_running_cv, &gc_running_mutex);
+                // This load is SEQ_CST because it has not been established
+                // whether a weaker ordering would suffice.
+                r = SEQ_CST_LOAD(&gc_running_threads);
+            }
+            // Here, if r is 0 then all threads are finished.
+            // If r > 0 then either:
+            //  - waitCondition was subject to a spurious wakeup, or
+            //  - a worker thread just pushed a block to its todo_q,
+            // so we loop back, looking for more work.
+            RELEASE_LOCK(&gc_running_mutex);
+            if (r != 0) {
+                inc_running();
+                traceEventGcWork(gct->cap);
+                continue; // go round the for(;;) loop again
+            }
+            NONATOMIC_ADD(&gct->no_work, 1);
         }
-        // any_work() does not remove the work from the queue, it
-        // just checks for the presence of work.  If we find any,
-        // then we increment gc_running_threads and go back to
-        // scavenge_loop() to perform any pending work.
+#endif
+        break;
     }
 
     traceEventGcDone(gct->cap);
@@ -1394,9 +1389,7 @@ waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
 static void
 start_gc_threads (void)
 {
-#if defined(THREADED_RTS)
-    gc_running_threads = 0;
-#endif
+    SEQ_CST_STORE(&gc_running_threads, 0);
 }
 
 static void
@@ -1424,7 +1417,7 @@ wakeup_gc_threads (uint32_t me USED_IF_THREADS,
 
 // After GC is complete, we must wait for all GC threads to enter the
 // standby state, otherwise they may still be executing inside
-// any_work(), and may even remain awake until the next GC starts.
+// scavenge_until_all_done(), and may even remain awake until the next GC starts.
 static void
 shutdown_gc_threads (uint32_t me USED_IF_THREADS,
                      bool idle_cap[] USED_IF_THREADS)
@@ -2076,3 +2069,22 @@ bool doIdleGCWork(Capability *cap STG_UNUSED, bool all)
 {
     return runSomeFinalizers(all);
 }
+
+
+/* Note [gc_running_mutex]
+ *
+ * TODO finish and polish
+ *
+ *  - gc_running_mutex guards gc_running_threads
+ *  - gc_running_cv is the associated condition variable
+ *  - scavenge_until_all_done works until there's no more work, then acquires
+ *    gc_running_mutex and waits on gc_running_cv
+ *  - on wakeup, if gc_running_threads > 0, it resumes looking for work
+ *  - gc_running_cv is signalled when a worker thread pushes a block to its
+ *    todo_q and (gc_running_threads < n_gc_threads)
+ *   - this signal is fired while gc_running_mutex is NOT held, so it is a
+ *     bit racy. That is OK, because the next block pushed will signal again.
+ *  - gc_running_cv is broadcast when dec_running reaches 0
+ *  - the GC leader calls shutdown_gc_threads before it starts its serial
+ *    collections, so no GC thread can observe gc_running_threads > 0 once
+ *    dec_running broadcasts.
+ */
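
A minimal, self-contained sketch of the wait/notify protocol described in the note above, written against plain pthreads rather than the RTS primitives. All names here (N_WORKERS, running_workers, work_queue_pop, push_work_and_notify, scavenge_worker) are hypothetical stand-ins and the work queue is stubbed out, so this illustrates only the synchronisation shape, not the RTS implementation:

#include <pthread.h>
#include <stddef.h>

#define N_WORKERS 4

static pthread_mutex_t running_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  running_cv    = PTHREAD_COND_INITIALIZER;
static unsigned running_workers = N_WORKERS;   /* guarded by running_mutex */

/* Stubs standing in for the scavenging machinery: a real implementation
 * would pop blocks from a work-stealing deque and scavenge them. */
static void *work_queue_pop(void) { return NULL; }
static void  do_scavenge(void *block) { (void)block; }

/* Producer side: after publishing a block of work, wake one sleeping worker,
 * but only if some worker might be asleep.  Like notify_todo_block(), the
 * count is read and the signal sent without holding running_mutex; a missed
 * wakeup is tolerated because the next push signals again. */
static void push_work_and_notify(void)
{
    if (running_workers != N_WORKERS) {
        pthread_cond_signal(&running_cv);
    }
}

/* Worker side: scavenge until there is no local work, then either sleep
 * waiting for more work or finish once every worker has gone idle. */
static void *scavenge_worker(void *arg)
{
    (void)arg;
    for (;;) {
        void *block;
        while ((block = work_queue_pop()) != NULL) {
            do_scavenge(block);
        }

        pthread_mutex_lock(&running_mutex);
        running_workers--;                      /* cf. dec_running() */
        if (running_workers == 0) {
            /* Last worker out: release everyone still waiting. */
            pthread_cond_broadcast(&running_cv);
            pthread_mutex_unlock(&running_mutex);
            return NULL;
        }
        pthread_cond_wait(&running_cv, &running_mutex);
        unsigned still_running = running_workers;
        pthread_mutex_unlock(&running_mutex);

        if (still_running == 0) {
            return NULL;                        /* everyone has finished */
        }
        /* Spurious wakeup or new work was pushed: count ourselves as
         * running again and go back to scavenging. */
        pthread_mutex_lock(&running_mutex);
        running_workers++;                      /* cf. inc_running() */
        pthread_mutex_unlock(&running_mutex);
    }
}

int main(void)
{
    pthread_t tids[N_WORKERS];
    for (int i = 0; i < N_WORKERS; i++) {
        pthread_create(&tids[i], NULL, scavenge_worker, NULL);
    }
    push_work_and_notify();                     /* harmless here: no work */
    for (int i = 0; i < N_WORKERS; i++) {
        pthread_join(tids[i], NULL);
    }
    return 0;
}
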
diff --git a/rts/sm/GC.h b/rts/sm/GC.h
index 2c2d14a7d29798184e5401e1703594ff69af52ec..d3aab1d219e96bac70b5291fd4980a878d76e84d 100644
--- a/rts/sm/GC.h
+++ b/rts/sm/GC.h
@@ -76,6 +76,7 @@ void freeGcThreads (void);
 void resizeGenerations (void);
 
 #if defined(THREADED_RTS)
+void notify_todo_block(void);
 void waitForGcThreads (Capability *cap, bool idle_cap[]);
 void releaseGCThreads (Capability *cap, bool idle_cap[]);
 #endif
diff --git a/rts/sm/GCUtils.c b/rts/sm/GCUtils.c
index d58fdc48ae04755a75f23722bb45347ac4828efe..fe278c9dfcfbe63abad753eaa789212b3c421d09 100644
--- a/rts/sm/GCUtils.c
+++ b/rts/sm/GCUtils.c
@@ -123,12 +123,22 @@ steal_todo_block (uint32_t g)
 {
     uint32_t n;
     bdescr *bd;
+    WSDeque* q;
 
     // look for work to steal
     for (n = 0; n < n_gc_threads; n++) {
         if (n == gct->thread_index) continue;
-        bd = stealWSDeque(gc_threads[n]->gens[g].todo_q);
+        q = gc_threads[n]->gens[g].todo_q;
+        bd = stealWSDeque(q);
         if (bd) {
+            // TODO: the commented-out block below may be a good idea:
+            // gc_running_cv is signalled when a block is pushed, and we
+            // could signal here as well, because we've stolen a block and
+            // there is something left to steal.  However, given that we use
+            // (gc_running_threads < n_gc_threads) to decide whether to
+            // signal, I believe the code below is unnecessary.
+            // if(!looksEmptyWSDeque(q)) {
+            //     notify_todo_block();
+            // }
             return bd;
         }
     }
@@ -168,6 +178,19 @@ push_scanned_block (bdescr *bd, gen_workspace *ws)
     }
 }
 
+void
+push_todo_block (bdescr *bd, gen_workspace *ws)
+{
+    ASSERT(bd->link == NULL);
+    if (!pushWSDeque(ws->todo_q, bd)) {
+        bd->link = ws->todo_overflow;
+        ws->todo_overflow = bd;
+        ws->n_todo_overflow++;
+    }
+#if defined(THREADED_RTS)
+    notify_todo_block();
+#endif
+}
+
 /* Note [big objects]
 
    We can get an ordinary object (CONSTR, FUN, THUNK etc.) that is
@@ -277,17 +300,15 @@ todo_block_full (uint32_t size, gen_workspace *ws)
         // Otherwise, push this block out to the global list.
         else
         {
+            // TODO: move this debugTrace into push_todo_block? We would then
+            // need to do something about refill_wsdeque spamming it.
             DEBUG_ONLY( generation *gen );
             DEBUG_ONLY( gen = ws->gen );
             debugTrace(DEBUG_gc, "push todo block %p (%ld words), step %d, todo_q: %ld",
                   bd->start, (unsigned long)(bd->free - bd->u.scan),
                   gen->no, dequeElements(ws->todo_q));
 
-            if (!pushWSDeque(ws->todo_q, bd)) {
-                bd->link = ws->todo_overflow;
-                ws->todo_overflow = bd;
-                ws->n_todo_overflow++;
-            }
+            push_todo_block(bd, ws);
         }
     }
 
diff --git a/rts/sm/GCUtils.h b/rts/sm/GCUtils.h
index 798a795deb5d349dbc682be4e51b3daee295d01a..13f3a87521af2ded3c030712e3bf069ee9a67b02 100644
--- a/rts/sm/GCUtils.h
+++ b/rts/sm/GCUtils.h
@@ -34,6 +34,7 @@ void    freeChain_sync(bdescr *bd);
 void    freeGroup_sync(bdescr *bd);
 
 void    push_scanned_block   (bdescr *bd, gen_workspace *ws);
+void    push_todo_block      (bdescr *bd, gen_workspace *ws);
 StgPtr  todo_block_full      (uint32_t size, gen_workspace *ws);
 StgPtr  alloc_todo_block     (gen_workspace *ws, uint32_t size);
 
diff --git a/rts/sm/Scav.c b/rts/sm/Scav.c
index c48fc83248bf1674159d38f633161a71f0a59907..0a00423b96f52b6f53a6738d0da651d22fb9df4f 100644
--- a/rts/sm/Scav.c
+++ b/rts/sm/Scav.c
@@ -2084,18 +2084,7 @@ static StgWord refill_wsdeque(gen_workspace *ws)
         bdescr* temp2 = temp;
         temp = temp2->link;
         temp2->link = NULL;
-        if(!pushWSDeque(ws->todo_q, temp2)) {
-            temp2->link = temp;
-            temp = temp2;
-            break;
-        }
-    }
-    while(temp != NULL) {
-        bdescr* temp2 = temp;
-        temp = temp2->link;
-        temp2->link = ws->todo_overflow;
-        ws->todo_overflow = temp2;
-        ws->n_todo_overflow++;
+        push_todo_block(temp2, ws);
     }
     return n_todo_overflow;
 }
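
A hedged, self-contained sketch of the pattern push_todo_block() implements: try the bounded work-stealing deque first, fall back to the thread-private overflow list when it is full, and notify idle workers afterwards. The types and names here (Block, Deque, Workspace, deque_try_push, notify_workers, push_block) are simplified stand-ins, not the RTS's bdescr/WSDeque API:

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

typedef struct Block { struct Block *link; } Block;

/* A toy bounded deque: pushes fail when it is full, which is the case
 * push_todo_block() has to handle via the overflow list. */
typedef struct {
    Block  *slots[64];
    size_t  count;
} Deque;

typedef struct {
    Deque   todo_q;
    Block  *todo_overflow;
    size_t  n_todo_overflow;
} Workspace;

static bool deque_try_push(Deque *q, Block *b)
{
    if (q->count == sizeof(q->slots) / sizeof(q->slots[0])) {
        return false;
    }
    q->slots[q->count++] = b;
    return true;
}

static void notify_workers(void) { /* cf. notify_todo_block() */ }

/* Mirror of push_todo_block(): prefer the deque, where other GC threads can
 * steal the block; fall back to the private overflow list when the deque is
 * full; and always notify, so an idle worker can wake up and look for
 * something to steal. */
static void push_block(Workspace *ws, Block *bd)
{
    assert(bd->link == NULL);
    if (!deque_try_push(&ws->todo_q, bd)) {
        bd->link = ws->todo_overflow;
        ws->todo_overflow = bd;
        ws->n_todo_overflow++;
    }
    notify_workers();
}

int main(void)
{
    static Workspace ws;        /* zero-initialised */
    static Block blocks[100];
    for (size_t i = 0; i < 100; i++) {
        push_block(&ws, &blocks[i]);
    }
    /* 64 blocks land in the deque, the remaining 36 on the overflow list. */
    assert(ws.todo_q.count == 64 && ws.n_todo_overflow == 36);
    return 0;
}
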