......@@ -940,19 +940,23 @@
return (dst);
//
// Nonmoving write barrier helpers
//
// See Note [Update remembered set] in NonMovingMark.c.
#if defined(THREADED_RTS)
#define IF_WRITE_BARRIER_ENABLED \
#define IF_NONMOVING_WRITE_BARRIER_ENABLED \
if (W_[nonmoving_write_barrier_enabled] != 0) (likely: False)
#else
// A similar measure is also taken in rts/NonMoving.h, but that isn't visible from C--
#define IF_WRITE_BARRIER_ENABLED \
#define IF_NONMOVING_WRITE_BARRIER_ENABLED \
if (0)
#define nonmoving_write_barrier_enabled 0
#endif
// A useful helper for pushing a pointer to the update remembered set.
// See Note [Update remembered set] in NonMovingMark.c.
#define updateRemembSetPushPtr(p) \
IF_WRITE_BARRIER_ENABLED { \
IF_NONMOVING_WRITE_BARRIER_ENABLED { \
ccall updateRemembSetPushClosure_(BaseReg "ptr", p "ptr"); \
}
......@@ -74,6 +74,10 @@ extern "C" {
#define RTS_UNREACHABLE abort()
#endif
/* Prefetch primitives */
#define prefetchForRead(ptr) __builtin_prefetch(ptr, 0)
#define prefetchForWrite(ptr) __builtin_prefetch(ptr, 1)
/* Fix for mingw stat problem (done here so it's early enough) */
#if defined(mingw32_HOST_OS)
#define __MSVCRT__ 1
......
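As an aside, here is a minimal usage sketch for the prefetch helpers added above (hypothetical list-walking code, not part of this patch); the hint is purely advisory and expands to __builtin_prefetch as defined above.
/* Hypothetical sketch: prefetch the next list cell one step ahead of the
 * cell currently being summed.  Assumes the prefetchForRead macro above is
 * in scope (e.g. via Rts.h). */
struct IntCell { struct IntCell *link; int value; };
static int sumCells(const struct IntCell *c)
{
    int acc = 0;
    while (c) {
        if (c->link) prefetchForRead(c->link);  /* read-only prefetch hint */
        acc += c->value;
        c = c->link;
    }
    return acc;
}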
......@@ -151,6 +151,23 @@ typedef struct GCDetails_ {
Time cpu_ns;
// The time elapsed during GC itself
Time elapsed_ns;
//
// Concurrent garbage collector
//
// The CPU time used during the post-mark pause phase of the concurrent
// nonmoving GC.
Time nonmoving_gc_sync_cpu_ns;
// The time elapsed during the post-mark pause phase of the concurrent
// nonmoving GC.
Time nonmoving_gc_sync_elapsed_ns;
// The CPU time used by the concurrent nonmoving GC.
Time nonmoving_gc_cpu_ns;
// The time elapsed during the concurrent nonmoving GC.
Time nonmoving_gc_elapsed_ns;
} GCDetails;
//
......@@ -241,6 +258,28 @@ typedef struct _RTSStats {
// The number of times a GC thread has iterated its outer loop across all
// parallel GCs
uint64_t scav_find_work;
// ----------------------------------
// Concurrent garbage collector
// The CPU time used during the post-mark pause phase of the concurrent
// nonmoving GC.
Time nonmoving_gc_sync_cpu_ns;
// The time elapsed during the post-mark pause phase of the concurrent
// nonmoving GC.
Time nonmoving_gc_sync_elapsed_ns;
// The maximum time elapsed during the post-mark pause phase of the
// concurrent nonmoving GC.
Time nonmoving_gc_sync_max_elapsed_ns;
// The CPU time used by the concurrent nonmoving GC.
Time nonmoving_gc_cpu_ns;
// The time elapsed during the concurrent nonmoving GC.
Time nonmoving_gc_elapsed_ns;
// The maximum time elapsed during any single cycle of the concurrent
// nonmoving GC.
Time nonmoving_gc_max_elapsed_ns;
} RTSStats;
void getRTSStats (RTSStats *s);
......
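A minimal sketch of how a program embedding the RTS might read the new counters through getRTSStats (field names as declared above; times are in nanoseconds; the reporting helper itself is hypothetical and must only be called after the RTS has been initialised).
#include "Rts.h"   /* brings in RtsAPI.h, where RTSStats and getRTSStats live */
#include <stdio.h>
/* Hypothetical helper: report the nonmoving collector's pause and
 * concurrent-phase times accumulated so far. */
static void reportNonmovingGcTimes(void)
{
    RTSStats s;
    getRTSStats(&s);
    printf("nonmoving sync pauses: %.3fs elapsed, %.3fs max\n",
           (double) s.nonmoving_gc_sync_elapsed_ns / 1e9,
           (double) s.nonmoving_gc_sync_max_elapsed_ns / 1e9);
    printf("nonmoving concurrent:  %.3fs CPU, %.3fs elapsed\n",
           (double) s.nonmoving_gc_cpu_ns / 1e9,
           (double) s.nonmoving_gc_elapsed_ns / 1e9);
}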
......@@ -182,12 +182,21 @@
#define EVENT_USER_BINARY_MSG 181
#define EVENT_CONC_MARK_BEGIN 200
#define EVENT_CONC_MARK_END 201
#define EVENT_CONC_SYNC_BEGIN 202
#define EVENT_CONC_SYNC_END 203
#define EVENT_CONC_SWEEP_BEGIN 204
#define EVENT_CONC_SWEEP_END 205
#define EVENT_CONC_UPD_REM_SET_FLUSH 206
#define EVENT_NONMOVING_HEAP_CENSUS 207
/*
* The highest event code +1 that ghc itself emits. Note that some event
* ranges higher than this are reserved but not currently emitted by ghc.
* This must match the size of the EventDesc[] array in EventLog.c
*/
#define NUM_GHC_EVENT_TAGS 182
#define NUM_GHC_EVENT_TAGS 208
#if 0 /* DEPRECATED EVENTS: */
/* we don't actually need to record the thread, it's implicit */
......
......@@ -169,6 +169,7 @@ typedef struct _TRACE_FLAGS {
bool timestamp; /* show timestamp in stderr output */
bool scheduler; /* trace scheduler events */
bool gc; /* trace GC events */
bool nonmoving_gc; /* trace nonmoving GC events */
bool sparks_sampled; /* trace spark events by a sampled method */
bool sparks_full; /* trace spark events 100% accurately */
bool user; /* trace user events (emitted from Haskell code) */
......
......@@ -21,4 +21,7 @@ void updateRemembSetPushClosure(Capability *cap, StgClosure *p);
void updateRemembSetPushThunk_(StgRegTable *reg, StgThunk *p);
// Note that RTS code should not condition on this directly, but should
// instead use the IF_NONMOVING_WRITE_BARRIER_ENABLED macro to ensure that
// the barrier is eliminated in the non-threaded RTS.
extern StgWord DLL_IMPORT_DATA_VAR(nonmoving_write_barrier_enabled);
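The intended pattern, mirroring the call sites rewritten later in this commit (for example in resumeThread below), is to push the value that is about to be overwritten before mutating and let the macro compile away in the non-threaded RTS. Sketch only; obj, field and new_value are placeholders.
/* Sketch of the usage pattern described above; obj/field/new_value are
 * placeholders, not code from this patch. */
IF_NONMOVING_WRITE_BARRIER_ENABLED {
    // Record the old value so the concurrent mark still sees it even though
    // the mutator is about to drop this reference to it.
    updateRemembSetPushClosure(cap, (StgClosure *) obj->field);
}
obj->field = new_value;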
......@@ -88,17 +88,23 @@ typedef struct bdescr_ {
StgPtr start; // [READ ONLY] start addr of memory
StgPtr free; // First free byte of memory.
// allocGroup() sets this to the value of start.
// NB. during use this value should lie
// between start and start + blocks *
// BLOCK_SIZE. Values outside this
// range are reserved for use by the
// block allocator. In particular, the
// value (StgPtr)(-1) is used to
// indicate that a block is unallocated.
//
// Unused by the non-moving allocator.
union {
StgPtr free; // First free byte of memory.
// allocGroup() sets this to the value of start.
// NB. during use this value should lie
// between start and start + blocks *
// BLOCK_SIZE. Values outside this
// range are reserved for use by the
// block allocator. In particular, the
// value (StgPtr)(-1) is used to
// indicate that a block is unallocated.
//
// Unused by the non-moving allocator.
struct NonmovingSegmentInfo {
StgWord8 log_block_size;
StgWord16 next_free_snap;
} nonmoving_segment;
};
struct bdescr_ *link; // used for chaining blocks together
......
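For blocks owned by the nonmoving heap the free pointer is meaningless, and the anonymous union above lets the same space carry per-segment metadata instead. A hypothetical accessor illustrating the overlap (assuming, as the field name suggests, that log_block_size is the base-2 logarithm of the segment's allocation block size):
/* Hypothetical sketch: which union member is valid depends on which
 * allocator owns the block. */
static StgWord segmentBlockSize(const bdescr *bd)
{
    // Only meaningful for blocks holding nonmoving segments; for blocks in
    // the moving heap this storage is the `free' pointer instead.
    return (StgWord) 1 << bd->nonmoving_segment.log_block_size;
}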
......@@ -231,6 +231,9 @@ typedef struct StgTSO_ {
* setting the stack's mark bit (either the BF_MARKED bit for large objects
* or otherwise its bit in its segment's mark bitmap).
*
* To ensure that mutation does not proceed until the stack is fully marked,
* the mark phase must not set the mark bit until it has finished tracing.
*
*/
#define STACK_DIRTY 1
......
......@@ -292,6 +292,8 @@ data TraceFlags = TraceFlags
, timestamp :: Bool -- ^ show timestamp in stderr output
, traceScheduler :: Bool -- ^ trace scheduler events
, traceGc :: Bool -- ^ trace GC events
, traceNonmovingGc
:: Bool -- ^ trace nonmoving GC heap census samples
, sparksSampled :: Bool -- ^ trace spark events by a sampled method
, sparksFull :: Bool -- ^ trace spark events 100% accurately
, user :: Bool -- ^ trace user events (emitted from Haskell code)
......@@ -525,6 +527,8 @@ getTraceFlags = do
(#{peek TRACE_FLAGS, scheduler} ptr :: IO CBool))
<*> (toBool <$>
(#{peek TRACE_FLAGS, gc} ptr :: IO CBool))
<*> (toBool <$>
(#{peek TRACE_FLAGS, nonmoving_gc} ptr :: IO CBool))
<*> (toBool <$>
(#{peek TRACE_FLAGS, sparks_sampled} ptr :: IO CBool))
<*> (toBool <$>
......
......@@ -103,6 +103,25 @@ data RTSStats = RTSStats {
-- | Total elapsed time (at the previous GC)
, elapsed_ns :: RtsTime
-- | The CPU time used during the post-mark pause phase of the concurrent
-- nonmoving GC.
, nonmoving_gc_sync_cpu_ns :: RtsTime
-- | The time elapsed during the post-mark pause phase of the concurrent
-- nonmoving GC.
, nonmoving_gc_sync_elapsed_ns :: RtsTime
-- | The maximum time elapsed during the post-mark pause phase of the
-- concurrent nonmoving GC.
, nonmoving_gc_sync_max_elapsed_ns :: RtsTime
-- | The CPU time used by the concurrent nonmoving GC.
, nonmoving_gc_cpu_ns :: RtsTime
-- | The time elapsed during the concurrent nonmoving GC.
, nonmoving_gc_elapsed_ns :: RtsTime
-- | The maximum time elapsed during any single cycle of the concurrent
-- nonmoving GC.
, nonmoving_gc_max_elapsed_ns :: RtsTime
-- | Details about the most recent GC
, gc :: GCDetails
} deriving ( Read -- ^ @since 4.10.0.0
......@@ -146,6 +165,13 @@ data GCDetails = GCDetails {
, gcdetails_cpu_ns :: RtsTime
-- | The time elapsed during GC itself
, gcdetails_elapsed_ns :: RtsTime
-- | The CPU time used during the post-mark pause phase of the concurrent
-- nonmoving GC.
, gcdetails_nonmoving_gc_sync_cpu_ns :: RtsTime
-- | The time elapsed during the post-mark pause phase of the concurrent
-- nonmoving GC.
, gcdetails_nonmoving_gc_sync_elapsed_ns :: RtsTime
} deriving ( Read -- ^ @since 4.10.0.0
, Show -- ^ @since 4.10.0.0
)
......@@ -192,6 +218,12 @@ getRTSStats = do
gc_elapsed_ns <- (# peek RTSStats, gc_elapsed_ns) p
cpu_ns <- (# peek RTSStats, cpu_ns) p
elapsed_ns <- (# peek RTSStats, elapsed_ns) p
nonmoving_gc_sync_cpu_ns <- (# peek RTSStats, nonmoving_gc_sync_cpu_ns) p
nonmoving_gc_sync_elapsed_ns <- (# peek RTSStats, nonmoving_gc_sync_elapsed_ns) p
nonmoving_gc_sync_max_elapsed_ns <- (# peek RTSStats, nonmoving_gc_sync_max_elapsed_ns) p
nonmoving_gc_cpu_ns <- (# peek RTSStats, nonmoving_gc_cpu_ns) p
nonmoving_gc_elapsed_ns <- (# peek RTSStats, nonmoving_gc_elapsed_ns) p
nonmoving_gc_max_elapsed_ns <- (# peek RTSStats, nonmoving_gc_max_elapsed_ns) p
let pgc = (# ptr RTSStats, gc) p
gc <- do
gcdetails_gen <- (# peek GCDetails, gen) pgc
......@@ -211,5 +243,7 @@ getRTSStats = do
gcdetails_sync_elapsed_ns <- (# peek GCDetails, sync_elapsed_ns) pgc
gcdetails_cpu_ns <- (# peek GCDetails, cpu_ns) pgc
gcdetails_elapsed_ns <- (# peek GCDetails, elapsed_ns) pgc
gcdetails_nonmoving_gc_sync_cpu_ns <- (# peek GCDetails, nonmoving_gc_sync_cpu_ns) pgc
gcdetails_nonmoving_gc_sync_elapsed_ns <- (# peek GCDetails, nonmoving_gc_sync_elapsed_ns) pgc
return GCDetails{..}
return RTSStats{..}
......@@ -13,6 +13,7 @@
void initializeTimer (void);
Time getProcessCPUTime (void);
Time getMyThreadCPUTime (void);
void getProcessTimes (Time *user, Time *elapsed);
/* Get the current date and time.
......
......@@ -256,7 +256,7 @@ loop:
// point to the BLOCKING_QUEUE from the BLACKHOLE
write_barrier(); // make the BQ visible
if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
IF_NONMOVING_WRITE_BARRIER_ENABLED {
updateRemembSetPushClosure(cap, (StgClosure*)p);
}
((StgInd*)bh)->indirectee = (StgClosure *)bq;
......@@ -287,7 +287,7 @@ loop:
}
#endif
if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
IF_NONMOVING_WRITE_BARRIER_ENABLED {
// We are about to overwrite bq->queue; make sure its current value
// makes it into the update remembered set
updateRemembSetPushClosure(cap, (StgClosure*)bq->queue);
......
......@@ -474,7 +474,7 @@ stg_copyArray_barrier ( W_ hdr_size, gcptr dst, W_ dst_off, W_ n)
end = p + WDS(n);
again:
IF_WRITE_BARRIER_ENABLED {
IF_NONMOVING_WRITE_BARRIER_ENABLED {
ccall updateRemembSetPushClosure_(BaseReg "ptr", W_[p] "ptr");
}
p = p + WDS(1);
......@@ -490,7 +490,7 @@ stg_copySmallArrayzh ( gcptr src, W_ src_off, gcptr dst, W_ dst_off, W_ n)
W_ dst_p, src_p, bytes;
if (n > 0) {
IF_WRITE_BARRIER_ENABLED {
IF_NONMOVING_WRITE_BARRIER_ENABLED {
call stg_copyArray_barrier(SIZEOF_StgSmallMutArrPtrs,
dst, dst_off, n);
}
......@@ -511,7 +511,7 @@ stg_copySmallMutableArrayzh ( gcptr src, W_ src_off, gcptr dst, W_ dst_off, W_ n
W_ dst_p, src_p, bytes;
if (n > 0) {
IF_WRITE_BARRIER_ENABLED {
IF_NONMOVING_WRITE_BARRIER_ENABLED {
call stg_copyArray_barrier(SIZEOF_StgSmallMutArrPtrs,
dst, dst_off, n);
}
......
......@@ -218,6 +218,7 @@ void initRtsFlagsDefaults(void)
RtsFlags.TraceFlags.timestamp = false;
RtsFlags.TraceFlags.scheduler = false;
RtsFlags.TraceFlags.gc = false;
RtsFlags.TraceFlags.nonmoving_gc = false;
RtsFlags.TraceFlags.sparks_sampled= false;
RtsFlags.TraceFlags.sparks_full = false;
RtsFlags.TraceFlags.user = false;
......@@ -2103,6 +2104,10 @@ static void read_trace_flags(const char *arg)
RtsFlags.TraceFlags.gc = enabled;
enabled = true;
break;
case 'n':
RtsFlags.TraceFlags.nonmoving_gc = enabled;
enabled = true;
break;
case 'u':
RtsFlags.TraceFlags.user = enabled;
enabled = true;
......
......@@ -297,8 +297,10 @@ static StgClosure *lock_tvar(Capability *cap,
} while (cas((void *)&(s -> current_value),
(StgWord)result, (StgWord)trec) != (StgWord)result);
if (RTS_UNLIKELY(nonmoving_write_barrier_enabled && result)) {
updateRemembSetPushClosure(cap, result);
IF_NONMOVING_WRITE_BARRIER_ENABLED {
if (result)
updateRemembSetPushClosure(cap, result);
}
return result;
}
......@@ -323,8 +325,9 @@ static StgBool cond_lock_tvar(Capability *cap,
TRACE("%p : cond_lock_tvar(%p, %p)", trec, s, expected);
w = cas((void *)&(s -> current_value), (StgWord)expected, (StgWord)trec);
result = (StgClosure *)w;
if (RTS_UNLIKELY(nonmoving_write_barrier_enabled && result)) {
updateRemembSetPushClosure(cap, expected);
IF_NONMOVING_WRITE_BARRIER_ENABLED {
if (result)
updateRemembSetPushClosure(cap, expected);
}
TRACE("%p : %s", trec, result ? "success" : "failure");
return (result == expected);
......
......@@ -164,7 +164,8 @@ static void scheduleHandleThreadBlocked( StgTSO *t );
static bool scheduleHandleThreadFinished( Capability *cap, Task *task,
StgTSO *t );
static bool scheduleNeedHeapProfile(bool ready_to_gc);
static void scheduleDoGC(Capability **pcap, Task *task, bool force_major);
static void scheduleDoGC( Capability **pcap, Task *task,
bool force_major, bool deadlock_detect );
static void deleteThread (StgTSO *tso);
static void deleteAllThreads (void);
......@@ -264,7 +265,7 @@ schedule (Capability *initialCapability, Task *task)
case SCHED_INTERRUPTING:
debugTrace(DEBUG_sched, "SCHED_INTERRUPTING");
/* scheduleDoGC() deletes all the threads */
scheduleDoGC(&cap,task,true);
scheduleDoGC(&cap,task,true,false);
// after scheduleDoGC(), we must be shutting down. Either some
// other Capability did the final GC, or we did it above,
......@@ -561,7 +562,7 @@ run_thread:
}
if (ready_to_gc || scheduleNeedHeapProfile(ready_to_gc)) {
scheduleDoGC(&cap,task,false);
scheduleDoGC(&cap,task,false,false);
}
} /* end of while() */
}
......@@ -935,7 +936,7 @@ scheduleDetectDeadlock (Capability **pcap, Task *task)
// they are unreachable and will therefore be sent an
// exception. Any threads thus released will be immediately
// runnable.
scheduleDoGC (pcap, task, true/*force major GC*/);
scheduleDoGC (pcap, task, true/*force major GC*/, true/*deadlock detection*/);
cap = *pcap;
// when force_major == true. scheduleDoGC sets
// recent_activity to ACTIVITY_DONE_GC and turns off the timer
......@@ -1005,7 +1006,7 @@ scheduleProcessInbox (Capability **pcap USED_IF_THREADS)
while (!emptyInbox(cap)) {
// Executing messages might use heap, so we should check for GC.
if (doYouWantToGC(cap)) {
scheduleDoGC(pcap, cap->running_task, false);
scheduleDoGC(pcap, cap->running_task, false, false);
cap = *pcap;
}
......@@ -1552,9 +1553,11 @@ void releaseAllCapabilities(uint32_t n, Capability *keep_cap, Task *task)
* Perform a garbage collection if necessary
* -------------------------------------------------------------------------- */
// N.B. See Note [Deadlock detection under nonmoving collector] for rationale
// behind deadlock_detect argument.
static void
scheduleDoGC (Capability **pcap, Task *task USED_IF_THREADS,
bool force_major)
bool force_major, bool deadlock_detect)
{
Capability *cap = *pcap;
bool heap_census;
......@@ -1847,9 +1850,9 @@ delete_threads_and_gc:
// emerge they don't immediately re-enter the GC.
pending_sync = 0;
signalCondition(&sync_finished_cond);
GarbageCollect(collect_gen, heap_census, gc_type, cap, idle_cap);
GarbageCollect(collect_gen, heap_census, deadlock_detect, gc_type, cap, idle_cap);
#else
GarbageCollect(collect_gen, heap_census, 0, cap, NULL);
GarbageCollect(collect_gen, heap_census, deadlock_detect, 0, cap, NULL);
#endif
// If we're shutting down, don't leave any idle GC work to do.
......@@ -2500,7 +2503,7 @@ resumeThread (void *task_)
incall->suspended_tso = NULL;
incall->suspended_cap = NULL;
// we will modify tso->_link
if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
IF_NONMOVING_WRITE_BARRIER_ENABLED {
updateRemembSetPushClosure(cap, (StgClosure *)tso->_link);
}
tso->_link = END_TSO_QUEUE;
......@@ -2717,7 +2720,7 @@ exitScheduler (bool wait_foreign USED_IF_THREADS)
nonmovingExit();
Capability *cap = task->cap;
waitForCapability(&cap,task);
scheduleDoGC(&cap,task,true);
scheduleDoGC(&cap,task,true,false);
ASSERT(task->incall->tso == NULL);
releaseCapability(cap);
}
......@@ -2785,7 +2788,7 @@ performGC_(bool force_major)
// TODO: do we need to traceTask*() here?
waitForCapability(&cap,task);
scheduleDoGC(&cap,task,force_major);
scheduleDoGC(&cap,task,force_major,false);
releaseCapability(cap);
boundTaskExiting(task);
}
......
......@@ -33,7 +33,9 @@ static Time
end_init_cpu, end_init_elapsed,
start_exit_cpu, start_exit_elapsed,
start_exit_gc_elapsed, start_exit_gc_cpu,
end_exit_cpu, end_exit_elapsed;
end_exit_cpu, end_exit_elapsed,
start_nonmoving_gc_cpu, start_nonmoving_gc_elapsed,
start_nonmoving_gc_sync_elapsed;
#if defined(PROFILING)
static Time RP_start_time = 0, RP_tot_time = 0; // retainer prof user time
......@@ -84,7 +86,7 @@ Time stat_getElapsedTime(void)
double
mut_user_time_until( Time t )
{
return TimeToSecondsDbl(t - stats.gc_cpu_ns);
return TimeToSecondsDbl(t - stats.gc_cpu_ns - stats.nonmoving_gc_cpu_ns);
// heapCensus() time is included in GC_tot_cpu, so we don't need
// to subtract it here.
......@@ -125,6 +127,10 @@ initStats0(void)
end_init_cpu = 0;
end_init_elapsed = 0;
start_nonmoving_gc_cpu = 0;
start_nonmoving_gc_elapsed = 0;
start_nonmoving_gc_sync_elapsed = 0;
start_exit_cpu = 0;
start_exit_elapsed = 0;
start_exit_gc_cpu = 0;
......@@ -175,6 +181,11 @@ initStats0(void)
.gc_elapsed_ns = 0,
.cpu_ns = 0,
.elapsed_ns = 0,
.nonmoving_gc_cpu_ns = 0,
.nonmoving_gc_elapsed_ns = 0,
.nonmoving_gc_max_elapsed_ns = 0,
.nonmoving_gc_sync_elapsed_ns = 0,
.nonmoving_gc_sync_max_elapsed_ns = 0,
.gc = {
.gen = 0,
.threads = 0,
......@@ -189,7 +200,10 @@ initStats0(void)
.par_balanced_copied_bytes = 0,
.sync_elapsed_ns = 0,
.cpu_ns = 0,
.elapsed_ns = 0
.elapsed_ns = 0,
.nonmoving_gc_cpu_ns = 0,
.nonmoving_gc_elapsed_ns = 0,
.nonmoving_gc_sync_elapsed_ns = 0,
}
};
}
......@@ -274,6 +288,11 @@ stat_startExit(void)
start_exit_gc_cpu = stats.gc_cpu_ns;
}
/* -----------------------------------------------------------------------------
Nonmoving (concurrent) collector statistics
These measure the time taken by the concurrent mark & sweep collector, both its concurrent phase and its post-mark synchronisation pauses.
-------------------------------------------------------------------------- */
void
stat_endExit(void)
{
......@@ -286,10 +305,87 @@ stat_startGCSync (gc_thread *gct)
gct->gc_sync_start_elapsed = getProcessElapsedTime();
}
void
stat_startNonmovingGc ()
{
start_nonmoving_gc_cpu = getMyThreadCPUTime();
start_nonmoving_gc_elapsed = getProcessElapsedTime();
}
void
stat_endNonmovingGc ()
{
Time cpu = getMyThreadCPUTime();
Time elapsed = getProcessElapsedTime();
stats.gc.nonmoving_gc_elapsed_ns = elapsed - start_nonmoving_gc_elapsed;
stats.nonmoving_gc_elapsed_ns += stats.gc.nonmoving_gc_elapsed_ns;
stats.gc.nonmoving_gc_cpu_ns = cpu - start_nonmoving_gc_cpu;
stats.nonmoving_gc_cpu_ns += stats.gc.nonmoving_gc_cpu_ns;
stats.nonmoving_gc_max_elapsed_ns =
stg_max(stats.gc.nonmoving_gc_elapsed_ns,
stats.nonmoving_gc_max_elapsed_ns);
}
void
stat_startNonmovingGcSync ()
{
start_nonmoving_gc_sync_elapsed = getProcessElapsedTime();
traceConcSyncBegin();
}
void
stat_endNonmovingGcSync ()
{
Time end_elapsed = getProcessElapsedTime();
stats.gc.nonmoving_gc_sync_elapsed_ns = end_elapsed - start_nonmoving_gc_sync_elapsed;
stats.nonmoving_gc_sync_elapsed_ns += stats.gc.nonmoving_gc_sync_elapsed_ns;
stats.nonmoving_gc_sync_max_elapsed_ns =
stg_max(stats.gc.nonmoving_gc_sync_elapsed_ns,
stats.nonmoving_gc_sync_max_elapsed_ns);
traceConcSyncEnd();
}
/* -----------------------------------------------------------------------------
Called at the beginning of each GC
-------------------------------------------------------------------------- */
/*
* GC CPU time is collected on a per-gc_thread basis: The CPU time of each GC
* thread worker is recorded in its gc_thread at the beginning and end of
* scavenging. These are then summed over at the end of the GC.
*
* By contrast, the elapsed time is recorded only by the thread driving the GC.
*
* Mutator time is derived from the process's CPU time, subtracting out
* contributions from stop-the-world and concurrent GCs.
*/
void
stat_startGCWorker (Capability *cap STG_UNUSED, gc_thread *gct)
{
bool stats_enabled =
RtsFlags.GcFlags.giveStats != NO_GC_STATS ||
rtsConfig.gcDoneHook != NULL;
if (stats_enabled || RtsFlags.ProfFlags.doHeapProfile) {
gct->gc_start_cpu = getMyThreadCPUTime();
}
}
void
stat_endGCWorker (Capability *cap STG_UNUSED, gc_thread *gct)
{
bool stats_enabled =
RtsFlags.GcFlags.giveStats != NO_GC_STATS ||
rtsConfig.gcDoneHook != NULL;
if (stats_enabled || RtsFlags.ProfFlags.doHeapProfile) {
gct->gc_end_cpu = getMyThreadCPUTime();
}
}
void
stat_startGC (Capability *cap, gc_thread *gct)
{
......@@ -297,7 +393,15 @@ stat_startGC (Capability *cap, gc_thread *gct)
debugBelch("\007");
}
getProcessTimes(&gct->gc_start_cpu, &gct->gc_start_elapsed);
bool stats_enabled =
RtsFlags.GcFlags.giveStats != NO_GC_STATS ||
rtsConfig.gcDoneHook != NULL;
if (stats_enabled || RtsFlags.ProfFlags.doHeapProfile) {
gct->gc_start_cpu = getMyThreadCPUTime();
}
gct->gc_start_elapsed = getProcessElapsedTime();
// Post EVENT_GC_START with the same timestamp as used for stats
// (though converted from Time=StgInt64 to EventTimestamp=StgWord64).
......@@ -320,9 +424,9 @@ stat_startGC (Capability *cap, gc_thread *gct)
-------------------------------------------------------------------------- */
void
stat_endGC (Capability *cap, gc_thread *gct, W_ live, W_ copied, W_ slop,
uint32_t gen, uint32_t par_n_threads, W_ par_max_copied,
W_ par_balanced_copied, W_ gc_spin_spin, W_ gc_spin_yield,
stat_endGC (Capability *cap, gc_thread *initiating_gct, W_ live, W_ copied, W_ slop,
uint32_t gen, uint32_t par_n_threads, gc_thread **gc_threads,
W_ par_max_copied, W_ par_balanced_copied, W_ gc_spin_spin, W_ gc_spin_yield,
W_ mut_spin_spin, W_ mut_spin_yield, W_ any_work, W_ no_work,
W_ scav_find_work)
{
......@@ -364,9 +468,13 @@ stat_endGC (Capability *cap, gc_thread *gct, W_ live, W_ copied, W_ slop,
stats.elapsed_ns = current_elapsed - start_init_elapsed;
stats.gc.sync_elapsed_ns =
gct->gc_start_elapsed - gct->gc_sync_start_elapsed;
stats.gc.elapsed_ns = current_elapsed - gct->gc_start_elapsed;
stats.gc.cpu_ns = current_cpu - gct->gc_start_cpu;
initiating_gct->gc_start_elapsed - initiating_gct->gc_sync_start_elapsed;
stats.gc.elapsed_ns = current_elapsed - initiating_gct->gc_start_elapsed;
stats.gc.cpu_ns = 0;
for (unsigned int i=0; i < par_n_threads; i++) {
gc_thread *gct = gc_threads[i];
stats.gc.cpu_ns += gct->gc_end_cpu - gct->gc_start_cpu;
}
}
// -------------------------------------------------
// Update the cumulative stats
......@@ -473,8 +581,8 @@ stat_endGC (Capability *cap, gc_thread *gct, W_ live, W_ copied, W_ slop,
TimeToSecondsDbl(stats.gc.elapsed_ns),
TimeToSecondsDbl(stats.cpu_ns),
TimeToSecondsDbl(stats.elapsed_ns),
faults - gct->gc_start_faults,
gct->gc_start_faults - GC_end_faults,
faults - initiating_gct->gc_start_faults,
initiating_gct->gc_start_faults - GC_end_faults,
gen);
GC_end_faults = faults;
......@@ -706,6 +814,21 @@ static void report_summary(const RTSSummaryStats* sum)
TimeToSecondsDbl(gen_stats->avg_pause_ns),
TimeToSecondsDbl(gen_stats->max_pause_ns));
}
if (RtsFlags.GcFlags.useNonmoving) {
const int n_major_colls = sum->gc_summary_stats[RtsFlags.GcFlags.generations-1].collections;
statsPrintf(" Gen 1 %5d syncs"
", %6.3fs %3.4fs %3.4fs\n",
n_major_colls,
TimeToSecondsDbl(stats.nonmoving_gc_sync_elapsed_ns),
TimeToSecondsDbl(stats.nonmoving_gc_sync_elapsed_ns) / n_major_colls,
TimeToSecondsDbl(stats.nonmoving_gc_sync_max_elapsed_ns));
statsPrintf(" Gen 1 concurrent"
", %6.3fs %6.3fs %3.4fs %3.4fs\n",
TimeToSecondsDbl(stats.nonmoving_gc_cpu_ns),
TimeToSecondsDbl(stats.nonmoving_gc_elapsed_ns),
TimeToSecondsDbl(stats.nonmoving_gc_elapsed_ns) / n_major_colls,
TimeToSecondsDbl(stats.nonmoving_gc_max_elapsed_ns));
}
statsPrintf("\n");
......@@ -742,6 +865,12 @@ static void report_summary(const RTSSummaryStats* sum)
statsPrintf(" GC time %7.3fs (%7.3fs elapsed)\n",
TimeToSecondsDbl(stats.gc_cpu_ns),
TimeToSecondsDbl(stats.gc_elapsed_ns));
if (RtsFlags.GcFlags.useNonmoving) {
statsPrintf(
" CONC GC time %7.3fs (%7.3fs elapsed)\n",
TimeToSecondsDbl(stats.nonmoving_gc_cpu_ns),
TimeToSecondsDbl(stats.nonmoving_gc_elapsed_ns));
}
#if defined(PROFILING)
statsPrintf(" RP time %7.3fs (%7.3fs elapsed)\n",
......@@ -1100,7 +1229,8 @@ stat_exit (void)
stats.mutator_cpu_ns = start_exit_cpu
- end_init_cpu
- (stats.gc_cpu_ns - exit_gc_cpu);
- (stats.gc_cpu_ns - exit_gc_cpu)
- stats.nonmoving_gc_cpu_ns;
stats.mutator_elapsed_ns = start_exit_elapsed
- end_init_elapsed
- (stats.gc_elapsed_ns - exit_gc_elapsed);
......@@ -1510,7 +1640,8 @@ void getRTSStats( RTSStats *s )
s->cpu_ns = current_cpu - end_init_cpu;
s->elapsed_ns = current_elapsed - end_init_elapsed;
s->mutator_cpu_ns = current_cpu - end_init_cpu - stats.gc_cpu_ns;
s->mutator_cpu_ns = current_cpu - end_init_cpu - stats.gc_cpu_ns -
stats.nonmoving_gc_cpu_ns;
s->mutator_elapsed_ns = current_elapsed - end_init_elapsed -
stats.gc_elapsed_ns;
}
......
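Taken together, the Stats.c changes above treat concurrent-collector CPU time as non-mutator time while leaving mutator elapsed time alone, since the concurrent phase overlaps the mutator rather than pausing it. A condensed restatement of the accounting used in stat_exit and getRTSStats, as a self-contained sketch over the values involved (all times in nanoseconds):
/* Sketch only: the mutator CPU-time accounting this patch uses, written as a
 * pure helper over the quantities that appear above. */
static Time mutatorCpuNs(Time current_cpu, Time end_init_cpu,
                         Time gc_cpu_ns, Time nonmoving_gc_cpu_ns)
{
    /* Both stop-the-world GC CPU time and concurrent-collector CPU time
     * count as non-mutator time.  Elapsed time, by contrast, is not reduced
     * by nonmoving_gc_elapsed_ns. */
    return current_cpu - end_init_cpu - gc_cpu_ns - nonmoving_gc_cpu_ns;
}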
......@@ -30,13 +30,21 @@ void stat_endInit(void);
void stat_startGCSync(struct gc_thread_ *_gct);
void stat_startGC(Capability *cap, struct gc_thread_ *_gct);
void stat_endGC (Capability *cap, struct gc_thread_ *_gct, W_ live,
W_ copied, W_ slop, uint32_t gen, uint32_t n_gc_threads,
void stat_startGCWorker (Capability *cap, struct gc_thread_ *_gct);
void stat_endGCWorker (Capability *cap, struct gc_thread_ *_gct);
void stat_endGC (Capability *cap, struct gc_thread_ *initiating_gct, W_ live,
W_ copied, W_ slop, uint32_t gen,
uint32_t n_gc_threads, struct gc_thread_ **gc_threads,
W_ par_max_copied, W_ par_balanced_copied,
W_ gc_spin_spin, W_ gc_spin_yield, W_ mut_spin_spin,
W_ mut_spin_yield, W_ any_work, W_ no_work,
W_ scav_find_work);
void stat_startNonmovingGcSync(void);
void stat_endNonmovingGcSync(void);
void stat_startNonmovingGc (void);
void stat_endNonmovingGc (void);
#if defined(PROFILING)
void stat_startRP(void);
void stat_endRP(uint32_t,
......
......@@ -330,15 +330,16 @@ threadPaused(Capability *cap, StgTSO *tso)
}
#endif
if (RTS_UNLIKELY(nonmoving_write_barrier_enabled
&& ip_THUNK(INFO_PTR_TO_STRUCT(bh_info)))) {
// We are about to replace a thunk with a blackhole.
// Add the free variables of the closure we are about to
// overwrite to the update remembered set.
// N.B. We caught the WHITEHOLE case above.
updateRemembSetPushThunkEager(cap,
THUNK_INFO_PTR_TO_STRUCT(bh_info),
(StgThunk *) bh);
IF_NONMOVING_WRITE_BARRIER_ENABLED {
if (ip_THUNK(INFO_PTR_TO_STRUCT(bh_info))) {
// We are about to replace a thunk with a blackhole.
// Add the free variables of the closure we are about to
// overwrite to the update remembered set.
// N.B. We caught the WHITEHOLE case above.
updateRemembSetPushThunkEager(cap,
THUNK_INFO_PTR_TO_STRUCT(bh_info),
(StgThunk *) bh);
}
}
// The payload of the BLACKHOLE points to the TSO
......
......@@ -711,7 +711,7 @@ threadStackUnderflow (Capability *cap, StgTSO *tso)
barf("threadStackUnderflow: not enough space for return values");
}
if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
IF_NONMOVING_WRITE_BARRIER_ENABLED {
// ensure that values that we copy into the new stack are marked
// for the nonmoving collector. Note that these values won't
// necessarily form a full closure so we need to handle them
......