Ömer Sinan Ağacan · Ben Gamari · Ben Gamari · Ben Gamari · Ben Gamari · Ben Gamari
--- a/compiler/cmm/CLabel.hs
+++ b/compiler/cmm/CLabel.hs
@@ -40,6 +40,7 @@ module CLabel (
        mkAsmTempDieLabel,

        mkDirty_MUT_VAR_Label,
+        mkNonmovingWriteBarrierEnabledLabel,
        mkUpdInfoLabel,
        mkBHUpdInfoLabel,
        mkIndStaticInfoLabel,
@@ -484,7 +485,9 @@ mkBlockInfoTableLabel name c = IdLabel name c BlockInfoTable
                               -- See Note [Proc-point local block entry-point].

 -- Constructing Cmm Labels
-mkDirty_MUT_VAR_Label, mkUpdInfoLabel,
+mkDirty_MUT_VAR_Label,
+    mkNonmovingWriteBarrierEnabledLabel,
+    mkUpdInfoLabel,
    mkBHUpdInfoLabel, mkIndStaticInfoLabel, mkMainCapabilityLabel,
    mkMAP_FROZEN_CLEAN_infoLabel, mkMAP_FROZEN_DIRTY_infoLabel,
    mkMAP_DIRTY_infoLabel,
@@ -494,6 +497,8 @@ mkDirty_MUT_VAR_Label, mkUpdInfoLabel,
    mkSMAP_FROZEN_CLEAN_infoLabel, mkSMAP_FROZEN_DIRTY_infoLabel,
    mkSMAP_DIRTY_infoLabel, mkBadAlignmentLabel :: CLabel
 mkDirty_MUT_VAR_Label           = mkForeignLabel (fsLit "dirty_MUT_VAR") Nothing ForeignLabelInExternalPackage IsFunction
+mkNonmovingWriteBarrierEnabledLabel
+                                = CmmLabel rtsUnitId (fsLit "nonmoving_write_barrier_enabled") CmmData
 mkUpdInfoLabel                  = CmmLabel rtsUnitId (fsLit "stg_upd_frame")         CmmInfo
 mkBHUpdInfoLabel                = CmmLabel rtsUnitId (fsLit "stg_bh_upd_frame" )     CmmInfo
 mkIndStaticInfoLabel            = CmmLabel rtsUnitId (fsLit "stg_IND_STATIC")        CmmInfo

--- a/compiler/codeGen/StgCmmBind.hs
+++ b/compiler/codeGen/StgCmmBind.hs
@@ -631,6 +631,7 @@ emitBlackHoleCode node = do
             -- work with profiling.

  when eager_blackholing $ do
+    whenUpdRemSetEnabled dflags $ emitUpdRemSetPushThunk node
    emitStore (cmmOffsetW dflags node (fixedHdrSizeW dflags)) currentTSOExpr
    emitPrimCall [] MO_WriteBarrier []
    emitStore node (CmmReg (CmmGlobal EagerBlackholeInfo))

--- a/compiler/codeGen/StgCmmPrim.hs
+++ b/compiler/codeGen/StgCmmPrim.hs
@@ -37,6 +37,7 @@ import BlockId
 import MkGraph
 import StgSyn
 import Cmm
+import Module   ( rtsUnitId )
 import Type     ( Type, tyConAppTyCon )
 import TyCon
 import CLabel
@@ -314,14 +315,21 @@ emitPrimOp dflags [res] ReadMutVarOp [mutv]
   = emitAssign (CmmLocal res) (cmmLoadIndexW dflags mutv (fixedHdrSizeW dflags) (gcWord dflags))

 emitPrimOp dflags res@[] WriteMutVarOp [mutv,var]
-   = do -- Without this write barrier, other CPUs may see this pointer before
+   = do old_val <- CmmLocal <$> newTemp (cmmExprType dflags var)
+        emitAssign old_val (cmmLoadIndexW dflags mutv (fixedHdrSizeW dflags) (gcWord dflags))
+
+        -- Without this write barrier, other CPUs may see this pointer before
        -- the writes for the closure it points to have occurred.
+        -- Note that this also must come after we read the old value to ensure
+        -- that the read of old_val comes before another core's write to the
+        -- MutVar's value.
        emitPrimCall res MO_WriteBarrier []
+
        emitStore (cmmOffsetW dflags mutv (fixedHdrSizeW dflags)) var
        emitCCall
                [{-no results-}]
                (CmmLit (CmmLabel mkDirty_MUT_VAR_Label))
-                [(baseExpr, AddrHint), (mutv,AddrHint)]
+                [(baseExpr, AddrHint), (mutv, AddrHint), (CmmReg old_val, AddrHint)]

 --  #define sizzeofByteArrayzh(r,a) \
 --     r = ((StgArrBytes *)(a))->bytes
@@ -1622,17 +1630,21 @@ doWritePtrArrayOp :: CmmExpr
 doWritePtrArrayOp addr idx val
  = do dflags <- getDynFlags
       let ty = cmmExprType dflags val
+           hdr_size = arrPtrsHdrSize dflags
+       -- Update remembered set for non-moving collector
+       whenUpdRemSetEnabled dflags
+           $ emitUpdRemSetPush (cmmLoadIndexOffExpr dflags hdr_size ty addr ty idx)
       -- This write barrier is to ensure that the heap writes to the object
       -- referred to by val have happened before we write val into the array.
       -- See #12469 for details.
       emitPrimCall [] MO_WriteBarrier []
-       mkBasicIndexedWrite (arrPtrsHdrSize dflags) Nothing addr ty idx val
+       mkBasicIndexedWrite hdr_size Nothing addr ty idx val
       emit (setInfo addr (CmmLit (CmmLabel mkMAP_DIRTY_infoLabel)))
-  -- the write barrier.  We must write a byte into the mark table:
-  -- bits8[a + header_size + StgMutArrPtrs_size(a) + x >> N]
+       -- the write barrier.  We must write a byte into the mark table:
+       -- bits8[a + header_size + StgMutArrPtrs_size(a) + x >> N]
       emit $ mkStore (
         cmmOffsetExpr dflags
-          (cmmOffsetExprW dflags (cmmOffsetB dflags addr (arrPtrsHdrSize dflags))
+          (cmmOffsetExprW dflags (cmmOffsetB dflags addr hdr_size)
                         (loadArrPtrsSize dflags addr))
          (CmmMachOp (mo_wordUShr dflags) [idx,
                                           mkIntExpr dflags (mUT_ARR_PTRS_CARD_BITS dflags)])
@@ -2223,6 +2235,8 @@ emitCopyArray copy src0 src_off dst0 dst_off0 n =
        dst     <- assignTempE dst0
        dst_off <- assignTempE dst_off0

+        emitCopyUpdRemSetPush dflags (arrPtrsHdrSizeW dflags) dst dst_off n
+
        -- Set the dirty bit in the header.
        emit (setInfo dst (CmmLit (CmmLabel mkMAP_DIRTY_infoLabel)))

@@ -2285,6 +2299,8 @@ emitCopySmallArray copy src0 src_off dst0 dst_off n =
        src     <- assignTempE src0
        dst     <- assignTempE dst0

+        emitCopyUpdRemSetPush dflags (smallArrPtrsHdrSizeW dflags) dst dst_off n
+
        -- Set the dirty bit in the header.
        emit (setInfo dst (CmmLit (CmmLabel mkSMAP_DIRTY_infoLabel)))

@@ -2413,6 +2429,12 @@ doWriteSmallPtrArrayOp :: CmmExpr
 doWriteSmallPtrArrayOp addr idx val = do
    dflags <- getDynFlags
    let ty = cmmExprType dflags val
+
+    -- Update remembered set for non-moving collector
+    tmp <- newTemp ty
+    mkBasicIndexedRead (smallArrPtrsHdrSize dflags) Nothing ty tmp addr ty idx
+    whenUpdRemSetEnabled dflags $ emitUpdRemSetPush (CmmReg (CmmLocal tmp))
+
    emitPrimCall [] MO_WriteBarrier [] -- #12469
    mkBasicIndexedWrite (smallArrPtrsHdrSize dflags) Nothing addr ty idx val
    emit (setInfo addr (CmmLit (CmmLabel mkSMAP_DIRTY_infoLabel)))
@@ -2592,3 +2614,31 @@ emitCtzCall res x width = do
        [ res ]
        (MO_Ctz width)
        [ x ]
+
+---------------------------------------------------------------------------
+-- Pushing to the update remembered set
+---------------------------------------------------------------------------
+
+-- | Emit a guarded call to @stg_copyArray_barrier@ for the destination range of
+-- a pointer-array copy, so that the elements about to be overwritten are pushed
+-- to the update remembered set. Nothing is emitted for a zero-length copy.
+emitCopyUpdRemSetPush :: DynFlags
+                      -> WordOff    -- ^ array header size
+                      -> CmmExpr    -- ^ destination array
+                      -> CmmExpr    -- ^ offset in destination array (in words)
+                      -> Int        -- ^ number of elements to copy
+                      -> FCode ()
+emitCopyUpdRemSetPush dflags hdr_size dst dst_off n
+  | n == 0    = return ()
+  | otherwise = whenUpdRemSetEnabled dflags $ do
+        frame_off <- getUpdFrameOff
+        emit =<< mkCall barrier (NativeNodeCall,NativeReturn) [] actuals frame_off []
+  where
+    barrier = mkLblExpr $ mkPrimCallLabel
+            $ PrimCall (fsLit "stg_copyArray_barrier") rtsUnitId
+    actuals =
+      [ mkIntExpr dflags hdr_size
+      , dst
+      , dst_off
+      , mkIntExpr dflags n
+      ]
--- a/compiler/codeGen/StgCmmUtils.hs
+++ b/compiler/codeGen/StgCmmUtils.hs
@@ -39,6 +39,11 @@ module StgCmmUtils (
        mkWordCLit,
        newStringCLit, newByteStringCLit,
        blankWord,
+
+        -- * Update remembered set operations
+        whenUpdRemSetEnabled,
+        emitUpdRemSetPush,
+        emitUpdRemSetPushThunk,
  ) where

 #include "HsVersions.h"
@@ -576,3 +581,40 @@ assignTemp' e
       let reg = CmmLocal lreg
       emitAssign reg e
       return (CmmReg reg)
+
+
+---------------------------------------------------------------------------
+-- Pushing to the update remembered set
+---------------------------------------------------------------------------
+
+-- | Emit @code@ guarded by a run-time test of the nonmoving write barrier flag.
+whenUpdRemSetEnabled :: DynFlags -> FCode a -> FCode ()
+whenUpdRemSetEnabled dflags code = do
+    guarded <- getCode code
+    emit =<< mkCmmIfThenElse' barrier_on guarded mkNop (Just False)
+  where
+    flag_addr  = CmmLit (CmmLabel mkNonmovingWriteBarrierEnabledLabel)
+    barrier_on = cmmNeWord dflags (CmmLoad flag_addr (bWord dflags)) (zeroExpr dflags)
+    -- i.e. the word stored at the flag's label is non-zero
+
+-- | Emit a call to the RTS function @updateRemembSetPushClosure_@ so that a
+-- pointer which is being overwritten is added to the update remembered set.
+emitUpdRemSetPush :: CmmExpr   -- ^ value of pointer which was overwritten
+                  -> FCode ()
+emitUpdRemSetPush ptr =
+    emitRtsCall rtsUnitId (fsLit "updateRemembSetPushClosure_") args False
+  where
+    args =
+      [ (CmmReg (CmmGlobal BaseReg), AddrHint)
+      , (ptr, AddrHint)
+      ]
+
+-- | Emit a call to the RTS function @updateRemembSetPushThunk_@ for a thunk.
+emitUpdRemSetPushThunk :: CmmExpr -- ^ the thunk
+                       -> FCode ()
+emitUpdRemSetPushThunk ptr =
+    emitRtsCall
+      rtsUnitId
+      (fsLit "updateRemembSetPushThunk_")
+      [(CmmReg (CmmGlobal BaseReg), AddrHint), (ptr, AddrHint)]
+      False
--- a/includes/Cmm.h
+++ b/includes/Cmm.h
@@ -832,6 +832,10 @@
      __gen = TO_W_(bdescr_gen_no(__bd));                       \
      if (__gen > 0) { recordMutableCap(__p, __gen); }

+/* -----------------------------------------------------------------------------
+   Update remembered set write barrier
+   -------------------------------------------------------------------------- */
+
 /* -----------------------------------------------------------------------------
   Arrays
   -------------------------------------------------------------------------- */
@@ -934,3 +938,25 @@
    prim %memcpy(dst_p, src_p, n * SIZEOF_W, SIZEOF_W);        \
                                                               \
    return (dst);
+
+
+//
+// Nonmoving write barrier helpers
+//
+// See Note [Update remembered set] in NonMovingMark.c.
+
+#if defined(THREADED_RTS)
+#define IF_NONMOVING_WRITE_BARRIER_ENABLED                     \
+    if (W_[nonmoving_write_barrier_enabled] != 0) (likely: False)
+#else
+// A similar measure is also taken in rts/NonMoving.h, but that isn't visible from C--
+#define IF_NONMOVING_WRITE_BARRIER_ENABLED                     \
+    if (0)
+#define nonmoving_write_barrier_enabled 0
+#endif
+
+// Pushes p to the update remembered set; a no-op if the barrier is disabled.
+#define updateRemembSetPushPtr(p)                                    \
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {                             \
+      ccall updateRemembSetPushClosure_(BaseReg "ptr", p "ptr");     \
+    }
--- a/includes/Rts.h
+++ b/includes/Rts.h
@@ -74,6 +74,10 @@ extern "C" {
 #define RTS_UNREACHABLE abort()
 #endif

+/* Prefetch primitives */
+#define prefetchForRead(ptr) __builtin_prefetch(ptr, 0)
+#define prefetchForWrite(ptr) __builtin_prefetch(ptr, 1)
+
 /* Fix for mingw stat problem (done here so it's early enough) */
 #if defined(mingw32_HOST_OS)
 #define __MSVCRT__ 1
@@ -189,6 +193,7 @@ void _assertFail(const char *filename, unsigned int linenum)
 #include "rts/storage/ClosureMacros.h"
 #include "rts/storage/MBlock.h"
 #include "rts/storage/GC.h"
+#include "rts/NonMoving.h"

 /* Other RTS external APIs */
 #include "rts/Parallel.h"

--- a/includes/RtsAPI.h
+++ b/includes/RtsAPI.h
@@ -151,6 +151,23 @@ typedef struct GCDetails_ {
  Time cpu_ns;
    // The time elapsed during GC itself
  Time elapsed_ns;
+
+    //
+    // Concurrent garbage collector
+    //
+
+    // The CPU time used during the post-mark pause phase of the concurrent
+    // nonmoving GC.
+  Time nonmoving_gc_sync_cpu_ns;
+    // The time elapsed during the post-mark pause phase of the concurrent
+    // nonmoving GC.
+  Time nonmoving_gc_sync_elapsed_ns;
+    // The CPU time used by the concurrent nonmoving GC as a whole (not only
+    // its post-mark pause phase).
+  Time nonmoving_gc_cpu_ns;
+    // The time elapsed while the concurrent nonmoving GC was running (not only
+    // its post-mark pause phase).
+  Time nonmoving_gc_elapsed_ns;
 } GCDetails;

 //
@@ -241,6 +258,28 @@ typedef struct _RTSStats {
    // The number of times a GC thread has iterated it's outer loop across all
    // parallel GCs
  uint64_t scav_find_work;
+
+  // ----------------------------------
+  // Concurrent garbage collector
+
+    // The CPU time used during the post-mark pause phase of the concurrent
+    // nonmoving GC.
+  Time nonmoving_gc_sync_cpu_ns;
+    // The time elapsed during the post-mark pause phase of the concurrent
+    // nonmoving GC.
+  Time nonmoving_gc_sync_elapsed_ns;
+    // The maximum time elapsed during the post-mark pause phase of the
+    // concurrent nonmoving GC.
+  Time nonmoving_gc_sync_max_elapsed_ns;
+    // The total CPU time used by the concurrent nonmoving GC (not only its
+    // post-mark pause phase).
+  Time nonmoving_gc_cpu_ns;
+    // The total time elapsed while the concurrent nonmoving GC was running (not
+    // only its post-mark pause phase).
+  Time nonmoving_gc_elapsed_ns;
+    // The maximum time elapsed during any single cycle of the concurrent
+    // nonmoving GC (not only its post-mark pause phase).
+  Time nonmoving_gc_max_elapsed_ns;
 } RTSStats;

 void getRTSStats (RTSStats *s);

--- a/includes/rts/EventLogFormat.h
+++ b/includes/rts/EventLogFormat.h
@@ -182,12 +182,21 @@

 #define EVENT_USER_BINARY_MSG              181

+#define EVENT_CONC_MARK_BEGIN              200
+#define EVENT_CONC_MARK_END                201
+#define EVENT_CONC_SYNC_BEGIN              202
+#define EVENT_CONC_SYNC_END                203
+#define EVENT_CONC_SWEEP_BEGIN             204
+#define EVENT_CONC_SWEEP_END               205
+#define EVENT_CONC_UPD_REM_SET_FLUSH       206
+#define EVENT_NONMOVING_HEAP_CENSUS        207
+
 /*
 * The highest event code +1 that ghc itself emits. Note that some event
 * ranges higher than this are reserved but not currently emitted by ghc.
 * This must match the size of the EventDesc[] array in EventLog.c
 */
-#define NUM_GHC_EVENT_TAGS        182
+#define NUM_GHC_EVENT_TAGS        208

 #if 0  /* DEPRECATED EVENTS: */
 /* we don't actually need to record the thread, it's implicit */

--- a/includes/rts/Flags.h
+++ b/includes/rts/Flags.h
@@ -169,6 +169,7 @@ typedef struct _TRACE_FLAGS {
    bool timestamp;      /* show timestamp in stderr output */
    bool scheduler;      /* trace scheduler events */
    bool gc;             /* trace GC events */
+    bool nonmoving_gc;   /* trace nonmoving GC events */
    bool sparks_sampled; /* trace spark events by a sampled method */
    bool sparks_full;    /* trace spark events 100% accurately */
    bool user;           /* trace user events (emitted from Haskell code) */

--- a/includes/rts/NonMoving.h
+++ b/includes/rts/NonMoving.h
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2018-2019
+ *
+ * Non-moving garbage collector
+ *
+ * Do not #include this file directly: #include "Rts.h" instead.
+ *
+ * To understand the structure of the RTS headers, see the wiki:
+ *   http://ghc.haskell.org/trac/ghc/wiki/Commentary/SourceTree/Includes
+ *
+ * -------------------------------------------------------------------------- */
+
+#pragma once
+
+/* This is called by the code generator */
+extern DLL_IMPORT_RTS
+void updateRemembSetPushClosure_(StgRegTable *reg, StgClosure *p);
+
+void updateRemembSetPushClosure(Capability *cap, StgClosure *p);
+
+void updateRemembSetPushThunk_(StgRegTable *reg, StgThunk *p);
+
+// Note that RTS code should not condition on this directly but rather
+// use the IF_NONMOVING_WRITE_BARRIER_ENABLED macro to ensure that
+// the barrier is eliminated in the non-threaded RTS.
+extern StgWord DLL_IMPORT_DATA_VAR(nonmoving_write_barrier_enabled);
--- a/includes/rts/storage/Block.h
+++ b/includes/rts/storage/Block.h
@@ -97,6 +97,8 @@ typedef struct bdescr_ {
                               // block allocator.  In particular, the
                               // value (StgPtr)(-1) is used to
                               // indicate that a block is unallocated.
+                               //
+                               // Unused by the non-moving allocator.

    struct bdescr_ *link;      // used for chaining blocks together

@@ -141,7 +143,8 @@ typedef struct bdescr_ {
 #define BF_LARGE     2
 /* Block is pinned */
 #define BF_PINNED    4
-/* Block is to be marked, not copied */
+/* Block is to be marked, not copied. Also used for marked large objects in
+ * non-moving heap. */
 #define BF_MARKED    8
 /* Block is executable */
 #define BF_EXEC      32
@@ -153,6 +156,12 @@ typedef struct bdescr_ {
 #define BF_SWEPT     256
 /* Block is part of a Compact */
 #define BF_COMPACT   512
+/* A non-moving allocator segment (see NonMoving.c) */
+#define BF_NONMOVING 1024
+/* A large object which has been moved off of oldest_gen->large_objects and
+ * onto nonmoving_large_objects. The mark phase ignores objects which aren't
+ * so-flagged */
+#define BF_NONMOVING_SWEEPING 2048
 /* Maximum flag value (do not define anything higher than this!) */
 #define BF_FLAG_MAX  (1 << 15)


--- a/includes/rts/storage/ClosureMacros.h
+++ b/includes/rts/storage/ClosureMacros.h
@@ -107,6 +107,14 @@ INLINE_HEADER const StgConInfoTable *get_con_itbl(const StgClosure *c)
   return CON_INFO_PTR_TO_STRUCT((c)->header.info);
 }

+/* Get a closure's info table using a volatile load of its info pointer, for
+ * when another thread may be mutating it (e.g. busy-waiting on a WHITEHOLE). */
+INLINE_HEADER const StgInfoTable *get_volatile_itbl(StgClosure *c) {
+    StgInfoTable *info_ptr = (StgInfoTable*) VOLATILE_LOAD(&c->header.info);
+    return INFO_PTR_TO_STRUCT(info_ptr);
+}
+
+
 INLINE_HEADER StgHalfWord GET_TAG(const StgClosure *con)
 {
    return get_itbl(con)->srt;

--- a/includes/rts/storage/GC.h
+++ b/includes/rts/storage/GC.h
@@ -234,7 +234,7 @@ void setKeepCAFs (void);
   and is put on the mutable list.
   -------------------------------------------------------------------------- */

-void dirty_MUT_VAR(StgRegTable *reg, StgClosure *p);
+void dirty_MUT_VAR(StgRegTable *reg, StgMutVar *mv, StgClosure *old);

 /* set to disable CAF garbage collection in GHCi. */
 /* (needed when dynamic libraries are used). */

--- a/includes/rts/storage/TSO.h
+++ b/includes/rts/storage/TSO.h
@@ -185,6 +185,53 @@ typedef struct StgTSO_ {

 } *StgTSOPtr; // StgTSO defined in rts/Types.h

+/* Note [StgStack dirtiness flags and concurrent marking]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * Without concurrent collection by the nonmoving collector the stack dirtiness story
+ * is quite simple: The stack is either STACK_DIRTY (meaning it has been added to mut_list)
+ * or not.
+ *
+ * However, things are considerably more complicated with concurrent collection
+ * (namely, when nonmoving_write_barrier_enabled is set): In addition to adding
+ * the stack to mut_list and flagging it as STACK_DIRTY, we also must ensure
+ * that stacks are marked in accordance with the nonmoving collector's snapshot
+ * invariant. This is: every stack alive at the time the snapshot is taken must
+ * be marked at some point after the moment the snapshot is taken and before it
+ * is mutated or the commencement of the sweep phase.
+ *
+ * This marking may be done by the concurrent mark phase (in the case of a
+ * thread that never runs during the concurrent mark) or by the mutator when
+ * dirtying the stack. However, it is unsafe for the concurrent collector to
+ * traverse the stack while it is under mutation. Consequently, the following
+ * handshake is obeyed by the mutator's write barrier and the concurrent mark to
+ * ensure this doesn't happen:
+ *
+ * 1. The entity seeking to mark first checks that the stack lives in the nonmoving
+ *    generation; if not then the stack was not alive at the time the snapshot
+ *    was taken and therefore we need not mark it.
+ *
+ * 2. The entity seeking to mark checks the stack's mark bit. If it is set then
+ *    no mark is necessary.
+ *
+ * 3. The entity seeking to mark tries to lock the stack for marking by
+ *    atomically setting its `marking` field to the current non-moving mark
+ *    epoch:
+ *
+ *    a. If the mutator finds the concurrent collector has already locked the
+ *       stack then it waits until it is finished (indicated by the mark bit
+ *       being set) before proceeding with execution.
+ *
+ *    b. If the concurrent collector finds that the mutator has locked the stack
+ *       then it moves on, leaving the mutator to mark it. There is no need to wait;
+ *       the mark is guaranteed to finish before sweep due to the post-mark
+ *       synchronization with mutators.
+ *
+ *    c. Whoever succeeds in locking the stack is responsible for marking it and
+ *       setting the stack's mark bit (either the BF_MARKED bit for large objects
+ *       or otherwise its bit in its segment's mark bitmap).
+ *
+ */

 #define STACK_DIRTY 1
 // used by sanity checker to verify that all dirty stacks are on the mutable list
@@ -193,7 +240,8 @@ typedef struct StgTSO_ {
 typedef struct StgStack_ {
    StgHeader  header;
    StgWord32  stack_size;     // stack size in *words*
-    StgWord32  dirty;          // non-zero => dirty
+    StgWord    dirty;          // non-zero => dirty
+    StgWord    marking;        // non-zero => someone is currently marking the stack
    StgPtr     sp;             // current stack pointer
    StgWord    stack[];
 } StgStack;

--- a/includes/stg/MiscClosures.h
+++ b/includes/stg/MiscClosures.h
@@ -542,5 +542,6 @@ void * pushCostCentre (void *ccs, void *cc);

 // Capability.c
 extern unsigned int n_capabilities;
+extern void updateRemembSetPushThunk_(void *reg, void *p1);

 #endif
--- a/libraries/base/GHC/RTS/Flags.hsc
+++ b/libraries/base/GHC/RTS/Flags.hsc
@@ -292,6 +292,8 @@ data TraceFlags = TraceFlags
    , timestamp      :: Bool -- ^ show timestamp in stderr output
    , traceScheduler :: Bool -- ^ trace scheduler events
    , traceGc        :: Bool -- ^ trace GC events
+    , traceNonmovingGc
+                     :: Bool -- ^ trace nonmoving GC heap census samples
    , sparksSampled  :: Bool -- ^ trace spark events by a sampled method
    , sparksFull     :: Bool -- ^ trace spark events 100% accurately
    , user           :: Bool -- ^ trace user events (emitted from Haskell code)
@@ -525,6 +527,8 @@ getTraceFlags = do
                   (#{peek TRACE_FLAGS, scheduler} ptr :: IO CBool))
             <*> (toBool <$>
                   (#{peek TRACE_FLAGS, gc} ptr :: IO CBool))
+             <*> (toBool <$>
+                   (#{peek TRACE_FLAGS, nonmoving_gc} ptr :: IO CBool))
             <*> (toBool <$>
                   (#{peek TRACE_FLAGS, sparks_sampled} ptr :: IO CBool))
             <*> (toBool <$>

--- a/libraries/base/GHC/Stats.hsc
+++ b/libraries/base/GHC/Stats.hsc
@@ -103,6 +103,25 @@ data RTSStats = RTSStats {
    -- | Total elapsed time (at the previous GC)
  , elapsed_ns :: RtsTime

+    -- | The CPU time used during the post-mark pause phase of the concurrent
+    -- nonmoving GC.
+  , nonmoving_gc_sync_cpu_ns :: RtsTime
+    -- | The time elapsed during the post-mark pause phase of the concurrent
+    -- nonmoving GC.
+  , nonmoving_gc_sync_elapsed_ns :: RtsTime
+    -- | The maximum time elapsed during the post-mark pause phase of the
+    -- concurrent nonmoving GC.
+  , nonmoving_gc_sync_max_elapsed_ns :: RtsTime
+    -- | The total CPU time used by the concurrent nonmoving GC (not only its
+    -- post-mark pause phase).
+  , nonmoving_gc_cpu_ns :: RtsTime
+    -- | The total time elapsed while the concurrent nonmoving GC was running
+    -- (not only its post-mark pause phase).
+  , nonmoving_gc_elapsed_ns :: RtsTime
+    -- | The maximum time elapsed during any single cycle of the concurrent
+    -- nonmoving GC (not only its post-mark pause phase).
+  , nonmoving_gc_max_elapsed_ns :: RtsTime
+
    -- | Details about the most recent GC
  , gc :: GCDetails
  } deriving ( Read -- ^ @since 4.10.0.0
@@ -146,6 +165,13 @@ data GCDetails = GCDetails {
  , gcdetails_cpu_ns :: RtsTime
    -- | The time elapsed during GC itself
  , gcdetails_elapsed_ns :: RtsTime
+
+    -- | The CPU time used during the post-mark pause phase of the concurrent
+    -- nonmoving GC.
+  , gcdetails_nonmoving_gc_sync_cpu_ns :: RtsTime
+    -- | The time elapsed during the post-mark pause phase of the concurrent
+    -- nonmoving GC.
+  , gcdetails_nonmoving_gc_sync_elapsed_ns :: RtsTime
  } deriving ( Read -- ^ @since 4.10.0.0
             , Show -- ^ @since 4.10.0.0
             )
@@ -192,6 +218,12 @@ getRTSStats = do
    gc_elapsed_ns <- (# peek RTSStats, gc_elapsed_ns) p
    cpu_ns <- (# peek RTSStats, cpu_ns) p
    elapsed_ns <- (# peek RTSStats, elapsed_ns) p
+    nonmoving_gc_sync_cpu_ns <- (# peek RTSStats, nonmoving_gc_sync_cpu_ns) p
+    nonmoving_gc_sync_elapsed_ns <- (# peek RTSStats, nonmoving_gc_sync_elapsed_ns) p
+    nonmoving_gc_sync_max_elapsed_ns <- (# peek RTSStats, nonmoving_gc_sync_max_elapsed_ns) p
+    nonmoving_gc_cpu_ns <- (# peek RTSStats, nonmoving_gc_cpu_ns) p
+    nonmoving_gc_elapsed_ns <- (# peek RTSStats, nonmoving_gc_elapsed_ns) p
+    nonmoving_gc_max_elapsed_ns <- (# peek RTSStats, nonmoving_gc_max_elapsed_ns) p
    let pgc = (# ptr RTSStats, gc) p
    gc <- do
      gcdetails_gen <- (# peek GCDetails, gen) pgc
@@ -211,5 +243,7 @@ getRTSStats = do
      gcdetails_sync_elapsed_ns <- (# peek GCDetails, sync_elapsed_ns) pgc
      gcdetails_cpu_ns <- (# peek GCDetails, cpu_ns) pgc
      gcdetails_elapsed_ns <- (# peek GCDetails, elapsed_ns) pgc
+      gcdetails_nonmoving_gc_sync_cpu_ns <- (# peek GCDetails, nonmoving_gc_sync_cpu_ns) pgc
+      gcdetails_nonmoving_gc_sync_elapsed_ns <- (# peek GCDetails, nonmoving_gc_sync_elapsed_ns) pgc
      return GCDetails{..}
    return RTSStats{..}
--- a/libraries/ghc-heap/tests/all.T
+++ b/libraries/ghc-heap/tests/all.T
@@ -2,7 +2,11 @@ test('heap_all',
     [when(have_profiling(), extra_ways(['prof'])),
      # These ways produce slightly different heap representations.
      # Currently we don't test them.
-      omit_ways(['ghci', 'hpc'])
+      omit_ways(['ghci', 'hpc',
+                 'nonmoving', 'nonmoving_thr', 'nonmoving_thr_ghc']),
+      # The debug RTS initializes some fields with 0xaa and so
+      # this test spuriously fails.
+      when(compiler_debugged(), skip)
     ],
     compile_and_run, [''])


--- a/rts/Apply.cmm
+++ b/rts/Apply.cmm
@@ -652,6 +652,8 @@ INFO_TABLE(stg_AP_STACK,/*special layout*/0,0,AP_STACK,"AP_STACK","AP_STACK")
    /* someone else beat us to it */
    jump ENTRY_LBL(stg_WHITEHOLE) (ap);
  }
+  // Can't add StgInd_indirectee(ap) to UpdRemSet here because the old value is
+  // not reachable.
  StgInd_indirectee(ap) = CurrentTSO;
  prim_write_barrier;
  SET_INFO(ap, __stg_EAGER_BLACKHOLE_info);

--- a/rts/Capability.c
+++ b/rts/Capability.c
@@ -27,6 +27,7 @@
 #include "STM.h"
 #include "RtsUtils.h"
 #include "sm/OSMem.h"
+#include "sm/BlockAlloc.h" // for countBlocks()

 #if !defined(mingw32_HOST_OS)
 #include "rts/IOManager.h" // for setIOManagerControlFd()
@@ -291,6 +292,11 @@ initCapability (Capability *cap, uint32_t i)
                                          RtsFlags.GcFlags.generations,
                                          "initCapability");

+
+    // At this point storage manager is not initialized yet, so this will be
+    // initialized in initStorage().
+    cap->upd_rem_set.queue.blocks = NULL;
+
    for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
        cap->mut_lists[g] = NULL;
    }
@@ -860,16 +866,27 @@ yieldCapability (Capability** pCap, Task *task, bool gcAllowed)
    {
        PendingSync *sync = pending_sync;

-        if (sync && sync->type == SYNC_GC_PAR) {
-            if (! sync->idle[cap->no]) {
-                traceEventGcStart(cap);
-                gcWorkerThread(cap);
-                traceEventGcEnd(cap);
-                traceSparkCounters(cap);
-                // See Note [migrated bound threads 2]
-                if (task->cap == cap) {
-                    return true;
+        if (sync) {
+            switch (sync->type) {
+            case SYNC_GC_PAR:
+                if (! sync->idle[cap->no]) {
+                    traceEventGcStart(cap);
+                    gcWorkerThread(cap);
+                    traceEventGcEnd(cap);
+                    traceSparkCounters(cap);
+                    // See Note [migrated bound threads 2]
+                    if (task->cap == cap) {
+                        return true;
+                    }
                }
+                break;
+
+            case SYNC_FLUSH_UPD_REM_SET:
+                debugTrace(DEBUG_nonmoving_gc, "Flushing update remembered set blocks...");
+                break;
+
+            default:
+                break;
            }
        }
    }