diff --git a/compiler/GHC/StgToCmm/Bind.hs b/compiler/GHC/StgToCmm/Bind.hs
index 44316cacb0919a4d3b897c85577013859b282aa2..9e192a0ac8a86631fd25b6dcb540fcb9e07127e4 100644
--- a/compiler/GHC/StgToCmm/Bind.hs
+++ b/compiler/GHC/StgToCmm/Bind.hs
@@ -631,6 +631,7 @@ emitBlackHoleCode node = do
              -- work with profiling.
 
   when eager_blackholing $ do
+    whenUpdRemSetEnabled dflags $ emitUpdRemSetPushThunk node
     emitStore (cmmOffsetW dflags node (fixedHdrSizeW dflags)) currentTSOExpr
     -- See Note [Heap memory barriers] in SMP.h.
     emitPrimCall [] MO_WriteBarrier []
diff --git a/compiler/GHC/StgToCmm/Prim.hs b/compiler/GHC/StgToCmm/Prim.hs
index cdbc8d9fd9a4052d7415061f6961d4ef2ee4de32..155cdcbf80a988ff6c98785ddb21557d80b8bb91 100644
--- a/compiler/GHC/StgToCmm/Prim.hs
+++ b/compiler/GHC/StgToCmm/Prim.hs
@@ -42,6 +42,7 @@ import BlockId
 import MkGraph
 import StgSyn
 import Cmm
+import Module   ( rtsUnitId )
 import Type     ( Type, tyConAppTyCon )
 import TyCon
 import CLabel
@@ -339,14 +340,20 @@ dispatchPrimop dflags = \case
     emitAssign (CmmLocal res) (cmmLoadIndexW dflags mutv (fixedHdrSizeW dflags) (gcWord dflags))
 
   WriteMutVarOp -> \[mutv, var] -> OpDest_AllDone $ \res@[] -> do
+    old_val <- CmmLocal <$> newTemp (cmmExprType dflags var)
+    emitAssign old_val (cmmLoadIndexW dflags mutv (fixedHdrSizeW dflags) (gcWord dflags))
+
     -- Without this write barrier, other CPUs may see this pointer before
     -- the writes for the closure it points to have occurred.
+    -- Note that this barrier must also come after we read the old value, to
+    -- ensure that the read of old_val happens before another core's write to
+    -- the MutVar's value.
     emitPrimCall res MO_WriteBarrier []
     emitStore (cmmOffsetW dflags mutv (fixedHdrSizeW dflags)) var
     emitCCall
             [{-no results-}]
             (CmmLit (CmmLabel mkDirty_MUT_VAR_Label))
-            [(baseExpr, AddrHint), (mutv,AddrHint)]
+            [(baseExpr, AddrHint), (mutv, AddrHint), (CmmReg old_val, AddrHint)]
 
 --  #define sizzeofByteArrayzh(r,a) \
 --     r = ((StgArrBytes *)(a))->bytes
@@ -1983,17 +1990,21 @@ doWritePtrArrayOp :: CmmExpr
 doWritePtrArrayOp addr idx val
   = do dflags <- getDynFlags
        let ty = cmmExprType dflags val
+           hdr_size = arrPtrsHdrSize dflags
+       -- Update remembered set for non-moving collector
+       whenUpdRemSetEnabled dflags
+           $ emitUpdRemSetPush (cmmLoadIndexOffExpr dflags hdr_size ty addr ty idx)
        -- This write barrier is to ensure that the heap writes to the object
        -- referred to by val have happened before we write val into the array.
        -- See #12469 for details.
        emitPrimCall [] MO_WriteBarrier []
-       mkBasicIndexedWrite (arrPtrsHdrSize dflags) Nothing addr ty idx val
+       mkBasicIndexedWrite hdr_size Nothing addr ty idx val
        emit (setInfo addr (CmmLit (CmmLabel mkMAP_DIRTY_infoLabel)))
-  -- the write barrier.  We must write a byte into the mark table:
-  -- bits8[a + header_size + StgMutArrPtrs_size(a) + x >> N]
+       -- the write barrier.  We must write a byte into the mark table:
+       -- bits8[a + header_size + StgMutArrPtrs_size(a) + x >> N]
        emit $ mkStore (
          cmmOffsetExpr dflags
-          (cmmOffsetExprW dflags (cmmOffsetB dflags addr (arrPtrsHdrSize dflags))
+          (cmmOffsetExprW dflags (cmmOffsetB dflags addr hdr_size)
                          (loadArrPtrsSize dflags addr))
           (CmmMachOp (mo_wordUShr dflags) [idx,
                                            mkIntExpr dflags (mUT_ARR_PTRS_CARD_BITS dflags)])
@@ -2584,6 +2595,9 @@ emitCopyArray copy src0 src_off dst0 dst_off0 n =
         dst     <- assignTempE dst0
         dst_off <- assignTempE dst_off0
 
+        -- Nonmoving collector write barrier
+        emitCopyUpdRemSetPush dflags (arrPtrsHdrSizeW dflags) dst dst_off n
+
         -- Set the dirty bit in the header.
         emit (setInfo dst (CmmLit (CmmLabel mkMAP_DIRTY_infoLabel)))
 
@@ -2646,6 +2660,9 @@ emitCopySmallArray copy src0 src_off dst0 dst_off n =
         src     <- assignTempE src0
         dst     <- assignTempE dst0
 
+        -- Nonmoving collector write barrier
+        emitCopyUpdRemSetPush dflags (smallArrPtrsHdrSizeW dflags) dst dst_off n
+
         -- Set the dirty bit in the header.
         emit (setInfo dst (CmmLit (CmmLabel mkSMAP_DIRTY_infoLabel)))
 
@@ -2774,6 +2791,12 @@ doWriteSmallPtrArrayOp :: CmmExpr
 doWriteSmallPtrArrayOp addr idx val = do
     dflags <- getDynFlags
     let ty = cmmExprType dflags val
+
+    -- Update remembered set for non-moving collector
+    tmp <- newTemp ty
+    mkBasicIndexedRead (smallArrPtrsHdrSize dflags) Nothing ty tmp addr ty idx
+    whenUpdRemSetEnabled dflags $ emitUpdRemSetPush (CmmReg (CmmLocal tmp))
+
     emitPrimCall [] MO_WriteBarrier [] -- #12469
     mkBasicIndexedWrite (smallArrPtrsHdrSize dflags) Nothing addr ty idx val
     emit (setInfo addr (CmmLit (CmmLabel mkSMAP_DIRTY_infoLabel)))
@@ -2953,3 +2976,31 @@ emitCtzCall res x width = do
         [ res ]
         (MO_Ctz width)
         [ x ]
+
+---------------------------------------------------------------------------
+-- Pushing to the update remembered set
+---------------------------------------------------------------------------
+
+-- | Push to the update remembered set a range of pointer-array elements that
+-- are about to be overwritten by a copy.
+emitCopyUpdRemSetPush :: DynFlags
+                      -> WordOff    -- ^ array header size
+                      -> CmmExpr    -- ^ destination array
+                      -> CmmExpr    -- ^ offset in destination array (in words)
+                      -> Int        -- ^ number of elements to copy
+                      -> FCode ()
+emitCopyUpdRemSetPush _dflags _hdr_size _dst _dst_off 0 = return ()
+emitCopyUpdRemSetPush dflags hdr_size dst dst_off n =
+    whenUpdRemSetEnabled dflags $ do
+        updfr_off <- getUpdFrameOff
+        graph <- mkCall lbl (NativeNodeCall,NativeReturn) [] args updfr_off []
+        emit graph
+  where
+    lbl = mkLblExpr $ mkPrimCallLabel
+          $ PrimCall (fsLit "stg_copyArray_barrier") rtsUnitId
+    args =
+      [ mkIntExpr dflags hdr_size
+      , dst
+      , dst_off
+      , mkIntExpr dflags n
+      ]
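
For illustration only (not part of this patch): the ordering that the WriteMutVarOp code above arranges can be sketched in C against the RTS API as extended by this patch. The function name write_mut_var_sketch is hypothetical, and this is a sketch of the intent rather than the Cmm that is actually emitted.

#include "Rts.h"

/* Sketch: mirror of the order of operations for writeMutVar#: read the old
 * value, fence, publish the new value, then let dirty_MUT_VAR (which now
 * receives the old value) push it to the update remembered set if needed. */
static void write_mut_var_sketch(StgRegTable *reg, StgMutVar *mv, StgClosure *new_val)
{
    StgClosure *old_val = mv->var;   // must be read before anyone else can overwrite it
    write_barrier();                 // make new_val's fields visible before publishing
    mv->var = new_val;
    dirty_MUT_VAR(reg, mv, old_val); // the old value travels with the dirtying call
}
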
diff --git a/compiler/GHC/StgToCmm/Utils.hs b/compiler/GHC/StgToCmm/Utils.hs
index 30e37bb930b4f1a1603db74bf25c8d04c9a1f2b9..0b3a8d8b080f3e4cfd8523aa893ea8bc774ec6df 100644
--- a/compiler/GHC/StgToCmm/Utils.hs
+++ b/compiler/GHC/StgToCmm/Utils.hs
@@ -39,6 +39,11 @@ module GHC.StgToCmm.Utils (
         mkWordCLit,
         newStringCLit, newByteStringCLit,
         blankWord,
+
+        -- * Update remembered set operations
+        whenUpdRemSetEnabled,
+        emitUpdRemSetPush,
+        emitUpdRemSetPushThunk,
   ) where
 
 #include "HsVersions.h"
@@ -576,3 +581,40 @@ assignTemp' e
        let reg = CmmLocal lreg
        emitAssign reg e
        return (CmmReg reg)
+
+
+---------------------------------------------------------------------------
+-- Pushing to the update remembered set
+---------------------------------------------------------------------------
+
+whenUpdRemSetEnabled :: DynFlags -> FCode a -> FCode ()
+whenUpdRemSetEnabled dflags code = do
+    do_it <- getCode code
+    the_if <- mkCmmIfThenElse' is_enabled do_it mkNop (Just False)
+    emit the_if
+  where
+    enabled = CmmLoad (CmmLit $ CmmLabel mkNonmovingWriteBarrierEnabledLabel) (bWord dflags)
+    zero = zeroExpr dflags
+    is_enabled = cmmNeWord dflags enabled zero
+
+-- | Emit code to add an entry for a now-overwritten pointer to the update
+-- remembered set.
+emitUpdRemSetPush :: CmmExpr   -- ^ value of pointer which was overwritten
+                  -> FCode ()
+emitUpdRemSetPush ptr = do
+    emitRtsCall
+      rtsUnitId
+      (fsLit "updateRemembSetPushClosure_")
+      [(CmmReg (CmmGlobal BaseReg), AddrHint),
+       (ptr, AddrHint)]
+      False
+
+emitUpdRemSetPushThunk :: CmmExpr -- ^ the thunk
+                       -> FCode ()
+emitUpdRemSetPushThunk ptr = do
+    emitRtsCall
+      rtsUnitId
+      (fsLit "updateRemembSetPushThunk_")
+      [(CmmReg (CmmGlobal BaseReg), AddrHint),
+       (ptr, AddrHint)]
+      False
diff --git a/compiler/cmm/CLabel.hs b/compiler/cmm/CLabel.hs
index 0c3dae8001bdf2c93c02d25bfada3831761f81de..66e39f0d69d75c1d4d04d6ccf6ee670acc4ccc6e 100644
--- a/compiler/cmm/CLabel.hs
+++ b/compiler/cmm/CLabel.hs
@@ -40,6 +40,7 @@ module CLabel (
         mkAsmTempDieLabel,
 
         mkDirty_MUT_VAR_Label,
+        mkNonmovingWriteBarrierEnabledLabel,
         mkUpdInfoLabel,
         mkBHUpdInfoLabel,
         mkIndStaticInfoLabel,
@@ -484,7 +485,9 @@ mkBlockInfoTableLabel name c = IdLabel name c BlockInfoTable
                                -- See Note [Proc-point local block entry-point].
 
 -- Constructing Cmm Labels
-mkDirty_MUT_VAR_Label, mkUpdInfoLabel,
+mkDirty_MUT_VAR_Label,
+    mkNonmovingWriteBarrierEnabledLabel,
+    mkUpdInfoLabel,
     mkBHUpdInfoLabel, mkIndStaticInfoLabel, mkMainCapabilityLabel,
     mkMAP_FROZEN_CLEAN_infoLabel, mkMAP_FROZEN_DIRTY_infoLabel,
     mkMAP_DIRTY_infoLabel,
@@ -494,6 +497,8 @@ mkDirty_MUT_VAR_Label, mkUpdInfoLabel,
     mkSMAP_FROZEN_CLEAN_infoLabel, mkSMAP_FROZEN_DIRTY_infoLabel,
     mkSMAP_DIRTY_infoLabel, mkBadAlignmentLabel :: CLabel
 mkDirty_MUT_VAR_Label           = mkForeignLabel (fsLit "dirty_MUT_VAR") Nothing ForeignLabelInExternalPackage IsFunction
+mkNonmovingWriteBarrierEnabledLabel
+                                = CmmLabel rtsUnitId (fsLit "nonmoving_write_barrier_enabled") CmmData
 mkUpdInfoLabel                  = CmmLabel rtsUnitId (fsLit "stg_upd_frame")         CmmInfo
 mkBHUpdInfoLabel                = CmmLabel rtsUnitId (fsLit "stg_bh_upd_frame" )     CmmInfo
 mkIndStaticInfoLabel            = CmmLabel rtsUnitId (fsLit "stg_IND_STATIC")        CmmInfo
diff --git a/docs/users_guide/runtime_control.rst b/docs/users_guide/runtime_control.rst
index 665c8c08e062bf096fa9235ed47e456832346782..add0b6c53724f0492147cf162383d4cfe868e576 100644
--- a/docs/users_guide/runtime_control.rst
+++ b/docs/users_guide/runtime_control.rst
@@ -313,6 +313,24 @@ collection. Hopefully, you won't need any of these in normal operation,
 but there are several things that can be tweaked for maximum
 performance.
 
+.. rts-flag:: -xn
+
+    :default: off
+    :since: 8.8.1
+
+    .. index::
+       single: concurrent mark and sweep
+
+    Enable the concurrent mark-and-sweep garbage collector for old generation
+    collections. Typically GHC uses a stop-the-world copying garbage collector
+    for all generations. This can cause long pauses in execution during major
+    garbage collections. :rts-flag:`-xn` enables the use of a concurrent
+    mark-and-sweep garbage collector for oldest generation collections.
+    Under this collection strategy, oldest-generation garbage collection
+    can proceed concurrently with mutation.
+
+    Note that :rts-flag:`-xn` cannot be used together with ``-G1`` or :rts-flag:`-c`.
+
 .. rts-flag:: -A ⟨size⟩
 
     :default: 1MB
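
As a usage illustration (not part of the hunks above): with this flag in place, a program is run with the concurrent old-generation collector enabled via, for example, ./prog +RTS -xn -RTS; as documented here, combining it with -G1 or -c is rejected at startup.
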
diff --git a/includes/Cmm.h b/includes/Cmm.h
index 21d5da310c341855744ead6f1358418d473ed03e..546e81e8f6b7088afcf0a87c401b55c138282c48 100644
--- a/includes/Cmm.h
+++ b/includes/Cmm.h
@@ -842,6 +842,10 @@
       __gen = TO_W_(bdescr_gen_no(__bd));                       \
       if (__gen > 0) { recordMutableCap(__p, __gen); }
 
+/* -----------------------------------------------------------------------------
+   Update remembered set write barrier
+   -------------------------------------------------------------------------- */
+
 /* -----------------------------------------------------------------------------
    Arrays
    -------------------------------------------------------------------------- */
@@ -944,3 +948,25 @@
     prim %memcpy(dst_p, src_p, n * SIZEOF_W, SIZEOF_W);        \
                                                                \
     return (dst);
+
+
+//
+// Nonmoving write barrier helpers
+//
+// See Note [Update remembered set] in NonMovingMark.c.
+
+#if defined(THREADED_RTS)
+#define IF_NONMOVING_WRITE_BARRIER_ENABLED                     \
+    if (W_[nonmoving_write_barrier_enabled] != 0) (likely: False)
+#else
+// A similar measure is also taken in rts/NonMoving.h, but that isn't visible from C--
+#define IF_NONMOVING_WRITE_BARRIER_ENABLED                     \
+    if (0)
+#define nonmoving_write_barrier_enabled 0
+#endif
+
+// A useful helper for pushing a pointer to the update remembered set.
+#define updateRemembSetPushPtr(p)                                    \
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {                             \
+      ccall updateRemembSetPushClosure_(BaseReg "ptr", p "ptr");     \
+    }
diff --git a/includes/Rts.h b/includes/Rts.h
index 256a3e586cf48cb9711a6078fe6a8b2bb1526a62..d0f53710076e4c9321ffe54978fb6007b631a4a0 100644
--- a/includes/Rts.h
+++ b/includes/Rts.h
@@ -80,6 +80,10 @@ extern "C" {
 #define RTS_UNREACHABLE abort()
 #endif
 
+/* Prefetch primitives */
+#define prefetchForRead(ptr) __builtin_prefetch(ptr, 0)
+#define prefetchForWrite(ptr) __builtin_prefetch(ptr, 1)
+
 /* Fix for mingw stat problem (done here so it's early enough) */
 #if defined(mingw32_HOST_OS)
 #define __MSVCRT__ 1
@@ -203,6 +207,7 @@ void _assertFail(const char *filename, unsigned int linenum)
 #include "rts/storage/ClosureMacros.h"
 #include "rts/storage/MBlock.h"
 #include "rts/storage/GC.h"
+#include "rts/NonMoving.h"
 
 /* Other RTS external APIs */
 #include "rts/Parallel.h"
@@ -287,26 +292,27 @@ TICK_VAR(2)
 #define IF_RTSFLAGS(c,s)  if (RtsFlags.c) { s; } doNothing()
 
 #if defined(DEBUG)
+/* See Note [RtsFlags is a pointer in STG code] */
 #if IN_STG_CODE
 #define IF_DEBUG(c,s)  if (RtsFlags[0].DebugFlags.c) { s; } doNothing()
 #else
 #define IF_DEBUG(c,s)  if (RtsFlags.DebugFlags.c) { s; } doNothing()
-#endif
+#endif /* IN_STG_CODE */
 #else
 #define IF_DEBUG(c,s)  doNothing()
-#endif
+#endif /* DEBUG */
 
 #if defined(DEBUG)
 #define DEBUG_ONLY(s) s
 #else
 #define DEBUG_ONLY(s) doNothing()
-#endif
+#endif /* DEBUG */
 
 #if defined(DEBUG)
 #define DEBUG_IS_ON   1
 #else
 #define DEBUG_IS_ON   0
-#endif
+#endif /* DEBUG */
 
 /* -----------------------------------------------------------------------------
    Useful macros and inline functions
diff --git a/includes/Stg.h b/includes/Stg.h
index 73de97055f29a68cea88144c6333b153154c4081..46f71c02417888313ada428bc4e90e68ffaf1a93 100644
--- a/includes/Stg.h
+++ b/includes/Stg.h
@@ -597,3 +597,4 @@ typedef union {
   c;                                            \
 })
 #endif
+
diff --git a/includes/rts/EventLogFormat.h b/includes/rts/EventLogFormat.h
index 7b989b014b7d338841e079bcd7e03736b0757696..d5ed01a8649c8f3e0b911d02b98e799c5b262fb8 100644
--- a/includes/rts/EventLogFormat.h
+++ b/includes/rts/EventLogFormat.h
@@ -185,12 +185,21 @@
 
 #define EVENT_USER_BINARY_MSG              181
 
+#define EVENT_CONC_MARK_BEGIN              200
+#define EVENT_CONC_MARK_END                201
+#define EVENT_CONC_SYNC_BEGIN              202
+#define EVENT_CONC_SYNC_END                203
+#define EVENT_CONC_SWEEP_BEGIN             204
+#define EVENT_CONC_SWEEP_END               205
+#define EVENT_CONC_UPD_REM_SET_FLUSH       206
+#define EVENT_NONMOVING_HEAP_CENSUS        207
+
 /*
  * The highest event code +1 that ghc itself emits. Note that some event
  * ranges higher than this are reserved but not currently emitted by ghc.
  * This must match the size of the EventDesc[] array in EventLog.c
  */
-#define NUM_GHC_EVENT_TAGS        182
+#define NUM_GHC_EVENT_TAGS        208
 
 #if 0  /* DEPRECATED EVENTS: */
 /* we don't actually need to record the thread, it's implicit */
diff --git a/includes/rts/Flags.h b/includes/rts/Flags.h
index b3caf13c1f44ba1416108de6fa1d6639cdabf1e4..f27ce23b0b88f385205b7cfca81fa15fcfe3f71e 100644
--- a/includes/rts/Flags.h
+++ b/includes/rts/Flags.h
@@ -52,6 +52,9 @@ typedef struct _GC_FLAGS {
     double  oldGenFactor;
     double  pcFreeHeap;
 
+    bool         useNonmoving; // default = false
+    bool         nonmovingSelectorOpt; // Do selector optimization in the
+                                       // non-moving heap, default = false
     uint32_t     generations;
     bool squeezeUpdFrames;
 
@@ -95,6 +98,7 @@ typedef struct _DEBUG_FLAGS {
     bool weak;           /* 'w' */
     bool gccafs;         /* 'G' */
     bool gc;             /* 'g' */
+    bool nonmoving_gc;   /* 'n' */
     bool block_alloc;    /* 'b' */
     bool sanity;         /* 'S'   warning: might be expensive! */
     bool zero_on_gc;     /* 'Z' */
@@ -168,6 +172,7 @@ typedef struct _TRACE_FLAGS {
     bool timestamp;      /* show timestamp in stderr output */
     bool scheduler;      /* trace scheduler events */
     bool gc;             /* trace GC events */
+    bool nonmoving_gc;   /* trace nonmoving GC events */
     bool sparks_sampled; /* trace spark events by a sampled method */
     bool sparks_full;    /* trace spark events 100% accurately */
     bool user;           /* trace user events (emitted from Haskell code) */
@@ -268,7 +273,11 @@ typedef struct _RTS_FLAGS {
 #if defined(COMPILING_RTS_MAIN)
 extern DLLIMPORT RTS_FLAGS RtsFlags;
 #elif IN_STG_CODE
-/* Hack because the C code generator can't generate '&label'. */
+/* Note [RtsFlags is a pointer in STG code]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * When compiling with IN_STG_CODE the RtsFlags symbol is defined as a pointer.
+ * This is necessary because the C code generator can't generate '&label'.
+ */
 extern RTS_FLAGS RtsFlags[];
 #else
 extern RTS_FLAGS RtsFlags;
diff --git a/includes/rts/NonMoving.h b/includes/rts/NonMoving.h
new file mode 100644
index 0000000000000000000000000000000000000000..314c582a1e158f2703cc505f5d6ea407dec693a1
--- /dev/null
+++ b/includes/rts/NonMoving.h
@@ -0,0 +1,43 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2018-2019
+ *
+ * Non-moving garbage collector
+ *
+ * Do not #include this file directly: #include "Rts.h" instead.
+ *
+ * To understand the structure of the RTS headers, see the wiki:
+ *   http://ghc.haskell.org/trac/ghc/wiki/Commentary/SourceTree/Includes
+ *
+ * -------------------------------------------------------------------------- */
+
+#pragma once
+
+// Forward declarations for Stg.h
+struct StgClosure_;
+struct StgThunk_;
+struct Capability_;
+
+/* This is called by the code generator */
+extern DLL_IMPORT_RTS
+void updateRemembSetPushClosure_(StgRegTable *reg, struct StgClosure_ *p);
+
+extern DLL_IMPORT_RTS
+void updateRemembSetPushThunk_(StgRegTable *reg, struct StgThunk_ *p);
+
+// Forward declaration for unregisterised backend.
+EF_(stg_copyArray_barrier);
+
+// Note that RTS code should not condition on this directly, but rather
+// use the IF_NONMOVING_WRITE_BARRIER_ENABLED macro to ensure that
+// the barrier is eliminated in the non-threaded RTS.
+extern StgWord DLL_IMPORT_DATA_VAR(nonmoving_write_barrier_enabled);
+
+// A similar macro is defined in includes/Cmm.h for C-- code.
+#if defined(THREADED_RTS)
+#define IF_NONMOVING_WRITE_BARRIER_ENABLED \
+    if (RTS_UNLIKELY(nonmoving_write_barrier_enabled))
+#else
+#define IF_NONMOVING_WRITE_BARRIER_ENABLED \
+    if (0)
+#endif
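
For illustration only (not part of this patch): a minimal sketch of the intended usage pattern for the macro above in RTS C code, assuming the declarations in this header; the helper name overwrite_pointer_sketch is hypothetical.

#include "Rts.h"

/* Sketch: overwrite a pointer field while preserving the nonmoving collector's
 * snapshot invariant. The old value is pushed to the update remembered set
 * only when the write barrier is enabled, and the guarded block compiles
 * away entirely in the non-threaded RTS. */
static void overwrite_pointer_sketch(StgRegTable *reg, StgClosure **field, StgClosure *new_val)
{
    IF_NONMOVING_WRITE_BARRIER_ENABLED {
        updateRemembSetPushClosure_(reg, *field);
    }
    *field = new_val;
}
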
diff --git a/includes/rts/storage/Block.h b/includes/rts/storage/Block.h
index ecd6bf5dd8f62cdcde4f17e2f2d05b63f5709b6b..4afc3689cb5591ba22eb9b31b8a46dcf6268578b 100644
--- a/includes/rts/storage/Block.h
+++ b/includes/rts/storage/Block.h
@@ -88,15 +88,23 @@ typedef struct bdescr_ {
 
     StgPtr start;              // [READ ONLY] start addr of memory
 
-    StgPtr free;               // First free byte of memory.
-                               // allocGroup() sets this to the value of start.
-                               // NB. during use this value should lie
-                               // between start and start + blocks *
-                               // BLOCK_SIZE.  Values outside this
-                               // range are reserved for use by the
-                               // block allocator.  In particular, the
-                               // value (StgPtr)(-1) is used to
-                               // indicate that a block is unallocated.
+    union {
+        StgPtr free;               // First free byte of memory.
+                                   // allocGroup() sets this to the value of start.
+                                   // NB. during use this value should lie
+                                   // between start and start + blocks *
+                                   // BLOCK_SIZE.  Values outside this
+                                   // range are reserved for use by the
+                                   // block allocator.  In particular, the
+                                   // value (StgPtr)(-1) is used to
+                                   // indicate that a block is unallocated.
+                                   //
+                                   // Unused by the non-moving allocator.
+        struct NonmovingSegmentInfo {
+            StgWord8 log_block_size;
+            StgWord16 next_free_snap;
+        } nonmoving_segment;
+    };
 
     struct bdescr_ *link;      // used for chaining blocks together
 
@@ -141,7 +149,8 @@ typedef struct bdescr_ {
 #define BF_LARGE     2
 /* Block is pinned */
 #define BF_PINNED    4
-/* Block is to be marked, not copied */
+/* Block is to be marked, not copied. Also used for marked large objects in
+ * non-moving heap. */
 #define BF_MARKED    8
 /* Block is executable */
 #define BF_EXEC      32
@@ -153,6 +162,12 @@ typedef struct bdescr_ {
 #define BF_SWEPT     256
 /* Block is part of a Compact */
 #define BF_COMPACT   512
+/* A non-moving allocator segment (see NonMoving.c) */
+#define BF_NONMOVING 1024
+/* A large object which has been moved off of oldest_gen->large_objects and
+ * onto nonmoving_large_objects. The mark phase ignores objects which aren't
+ * so-flagged */
+#define BF_NONMOVING_SWEEPING 2048
 /* Maximum flag value (do not define anything higher than this!) */
 #define BF_FLAG_MAX  (1 << 15)
 
@@ -290,6 +305,13 @@ EXTERN_INLINE bdescr* allocBlock(void)
 
 bdescr *allocGroupOnNode(uint32_t node, W_ n);
 
+// Allocate n blocks, aligned at an n-block boundary. The returned bdescr
+// will satisfy the invariant
+//
+//     bdescr->start % (BLOCK_SIZE * n) == 0
+//
+bdescr *allocAlignedGroupOnNode(uint32_t node, W_ n);
+
 EXTERN_INLINE bdescr* allocBlockOnNode(uint32_t node);
 EXTERN_INLINE bdescr* allocBlockOnNode(uint32_t node)
 {
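
For illustration only (not part of this patch): the point of the alignment invariant documented for allocAlignedGroupOnNode above is that, when n is a power of two, the start of the group containing any interior pointer can be recovered by masking, which is what the non-moving allocator relies on to find a segment from an object address. The helper name below is hypothetical.

#include "Rts.h"

/* Sketch: recover the aligned group start from an interior pointer, relying on
 * bdescr->start % (BLOCK_SIZE * n) == 0 and n_blocks being a power of two. */
static void *aligned_group_start_sketch(void *p, StgWord n_blocks)
{
    StgWord group_size = BLOCK_SIZE * n_blocks;
    return (void *) ((StgWord) p & ~(group_size - 1));
}
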
diff --git a/includes/rts/storage/ClosureMacros.h b/includes/rts/storage/ClosureMacros.h
index a3873cc49dadcf0b4e3347fcac9ae000ef084084..2af50863d0d7f4393f53baf8572936b1b883f171 100644
--- a/includes/rts/storage/ClosureMacros.h
+++ b/includes/rts/storage/ClosureMacros.h
@@ -107,6 +107,20 @@ INLINE_HEADER const StgConInfoTable *get_con_itbl(const StgClosure *c)
    return CON_INFO_PTR_TO_STRUCT((c)->header.info);
 }
 
+/* Used when we expect another thread to be mutating the info table pointer of
+ * a closure (e.g. when busy-waiting on a WHITEHOLE).
+ */
+INLINE_HEADER const StgInfoTable *get_volatile_itbl(StgClosure *c) {
+    // The volatile here is important to ensure that the compiler does not
+    // optimise away multiple loads, e.g. in a busy-wait loop. Note that
+    // we can't use VOLATILE_LOAD here as the casts result in strict aliasing
+    // rule violations and this header may be compiled outside of the RTS
+    // (where we use -fno-strict-aliasing).
+    StgInfoTable * *volatile p = (StgInfoTable * *volatile) &c->header.info;
+    return INFO_PTR_TO_STRUCT(*p);
+}
+
+
 INLINE_HEADER StgHalfWord GET_TAG(const StgClosure *con)
 {
     return get_itbl(con)->srt;
diff --git a/includes/rts/storage/Closures.h b/includes/rts/storage/Closures.h
index 6088fc8a1012b5c42f8ced778f2b30949d2081ad..b2b5eda407df07a5278fbbcc031f52b2589e1e31 100644
--- a/includes/rts/storage/Closures.h
+++ b/includes/rts/storage/Closures.h
@@ -94,7 +94,7 @@ typedef struct StgClosure_ {
     struct StgClosure_ *payload[];
 } *StgClosurePtr; // StgClosure defined in rts/Types.h
 
-typedef struct {
+typedef struct StgThunk_ {
     StgThunkHeader  header;
     struct StgClosure_ *payload[];
 } StgThunk;
diff --git a/includes/rts/storage/GC.h b/includes/rts/storage/GC.h
index 1571975852c0220fe274b8c9f4cd52a4e252ca95..7931433019869f2fe13e884441dd030a348c421f 100644
--- a/includes/rts/storage/GC.h
+++ b/includes/rts/storage/GC.h
@@ -234,15 +234,23 @@ void setKeepCAFs (void);
    and is put on the mutable list.
    -------------------------------------------------------------------------- */
 
-void dirty_MUT_VAR(StgRegTable *reg, StgClosure *p);
+void dirty_MUT_VAR(StgRegTable *reg, StgMutVar *mv, StgClosure *old);
 
 /* set to disable CAF garbage collection in GHCi. */
 /* (needed when dynamic libraries are used). */
 extern bool keepCAFs;
 
+#include "rts/Flags.h"
+
 INLINE_HEADER void initBdescr(bdescr *bd, generation *gen, generation *dest)
 {
     bd->gen     = gen;
     bd->gen_no  = gen->no;
     bd->dest_no = dest->no;
+
+#if !IN_STG_CODE
+    /* See Note [RtsFlags is a pointer in STG code] */
+    ASSERT(gen->no < RtsFlags.GcFlags.generations);
+    ASSERT(dest->no < RtsFlags.GcFlags.generations);
+#endif
 }
diff --git a/includes/rts/storage/InfoTables.h b/includes/rts/storage/InfoTables.h
index 4de5207b4df2ecbcd2aee395fe184cbf621a1f1b..b97e12982b36d19ae32c3eed5a93c557a5d4fc8f 100644
--- a/includes/rts/storage/InfoTables.h
+++ b/includes/rts/storage/InfoTables.h
@@ -355,7 +355,7 @@ typedef struct StgConInfoTable_ {
  */
 #if defined(TABLES_NEXT_TO_CODE)
 #define GET_CON_DESC(info) \
-            ((const char *)((StgWord)((info)+1) + (info->con_desc)))
+            ((const char *)((StgWord)((info)+1) + ((info)->con_desc)))
 #else
 #define GET_CON_DESC(info) ((const char *)(info)->con_desc)
 #endif
diff --git a/includes/rts/storage/TSO.h b/includes/rts/storage/TSO.h
index 93018581fdca46d143b9e6d74e44867631254226..d706282796b321b2a684d58f0cb02cb299471133 100644
--- a/includes/rts/storage/TSO.h
+++ b/includes/rts/storage/TSO.h
@@ -185,10 +185,66 @@ typedef struct StgTSO_ {
 
 } *StgTSOPtr; // StgTSO defined in rts/Types.h
 
+/* Note [StgStack dirtiness flags and concurrent marking]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * Without concurrent collection by the nonmoving collector the stack dirtiness story
+ * is quite simple: The stack is either STACK_DIRTY (meaning it has been added to mut_list)
+ * or not.
+ *
+ * However, things are considerably more complicated with concurrent collection
+ * (namely, when nonmoving_write_barrier_enabled is set): In addition to adding
+ * the stack to mut_list and flagging it as STACK_DIRTY, we also must ensure
+ * that stacks are marked in accordance with the nonmoving collector's snapshot
+ * invariant. That is: every stack alive at the time the snapshot is taken must
+ * be marked at some point after the snapshot is taken and before it is
+ * mutated or the sweep phase commences.
+ *
+ * This marking may be done by the concurrent mark phase (in the case of a
+ * thread that never runs during the concurrent mark) or by the mutator when
+ * dirtying the stack. However, it is unsafe for the concurrent collector to
+ * traverse the stack while it is under mutation. Consequently, the following
+ * handshake is obeyed by the mutator's write barrier and the concurrent mark to
+ * ensure this doesn't happen:
+ *
+ * 1. The entity seeking to mark first checks that the stack lives in the nonmoving
+ *    generation; if not then the stack was not alive at the time the snapshot
+ *    was taken and therefore we need not mark it.
+ *
+ * 2. The entity seeking to mark checks the stack's mark bit. If it is set then
+ *    no mark is necessary.
+ *
+ * 3. The entity seeking to mark tries to lock the stack for marking by
+ *    atomically setting its `marking` field to the current non-moving mark
+ *    epoch:
+ *
+ *    a. If the mutator finds the concurrent collector has already locked the
+ *       stack then it waits until it is finished (indicated by the mark bit
+ *       being set) before proceeding with execution.
+ *
+ *    b. If the concurrent collector finds that the mutator has locked the stack
+ *       then it moves on, leaving the mutator to mark it. There is no need to wait;
+ *       the mark is guaranteed to finish before sweep due to the post-mark
+ *       synchronization with mutators.
+ *
+ *    c. Whoever succeeds in locking the stack is responsible for marking it and
+ *       setting the stack's mark bit (either the BF_MARKED bit for large objects
+ *       or otherwise its bit in its segment's mark bitmap).
+ *
+ * To ensure that mutation does not proceed until the stack is fully marked the
+ * mark phase must not set the mark bit until it has finished tracing.
+ *
+ */
+
+#define STACK_DIRTY 1
+// used by sanity checker to verify that all dirty stacks are on the mutable list
+#define STACK_SANE 64
+
 typedef struct StgStack_ {
     StgHeader  header;
     StgWord32  stack_size;     // stack size in *words*
-    StgWord32  dirty;          // non-zero => dirty
+    StgWord8   dirty;          // non-zero => dirty
+    StgWord8   marking;        // non-zero => someone is currently marking the stack
     StgPtr     sp;             // current stack pointer
     StgWord    stack[];
 } StgStack;
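
For illustration only (not part of this patch): step 3 of the handshake in Note [StgStack dirtiness flags and concurrent marking] above, in which the mutator and the concurrent collector race to lock the stack by atomically setting its marking field to the current mark epoch, could be expressed with the cas_word8 primitive added elsewhere in this patch. The function and parameter names here are hypothetical.

#include "Rts.h"
#include <stdbool.h>

/* Sketch: returns true iff the caller won the race and is therefore
 * responsible for tracing the stack and then setting its mark bit. */
static bool try_lock_stack_for_marking_sketch(StgStack *stack, StgWord8 mark_epoch)
{
    StgWord8 prev = stack->marking;
    if (prev == mark_epoch) {
        return false;   // already locked (or marked) during this epoch
    }
    // cas_word8 returns the value it saw; the caller won only if that value
    // is still the one read above.
    return cas_word8(&stack->marking, prev, mark_epoch) == prev;
}
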
diff --git a/includes/stg/MiscClosures.h b/includes/stg/MiscClosures.h
index 217b1bc89d85ce78f3bc947260af791af0b1d430..7a2ac2ef5101189805cdcca11b32b284e0c7d144 100644
--- a/includes/stg/MiscClosures.h
+++ b/includes/stg/MiscClosures.h
@@ -543,4 +543,11 @@ void * pushCostCentre (void *ccs, void *cc);
 // Capability.c
 extern unsigned int n_capabilities;
 
+/* -----------------------------------------------------------------------------
+   Nonmoving GC write barrier
+   -------------------------------------------------------------------------- */
+
+#include <rts/NonMoving.h>
+
+
 #endif
diff --git a/includes/stg/SMP.h b/includes/stg/SMP.h
index 2d6a220a9e283138b9ba01b46e1fa05b33be9db7..60f084be9a964a942d4020fed267de601b2606ee 100644
--- a/includes/stg/SMP.h
+++ b/includes/stg/SMP.h
@@ -49,6 +49,7 @@ EXTERN_INLINE StgWord xchg(StgPtr p, StgWord w);
  * }
  */
 EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n);
+EXTERN_INLINE StgWord8 cas_word8(StgWord8 *volatile p, StgWord8 o, StgWord8 n);
 
 /*
  * Atomic addition by the provided quantity
@@ -283,6 +284,12 @@ cas(StgVolatilePtr p, StgWord o, StgWord n)
     return __sync_val_compare_and_swap(p, o, n);
 }
 
+EXTERN_INLINE StgWord8
+cas_word8(StgWord8 *volatile p, StgWord8 o, StgWord8 n)
+{
+    return __sync_val_compare_and_swap(p, o, n);
+}
+
 // RRN: Generalized to arbitrary increments to enable fetch-and-add in
 // Haskell code (fetchAddIntArray#).
 // PT: add-and-fetch, returns new value
@@ -434,6 +441,18 @@ cas(StgVolatilePtr p, StgWord o, StgWord n)
     return result;
 }
 
+EXTERN_INLINE StgWord8 cas_word8(StgWord8 *volatile p, StgWord8 o, StgWord8 n);
+EXTERN_INLINE StgWord8
+cas_word8(StgWord8 *volatile p, StgWord8 o, StgWord8 n)
+{
+    StgWord8 result;
+    result = *p;
+    if (result == o) {
+        *p = n;
+    }
+    return result;
+}
+
 EXTERN_INLINE StgWord atomic_inc(StgVolatilePtr p, StgWord incr);
 EXTERN_INLINE StgWord
 atomic_inc(StgVolatilePtr p, StgWord incr)
diff --git a/includes/stg/Types.h b/includes/stg/Types.h
index 08ba58c799ee4e95261a24062cda1956b32c4340..8ce9e3c156d0ebb98439dc751b54deecb4baa178 100644
--- a/includes/stg/Types.h
+++ b/includes/stg/Types.h
@@ -192,3 +192,10 @@ typedef StgWord8*          StgByteArray;
 
 typedef void  *(*(*StgFunPtr)(void))(void);
 typedef StgFunPtr StgFun(void);
+
+// Forward declarations for the unregisterised backend, which
+// only depends upon Stg.h and not the entirety of Rts.h, which
+// is where these are defined.
+struct StgClosure_;
+struct StgThunk_;
+struct Capability_;
diff --git a/libraries/base/GHC/RTS/Flags.hsc b/libraries/base/GHC/RTS/Flags.hsc
index 249bcd5a98e2df72fb46fdce6cfb78148888bb52..913344c1660db8a1b645df937bf9725784fbddd0 100644
--- a/libraries/base/GHC/RTS/Flags.hsc
+++ b/libraries/base/GHC/RTS/Flags.hsc
@@ -150,21 +150,22 @@ data MiscFlags = MiscFlags
 --
 -- @since 4.8.0.0
 data DebugFlags = DebugFlags
-    { scheduler   :: Bool -- ^ @s@
-    , interpreter :: Bool -- ^ @i@
-    , weak        :: Bool -- ^ @w@
-    , gccafs      :: Bool -- ^ @G@
-    , gc          :: Bool -- ^ @g@
-    , block_alloc :: Bool -- ^ @b@
-    , sanity      :: Bool -- ^ @S@
-    , stable      :: Bool -- ^ @t@
-    , prof        :: Bool -- ^ @p@
-    , linker      :: Bool -- ^ @l@ the object linker
-    , apply       :: Bool -- ^ @a@
-    , stm         :: Bool -- ^ @m@
-    , squeeze     :: Bool -- ^ @z@ stack squeezing & lazy blackholing
-    , hpc         :: Bool -- ^ @c@ coverage
-    , sparks      :: Bool -- ^ @r@
+    { scheduler      :: Bool -- ^ @s@
+    , interpreter    :: Bool -- ^ @i@
+    , weak           :: Bool -- ^ @w@
+    , gccafs         :: Bool -- ^ @G@
+    , gc             :: Bool -- ^ @g@
+    , nonmoving_gc   :: Bool -- ^ @n@
+    , block_alloc    :: Bool -- ^ @b@
+    , sanity         :: Bool -- ^ @S@
+    , stable         :: Bool -- ^ @t@
+    , prof           :: Bool -- ^ @p@
+    , linker         :: Bool -- ^ @l@ the object linker
+    , apply          :: Bool -- ^ @a@
+    , stm            :: Bool -- ^ @m@
+    , squeeze        :: Bool -- ^ @z@ stack squeezing & lazy blackholing
+    , hpc            :: Bool -- ^ @c@ coverage
+    , sparks         :: Bool -- ^ @r@
     } deriving ( Show -- ^ @since 4.8.0.0
                )
 
@@ -291,6 +292,8 @@ data TraceFlags = TraceFlags
     , timestamp      :: Bool -- ^ show timestamp in stderr output
     , traceScheduler :: Bool -- ^ trace scheduler events
     , traceGc        :: Bool -- ^ trace GC events
+    , traceNonmovingGc
+                     :: Bool -- ^ trace nonmoving GC heap census samples
     , sparksSampled  :: Bool -- ^ trace spark events by a sampled method
     , sparksFull     :: Bool -- ^ trace spark events 100% accurately
     , user           :: Bool -- ^ trace user events (emitted from Haskell code)
@@ -462,6 +465,8 @@ getDebugFlags = do
                    (#{peek DEBUG_FLAGS, gccafs} ptr :: IO CBool))
              <*> (toBool <$>
                    (#{peek DEBUG_FLAGS, gc} ptr :: IO CBool))
+             <*> (toBool <$>
+                   (#{peek DEBUG_FLAGS, nonmoving_gc} ptr :: IO CBool))
              <*> (toBool <$>
                    (#{peek DEBUG_FLAGS, block_alloc} ptr :: IO CBool))
              <*> (toBool <$>
@@ -522,6 +527,8 @@ getTraceFlags = do
                    (#{peek TRACE_FLAGS, scheduler} ptr :: IO CBool))
              <*> (toBool <$>
                    (#{peek TRACE_FLAGS, gc} ptr :: IO CBool))
+             <*> (toBool <$>
+                   (#{peek TRACE_FLAGS, nonmoving_gc} ptr :: IO CBool))
              <*> (toBool <$>
                    (#{peek TRACE_FLAGS, sparks_sampled} ptr :: IO CBool))
              <*> (toBool <$>
diff --git a/libraries/base/tests/all.T b/libraries/base/tests/all.T
index 61367e44918d87b64cbdf357cc9ac54e5eeef4c5..2d4119e54366fb022b498eda2f6e5e31d496a52a 100644
--- a/libraries/base/tests/all.T
+++ b/libraries/base/tests/all.T
@@ -74,7 +74,7 @@ test('length001',
      # excessive amounts of stack space. So we specifically set a low
      # stack limit and mark it as failing under a few conditions.
      [extra_run_opts('+RTS -K8m -RTS'),
-     expect_fail_for(['normal', 'threaded1', 'llvm'])],
+     expect_fail_for(['normal', 'threaded1', 'llvm', 'nonmoving', 'nonmoving_thr', 'nonmoving_thr_ghc'])],
      compile_and_run, [''])
 
 test('ratio001', 	normal, compile_and_run, [''])
diff --git a/libraries/ghc-heap/tests/all.T b/libraries/ghc-heap/tests/all.T
index afa224fde7dc218a211052fd1ba14d0e6bc35d6f..89e6f47ecb028f5164b1fbb4a7bdeb88c14625c9 100644
--- a/libraries/ghc-heap/tests/all.T
+++ b/libraries/ghc-heap/tests/all.T
@@ -2,7 +2,11 @@ test('heap_all',
      [when(have_profiling(), extra_ways(['prof'])),
       # These ways produce slightly different heap representations.
       # Currently we don't test them.
-      omit_ways(['ghci', 'hpc'])
+      omit_ways(['ghci', 'hpc',
+                 'nonmoving', 'nonmoving_thr', 'nonmoving_thr_ghc']),
+      # The debug RTS initializes some fields with 0xaa and so
+      # this test spuriously fails.
+      when(compiler_debugged(), skip)
      ],
      compile_and_run, [''])
 
diff --git a/libraries/stm b/libraries/stm
index a925aaa505d9259f26e2f3fb2ffa2e9b66b48749..f9979c926ca539362b5a2412359750e8b498e53a 160000
--- a/libraries/stm
+++ b/libraries/stm
@@ -1 +1 @@
-Subproject commit a925aaa505d9259f26e2f3fb2ffa2e9b66b48749
+Subproject commit f9979c926ca539362b5a2412359750e8b498e53a
diff --git a/rts/Apply.cmm b/rts/Apply.cmm
index 8d7fc3c01224a42fccb321a79ed700c076a1e0f8..eeb760c5ed073539e2fa7c843275474d12a315a9 100644
--- a/rts/Apply.cmm
+++ b/rts/Apply.cmm
@@ -654,6 +654,8 @@ INFO_TABLE(stg_AP_STACK,/*special layout*/0,0,AP_STACK,"AP_STACK","AP_STACK")
     /* someone else beat us to it */
     jump ENTRY_LBL(stg_WHITEHOLE) (ap);
   }
+  // Can't add StgInd_indirectee(ap) to UpdRemSet here because the old value is
+  // not reachable.
   StgInd_indirectee(ap) = CurrentTSO;
   prim_write_barrier;
   SET_INFO(ap, __stg_EAGER_BLACKHOLE_info);
diff --git a/rts/Capability.c b/rts/Capability.c
index 33a94398cd17705fa7ce9fa2ab8794f2a61f6814..0baa4ef205c8f64361e442f18af6b77d83bfd0a5 100644
--- a/rts/Capability.c
+++ b/rts/Capability.c
@@ -27,6 +27,7 @@
 #include "STM.h"
 #include "RtsUtils.h"
 #include "sm/OSMem.h"
+#include "sm/BlockAlloc.h" // for countBlocks()
 
 #if !defined(mingw32_HOST_OS)
 #include "rts/IOManager.h" // for setIOManagerControlFd()
@@ -291,6 +292,11 @@ initCapability (Capability *cap, uint32_t i)
                                           RtsFlags.GcFlags.generations,
                                           "initCapability");
 
+
+    // At this point the storage manager is not initialized yet, so this will
+    // be initialized in initStorage().
+    cap->upd_rem_set.queue.blocks = NULL;
+
     for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
         cap->mut_lists[g] = NULL;
     }
@@ -748,6 +754,8 @@ static Capability * waitForReturnCapability (Task *task)
  * result of the external call back to the Haskell thread that
  * made it.
  *
+ * pCap is strictly an output.
+ *
  * ------------------------------------------------------------------------- */
 
 void waitForCapability (Capability **pCap, Task *task)
@@ -840,6 +848,9 @@ void waitForCapability (Capability **pCap, Task *task)
  *      SYNC_GC_PAR), either to do a sequential GC, forkProcess, or
  *      setNumCapabilities.  We should give up the Capability temporarily.
  *
+ * When yieldCapability returns, *pCap will have been updated to the new
+ * capability held by the caller.
+ *
  * ------------------------------------------------------------------------- */
 
 #if defined(THREADED_RTS)
@@ -855,16 +866,27 @@ yieldCapability (Capability** pCap, Task *task, bool gcAllowed)
     {
         PendingSync *sync = pending_sync;
 
-        if (sync && sync->type == SYNC_GC_PAR) {
-            if (! sync->idle[cap->no]) {
-                traceEventGcStart(cap);
-                gcWorkerThread(cap);
-                traceEventGcEnd(cap);
-                traceSparkCounters(cap);
-                // See Note [migrated bound threads 2]
-                if (task->cap == cap) {
-                    return true;
+        if (sync) {
+            switch (sync->type) {
+            case SYNC_GC_PAR:
+                if (! sync->idle[cap->no]) {
+                    traceEventGcStart(cap);
+                    gcWorkerThread(cap);
+                    traceEventGcEnd(cap);
+                    traceSparkCounters(cap);
+                    // See Note [migrated bound threads 2]
+                    if (task->cap == cap) {
+                        return true;
+                    }
                 }
+                break;
+
+            case SYNC_FLUSH_UPD_REM_SET:
+                debugTrace(DEBUG_nonmoving_gc, "Flushing update remembered set blocks...");
+                break;
+
+            default:
+                break;
             }
         }
     }
diff --git a/rts/Capability.h b/rts/Capability.h
index 0833006b0c848a03a8d3e1a20ac5efed4cd98cfc..3078680aa6d2bb7e1e4072b1ce596613cfa4f2b2 100644
--- a/rts/Capability.h
+++ b/rts/Capability.h
@@ -23,6 +23,7 @@
 #include "sm/GC.h" // for evac_fn
 #include "Task.h"
 #include "Sparks.h"
+#include "sm/NonMovingMark.h" // for MarkQueue
 
 #include "BeginPrivate.h"
 
@@ -84,6 +85,9 @@ struct Capability_ {
     bdescr **mut_lists;
     bdescr **saved_mut_lists; // tmp use during GC
 
+    // The update remembered set for the non-moving collector
+    UpdRemSet upd_rem_set;
+
     // block for allocating pinned objects into
     bdescr *pinned_object_block;
     // full pinned object blocks allocated since the last GC
@@ -258,7 +262,8 @@ extern Capability **capabilities;
 typedef enum {
     SYNC_OTHER,
     SYNC_GC_SEQ,
-    SYNC_GC_PAR
+    SYNC_GC_PAR,
+    SYNC_FLUSH_UPD_REM_SET
 } SyncType;
 
 //
diff --git a/rts/Exception.cmm b/rts/Exception.cmm
index 8ea94b19f279d66ca34126486846b249fcb1f8bd..334d0ef82360dc4a9815880202fb5ec2270e6f28 100644
--- a/rts/Exception.cmm
+++ b/rts/Exception.cmm
@@ -318,6 +318,7 @@ stg_killThreadzh (P_ target, P_ exception)
             return ();
         } else {
             StgTSO_why_blocked(CurrentTSO) = BlockedOnMsgThrowTo;
+            updateRemembSetPushPtr(StgTSO_block_info(CurrentTSO));
             StgTSO_block_info(CurrentTSO) = msg;
             // we must block, and unlock the message before returning
             jump stg_block_throwto (target, exception);
@@ -489,6 +490,8 @@ retry_pop_stack:
       ccall stmAbortTransaction(MyCapability() "ptr", trec "ptr");
       ccall stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr");
 
+      // No need to push `trec` to the update remembered set; it will no longer
+      // be reachable after we overwrite StgTSO.trec.
       StgTSO_trec(CurrentTSO) = NO_TREC;
       if (r != 0) {
         // Transaction was valid: continue searching for a catch frame
@@ -607,6 +610,8 @@ retry_pop_stack:
       outer  = StgTRecHeader_enclosing_trec(trec);
       ccall stmAbortTransaction(MyCapability() "ptr", trec "ptr");
       ccall stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr");
+      // No need to push `trec` to the update remembered set since we just freed
+      // it; it is no longer reachable.
       StgTSO_trec(CurrentTSO) = outer;
       Sp = Sp + SIZEOF_StgCatchSTMFrame;
     }
diff --git a/rts/Messages.c b/rts/Messages.c
index d878db5eda2ea9a4ad9b9f9daa29273e4772f3e9..374f3d673e7a31dd854898566f5f53afcfae75ca 100644
--- a/rts/Messages.c
+++ b/rts/Messages.c
@@ -244,8 +244,8 @@ loop:
         // a barrier is necessary to ensure that all writes are visible.
         // See Note [Heap memory barriers] in SMP.h.
         write_barrier();
+        dirty_TSO(cap, owner); // we will modify owner->bq
         owner->bq = bq;
-        dirty_TSO(cap, owner); // we modified owner->bq
 
         // If the owner of the blackhole is currently runnable, then
         // bump it to the front of the run queue.  This gives the
@@ -262,6 +262,9 @@ loop:
 
         // point to the BLOCKING_QUEUE from the BLACKHOLE
         write_barrier(); // make the BQ visible, see Note [Heap memory barriers].
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
+            updateRemembSetPushClosure(cap, (StgClosure*)p);
+        }
         ((StgInd*)bh)->indirectee = (StgClosure *)bq;
         recordClosureMutated(cap,bh); // bh was mutated
 
@@ -290,6 +293,11 @@ loop:
         }
 #endif
 
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
+            // We are about to overwrite bq->queue; make sure its current value
+            // makes it into the update remembered set
+            updateRemembSetPushClosure(cap, (StgClosure*)bq->queue);
+        }
         msg->link = bq->queue;
         bq->queue = msg;
         // No barrier is necessary here: we are only exposing the
diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm
index ec35ee42b479482cea8ffb6d2ecf4f404a88e60d..b66c561dcb8a6d72e9a59c0e25109ead34d079bf 100644
--- a/rts/PrimOps.cmm
+++ b/rts/PrimOps.cmm
@@ -349,8 +349,13 @@ stg_casArrayzh ( gcptr arr, W_ ind, gcptr old, gcptr new )
         // Compare and Swap Succeeded:
         SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, CCCS);
         len = StgMutArrPtrs_ptrs(arr);
+
         // The write barrier.  We must write a byte into the mark table:
         I8[arr + SIZEOF_StgMutArrPtrs + WDS(len) + (ind >> MUT_ARR_PTRS_CARD_BITS )] = 1;
+
+        // Concurrent GC write barrier
+        updateRemembSetPushPtr(old);
+
         return (0,new);
     }
 }
@@ -462,16 +467,45 @@ stg_thawSmallArrayzh ( gcptr src, W_ offset, W_ n )
     cloneSmallArray(stg_SMALL_MUT_ARR_PTRS_DIRTY_info, src, offset, n)
 }
 
+// Concurrent GC write barrier for pointer array copies
+//
+// hdr_size in bytes. dst_off in words, n in words.
+stg_copyArray_barrier ( W_ hdr_size, gcptr dst, W_ dst_off, W_ n)
+{
+    W_ end, p;
+    ASSERT(n > 0);  // Assumes n==0 is handled by caller
+    p = dst + hdr_size + WDS(dst_off);
+    end = p + WDS(n);
+
+again:
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {
+        ccall updateRemembSetPushClosure_(BaseReg "ptr", W_[p] "ptr");
+    }
+    p = p + WDS(1);
+    if (p < end) {
+        goto again;
+    }
+
+    return ();
+}
+
 stg_copySmallArrayzh ( gcptr src, W_ src_off, gcptr dst, W_ dst_off, W_ n)
 {
     W_ dst_p, src_p, bytes;
 
-    SET_INFO(dst, stg_SMALL_MUT_ARR_PTRS_DIRTY_info);
+    if (n > 0) {
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
+            call stg_copyArray_barrier(SIZEOF_StgSmallMutArrPtrs,
+                                      dst, dst_off, n);
+        }
 
-    dst_p = dst + SIZEOF_StgSmallMutArrPtrs + WDS(dst_off);
-    src_p = src + SIZEOF_StgSmallMutArrPtrs + WDS(src_off);
-    bytes = WDS(n);
-    prim %memcpy(dst_p, src_p, bytes, SIZEOF_W);
+        SET_INFO(dst, stg_SMALL_MUT_ARR_PTRS_DIRTY_info);
+
+        dst_p = dst + SIZEOF_StgSmallMutArrPtrs + WDS(dst_off);
+        src_p = src + SIZEOF_StgSmallMutArrPtrs + WDS(src_off);
+        bytes = WDS(n);
+        prim %memcpy(dst_p, src_p, bytes, SIZEOF_W);
+    }
 
     return ();
 }
@@ -480,15 +514,22 @@ stg_copySmallMutableArrayzh ( gcptr src, W_ src_off, gcptr dst, W_ dst_off, W_ n
 {
     W_ dst_p, src_p, bytes;
 
-    SET_INFO(dst, stg_SMALL_MUT_ARR_PTRS_DIRTY_info);
+    if (n > 0) {
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
+            call stg_copyArray_barrier(SIZEOF_StgSmallMutArrPtrs,
+                                      dst, dst_off, n);
+        }
 
-    dst_p = dst + SIZEOF_StgSmallMutArrPtrs + WDS(dst_off);
-    src_p = src + SIZEOF_StgSmallMutArrPtrs + WDS(src_off);
-    bytes = WDS(n);
-    if (src == dst) {
-        prim %memmove(dst_p, src_p, bytes, SIZEOF_W);
-    } else {
-        prim %memcpy(dst_p, src_p, bytes, SIZEOF_W);
+        SET_INFO(dst, stg_SMALL_MUT_ARR_PTRS_DIRTY_info);
+
+        dst_p = dst + SIZEOF_StgSmallMutArrPtrs + WDS(dst_off);
+        src_p = src + SIZEOF_StgSmallMutArrPtrs + WDS(src_off);
+        bytes = WDS(n);
+        if (src == dst) {
+            prim %memmove(dst_p, src_p, bytes, SIZEOF_W);
+        } else {
+            prim %memcpy(dst_p, src_p, bytes, SIZEOF_W);
+        }
     }
 
     return ();
@@ -510,6 +551,10 @@ stg_casSmallArrayzh ( gcptr arr, W_ ind, gcptr old, gcptr new )
     } else {
         // Compare and Swap Succeeded:
         SET_HDR(arr, stg_SMALL_MUT_ARR_PTRS_DIRTY_info, CCCS);
+
+        // Concurrent GC write barrier
+        updateRemembSetPushPtr(old);
+
         return (0,new);
     }
 }
@@ -549,7 +594,7 @@ stg_casMutVarzh ( gcptr mv, gcptr old, gcptr new )
         return (1,h);
     } else {
         if (GET_INFO(mv) == stg_MUT_VAR_CLEAN_info) {
-            ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr");
+            ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr", old);
         }
         return (0,new);
     }
@@ -562,7 +607,7 @@ stg_casMutVarzh ( gcptr mv, gcptr old, gcptr new )
     } else {
         StgMutVar_var(mv) = new;
         if (GET_INFO(mv) == stg_MUT_VAR_CLEAN_info) {
-            ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr");
+            ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr", old);
         }
         return (0,new);
     }
@@ -629,11 +674,12 @@ stg_atomicModifyMutVar2zh ( gcptr mv, gcptr f )
     (h) = prim %cmpxchgW(mv + SIZEOF_StgHeader + OFFSET_StgMutVar_var, x, y);
     if (h != x) { goto retry; }
 #else
+    h = StgMutVar_var(mv);
     StgMutVar_var(mv) = y;
 #endif
 
     if (GET_INFO(mv) == stg_MUT_VAR_CLEAN_info) {
-        ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr");
+        ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr", h);
     }
 
     return (x,z);
@@ -755,6 +801,9 @@ stg_addCFinalizzerToWeakzh ( W_ fptr,   // finalizer
         return (0);
     }
 
+    // Write barrier for concurrent non-moving collector
+    updateRemembSetPushPtr(StgWeak_cfinalizers(w))
+
     StgCFinalizerList_link(c) = StgWeak_cfinalizers(w);
     StgWeak_cfinalizers(w) = c;
 
@@ -835,6 +884,8 @@ stg_deRefWeakzh ( gcptr w )
     if (info == stg_WEAK_info) {
         code = 1;
         val = StgWeak_value(w);
+        // See Note [Concurrent read barrier on deRefWeak#] in NonMovingMark.c
+        updateRemembSetPushPtr(val);
     } else {
         code = 0;
         val = w;
@@ -1515,7 +1566,7 @@ stg_takeMVarzh ( P_ mvar /* :: MVar a */ )
      */
     if (StgMVar_value(mvar) == stg_END_TSO_QUEUE_closure) {
         if (info == stg_MVAR_CLEAN_info) {
-            ccall dirty_MVAR(BaseReg "ptr", mvar "ptr");
+            ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", StgMVar_value(mvar) "ptr");
         }
 
         // We want to put the heap check down here in the slow path,
@@ -1561,6 +1612,9 @@ loop:
         // If the MVar is not already dirty, then we don't need to make
         // it dirty, as it is empty with nothing blocking on it.
         unlockClosure(mvar, info);
+        // However, we do need to ensure that the nonmoving collector
+        // knows about the reference to the value that we just removed...
+        updateRemembSetPushPtr(val);
         return (val);
     }
     qinfo = StgHeader_info(q);
@@ -1574,7 +1628,7 @@ loop:
     // There are putMVar(s) waiting... wake up the first thread on the queue
 
     if (info == stg_MVAR_CLEAN_info) {
-        ccall dirty_MVAR(BaseReg "ptr", mvar "ptr");
+        ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", val "ptr");
     }
 
     tso = StgMVarTSOQueue_tso(q);
@@ -1643,7 +1697,7 @@ loop:
     // There are putMVar(s) waiting... wake up the first thread on the queue
 
     if (info == stg_MVAR_CLEAN_info) {
-        ccall dirty_MVAR(BaseReg "ptr", mvar "ptr");
+        ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", val "ptr");
     }
 
     tso = StgMVarTSOQueue_tso(q);
@@ -1681,7 +1735,7 @@ stg_putMVarzh ( P_ mvar, /* :: MVar a */
     if (StgMVar_value(mvar) != stg_END_TSO_QUEUE_closure) {
 
         if (info == stg_MVAR_CLEAN_info) {
-            ccall dirty_MVAR(BaseReg "ptr", mvar "ptr");
+            ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", StgMVar_value(mvar) "ptr");
         }
 
         // We want to put the heap check down here in the slow path,
@@ -1715,14 +1769,20 @@ stg_putMVarzh ( P_ mvar, /* :: MVar a */
         jump stg_block_putmvar(mvar,val);
     }
 
+    // We are going to mutate the closure, make sure its current pointers
+    // are marked.
+    if (info == stg_MVAR_CLEAN_info) {
+        ccall update_MVAR(BaseReg "ptr", mvar "ptr", StgMVar_value(mvar) "ptr");
+    }
+
     q = StgMVar_head(mvar);
 loop:
     if (q == stg_END_TSO_QUEUE_closure) {
         /* No further takes, the MVar is now full. */
+        StgMVar_value(mvar) = val;
         if (info == stg_MVAR_CLEAN_info) {
-            ccall dirty_MVAR(BaseReg "ptr", mvar "ptr");
+            ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", StgMVar_value(mvar) "ptr");
         }
-        StgMVar_value(mvar) = val;
         unlockClosure(mvar, stg_MVAR_DIRTY_info);
         return ();
     }
@@ -1758,7 +1818,7 @@ loop:
     // indicate that the MVar operation has now completed.
     StgTSO__link(tso) = stg_END_TSO_QUEUE_closure;
 
-    if (TO_W_(StgStack_dirty(stack)) == 0) {
+    if ((TO_W_(StgStack_dirty(stack)) & STACK_DIRTY) == 0) {
         ccall dirty_STACK(MyCapability() "ptr", stack "ptr");
     }
 
@@ -1804,7 +1864,7 @@ loop:
     if (q == stg_END_TSO_QUEUE_closure) {
         /* No further takes, the MVar is now full. */
         if (info == stg_MVAR_CLEAN_info) {
-            ccall dirty_MVAR(BaseReg "ptr", mvar "ptr");
+            ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", StgMVar_value(mvar) "ptr");
         }
 
         StgMVar_value(mvar) = val;
@@ -1843,7 +1903,7 @@ loop:
     // indicate that the MVar operation has now completed.
     StgTSO__link(tso) = stg_END_TSO_QUEUE_closure;
 
-    if (TO_W_(StgStack_dirty(stack)) == 0) {
+    if ((TO_W_(StgStack_dirty(stack)) & STACK_DIRTY) == 0) {
         ccall dirty_STACK(MyCapability() "ptr", stack "ptr");
     }
 
@@ -1875,7 +1935,7 @@ stg_readMVarzh ( P_ mvar, /* :: MVar a */ )
     if (StgMVar_value(mvar) == stg_END_TSO_QUEUE_closure) {
 
         if (info == stg_MVAR_CLEAN_info) {
-            ccall dirty_MVAR(BaseReg "ptr", mvar "ptr");
+            ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", StgMVar_value(mvar));
         }
 
         ALLOC_PRIM_WITH_CUSTOM_FAILURE
diff --git a/rts/RaiseAsync.c b/rts/RaiseAsync.c
index 807c3e3d30a2783c3234ab758312980674947e7f..50cddff0513458179d238e1141d6774f200f7f5b 100644
--- a/rts/RaiseAsync.c
+++ b/rts/RaiseAsync.c
@@ -515,9 +515,9 @@ blockedThrowTo (Capability *cap, StgTSO *target, MessageThrowTo *msg)
 
     ASSERT(target->cap == cap);
 
+    dirty_TSO(cap,target); // we will modify the blocked_exceptions queue
     msg->link = target->blocked_exceptions;
     target->blocked_exceptions = msg;
-    dirty_TSO(cap,target); // we modified the blocked_exceptions queue
 }
 
 /* -----------------------------------------------------------------------------
diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c
index d36e9ffc6626d71a7517bb8fbbd2d54556c9504c..0e28b980ac6ee4174809ebfc7dbfe0f1a9ccaad6 100644
--- a/rts/RtsFlags.c
+++ b/rts/RtsFlags.c
@@ -156,6 +156,8 @@ void initRtsFlagsDefaults(void)
     RtsFlags.GcFlags.heapSizeSuggestionAuto = false;
     RtsFlags.GcFlags.pcFreeHeap         = 3;    /* 3% */
     RtsFlags.GcFlags.oldGenFactor       = 2;
+    RtsFlags.GcFlags.useNonmoving       = false;
+    RtsFlags.GcFlags.nonmovingSelectorOpt = false;
     RtsFlags.GcFlags.generations        = 2;
     RtsFlags.GcFlags.squeezeUpdFrames   = true;
     RtsFlags.GcFlags.compact            = false;
@@ -179,6 +181,7 @@ void initRtsFlagsDefaults(void)
     RtsFlags.DebugFlags.weak            = false;
     RtsFlags.DebugFlags.gccafs          = false;
     RtsFlags.DebugFlags.gc              = false;
+    RtsFlags.DebugFlags.nonmoving_gc    = false;
     RtsFlags.DebugFlags.block_alloc     = false;
     RtsFlags.DebugFlags.sanity          = false;
     RtsFlags.DebugFlags.zero_on_gc      = false;
@@ -220,6 +223,7 @@ void initRtsFlagsDefaults(void)
     RtsFlags.TraceFlags.timestamp     = false;
     RtsFlags.TraceFlags.scheduler     = false;
     RtsFlags.TraceFlags.gc            = false;
+    RtsFlags.TraceFlags.nonmoving_gc  = false;
     RtsFlags.TraceFlags.sparks_sampled= false;
     RtsFlags.TraceFlags.sparks_full   = false;
     RtsFlags.TraceFlags.user          = false;
@@ -299,6 +303,7 @@ usage_text[] = {
 "  -xb<addr> Sets the address from which a suitable start for the heap memory",
 "            will be searched from. This is useful if the default address",
 "            clashes with some third-party library.",
+"  -xn       Use the non-moving collector for the old generation.",
 "  -m<n>     Minimum % of heap which must be available (default 3%)",
 "  -G<n>     Number of generations (default: 2)",
 "  -c<n>     Use in-place compaction instead of copying in the oldest generation",
@@ -404,6 +409,7 @@ usage_text[] = {
 "  -Dw  DEBUG: weak",
 "  -DG  DEBUG: gccafs",
 "  -Dg  DEBUG: gc",
+"  -Dn  DEBUG: non-moving gc",
 "  -Db  DEBUG: block",
 "  -DS  DEBUG: sanity",
 "  -DZ  DEBUG: zero freed memory during GC",
@@ -1533,6 +1539,16 @@ error = true;
                     break;
 #endif
 
+                case 'n':
+                    OPTION_SAFE;
+                    RtsFlags.GcFlags.useNonmoving = true;
+                    unchecked_arg_start++;
+                    if (rts_argv[arg][3] == 's') {
+                        RtsFlags.GcFlags.nonmovingSelectorOpt = true;
+                        unchecked_arg_start++;
+                    }
+                    break;
+
                 case 'c': /* Debugging tool: show current cost centre on
                            an exception */
                     OPTION_SAFE;
@@ -1706,6 +1722,16 @@ static void normaliseRtsOpts (void)
     if (RtsFlags.MiscFlags.generate_dump_file) {
         RtsFlags.MiscFlags.install_seh_handlers = true;
     }
+
+    if (RtsFlags.GcFlags.useNonmoving && RtsFlags.GcFlags.generations == 1) {
+        barf("The non-moving collector doesn't support -G1");
+    }
+
+    if (RtsFlags.GcFlags.compact && RtsFlags.GcFlags.useNonmoving) {
+        errorBelch("The non-moving collector cannot be used in conjunction with\n"
+                   "the compacting collector.");
+        errorUsage();
+    }
 }
 
 static void errorUsage (void)
@@ -1871,6 +1897,9 @@ static void read_debug_flags(const char* arg)
         case 'g':
             RtsFlags.DebugFlags.gc = true;
             break;
+        case 'n':
+            RtsFlags.DebugFlags.nonmoving_gc = true;
+            break;
         case 'b':
             RtsFlags.DebugFlags.block_alloc = true;
             break;
@@ -2108,6 +2137,10 @@ static void read_trace_flags(const char *arg)
             RtsFlags.TraceFlags.gc        = enabled;
             enabled = true;
             break;
+        case 'n':
+            RtsFlags.TraceFlags.nonmoving_gc = enabled;
+            enabled = true;
+            break;
         case 'u':
             RtsFlags.TraceFlags.user      = enabled;
             enabled = true;
diff --git a/rts/RtsStartup.c b/rts/RtsStartup.c
index a202d5396068f5356b299551960d34cc5a2ec6e3..d0d08a249594394e3fdc505f93f02d84be82ce82 100644
--- a/rts/RtsStartup.c
+++ b/rts/RtsStartup.c
@@ -392,7 +392,8 @@ hs_exit_(bool wait_foreign)
     ioManagerDie();
 #endif
 
-    /* stop all running tasks */
+    /* stop all running tasks. This is also where we stop concurrent non-moving
+     * collection if it's running */
     exitScheduler(wait_foreign);
 
     /* run C finalizers for all active weak pointers */
diff --git a/rts/RtsSymbols.c b/rts/RtsSymbols.c
index 4da4258e9525cd4b747d8e14df17258c50984446..0611de11cca41c3e0cfa78dee5f12fb1f706e267 100644
--- a/rts/RtsSymbols.c
+++ b/rts/RtsSymbols.c
@@ -14,6 +14,7 @@
 #include "HsFFI.h"
 
 #include "sm/Storage.h"
+#include "sm/NonMovingMark.h"
 #include <stdbool.h>
 
 #if !defined(mingw32_HOST_OS)
@@ -716,6 +717,9 @@
       SymI_HasProto(stg_shrinkMutableByteArrayzh)                       \
       SymI_HasProto(stg_resizzeMutableByteArrayzh)                      \
       SymI_HasProto(newSpark)                                           \
+      SymI_HasProto(updateRemembSetPushThunk)                             \
+      SymI_HasProto(updateRemembSetPushThunk_)                            \
+      SymI_HasProto(updateRemembSetPushClosure_)                          \
       SymI_HasProto(performGC)                                          \
       SymI_HasProto(performMajorGC)                                     \
       SymI_HasProto(prog_argc)                                          \
@@ -1071,6 +1075,7 @@ RtsSymbolVal rtsSyms[] = {
       RTS_OPENBSD_ONLY_SYMBOLS
       RTS_LIBGCC_SYMBOLS
       RTS_LIBFFI_SYMBOLS
+      SymI_HasDataProto(nonmoving_write_barrier_enabled)
 #if defined(darwin_HOST_OS) && defined(i386_HOST_ARCH)
       // dyld stub code contains references to this,
       // but it should never be called because we treat
diff --git a/rts/STM.c b/rts/STM.c
index dc0b0ebb7876672e1de27abf07f66288ba9b72af..1dde70b485f903c75d8c6307b5c9da4c4ae72b72 100644
--- a/rts/STM.c
+++ b/rts/STM.c
@@ -182,7 +182,8 @@ static void unlock_stm(StgTRecHeader *trec STG_UNUSED) {
   TRACE("%p : unlock_stm()", trec);
 }
 
-static StgClosure *lock_tvar(StgTRecHeader *trec STG_UNUSED,
+static StgClosure *lock_tvar(Capability *cap STG_UNUSED,
+                             StgTRecHeader *trec STG_UNUSED,
                              StgTVar *s STG_UNUSED) {
   StgClosure *result;
   TRACE("%p : lock_tvar(%p)", trec, s);
@@ -197,12 +198,14 @@ static void unlock_tvar(Capability *cap,
                         StgBool force_update) {
   TRACE("%p : unlock_tvar(%p)", trec, s);
   if (force_update) {
+    StgClosure *old_value = s -> current_value;
     s -> current_value = c;
-    dirty_TVAR(cap,s);
+    dirty_TVAR(cap, s, old_value);
   }
 }
 
-static StgBool cond_lock_tvar(StgTRecHeader *trec STG_UNUSED,
+static StgBool cond_lock_tvar(Capability *cap STG_UNUSED,
+                              StgTRecHeader *trec STG_UNUSED,
                               StgTVar *s STG_UNUSED,
                               StgClosure *expected) {
   StgClosure *result;
@@ -231,7 +234,8 @@ static void unlock_stm(StgTRecHeader *trec STG_UNUSED) {
   smp_locked = 0;
 }
 
-static StgClosure *lock_tvar(StgTRecHeader *trec STG_UNUSED,
+static StgClosure *lock_tvar(Capability *cap STG_UNUSED,
+                             StgTRecHeader *trec STG_UNUSED,
                              StgTVar *s STG_UNUSED) {
   StgClosure *result;
   TRACE("%p : lock_tvar(%p)", trec, s);
@@ -248,12 +252,14 @@ static void *unlock_tvar(Capability *cap,
   TRACE("%p : unlock_tvar(%p, %p)", trec, s, c);
   ASSERT(smp_locked == trec);
   if (force_update) {
+    StgClosure *old_value = s -> current_value;
     s -> current_value = c;
-    dirty_TVAR(cap,s);
+    dirty_TVAR(cap, s, old_value);
   }
 }
 
-static StgBool cond_lock_tvar(StgTRecHeader *trec STG_UNUSED,
+static StgBool cond_lock_tvar(Capability *cap STG_UNUSED,
+                              StgTRecHeader *trec STG_UNUSED,
                                StgTVar *s STG_UNUSED,
                                StgClosure *expected) {
   StgClosure *result;
@@ -279,7 +285,8 @@ static void unlock_stm(StgTRecHeader *trec STG_UNUSED) {
   TRACE("%p : unlock_stm()", trec);
 }
 
-static StgClosure *lock_tvar(StgTRecHeader *trec,
+static StgClosure *lock_tvar(Capability *cap,
+                             StgTRecHeader *trec,
                              StgTVar *s STG_UNUSED) {
   StgClosure *result;
   TRACE("%p : lock_tvar(%p)", trec, s);
@@ -289,6 +296,12 @@ static StgClosure *lock_tvar(StgTRecHeader *trec,
     } while (GET_INFO(UNTAG_CLOSURE(result)) == &stg_TREC_HEADER_info);
   } while (cas((void *)&(s -> current_value),
                (StgWord)result, (StgWord)trec) != (StgWord)result);
+
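+  // lock_tvar has just overwritten s -> current_value (the old value is in
+  // `result`), so push the old value to the update remembered set to ensure
+  // the concurrent mark does not miss it.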
+  IF_NONMOVING_WRITE_BARRIER_ENABLED {
+      if (result)
+          updateRemembSetPushClosure(cap, result);
+  }
   return result;
 }
 
@@ -300,10 +313,11 @@ static void unlock_tvar(Capability *cap,
   TRACE("%p : unlock_tvar(%p, %p)", trec, s, c);
   ASSERT(s -> current_value == (StgClosure *)trec);
   s -> current_value = c;
-  dirty_TVAR(cap,s);
+  dirty_TVAR(cap, s, (StgClosure *) trec);
 }
 
-static StgBool cond_lock_tvar(StgTRecHeader *trec,
+static StgBool cond_lock_tvar(Capability *cap,
+                              StgTRecHeader *trec,
                               StgTVar *s,
                               StgClosure *expected) {
   StgClosure *result;
@@ -311,6 +325,10 @@ static StgBool cond_lock_tvar(StgTRecHeader *trec,
   TRACE("%p : cond_lock_tvar(%p, %p)", trec, s, expected);
   w = cas((void *)&(s -> current_value), (StgWord)expected, (StgWord)trec);
   result = (StgClosure *)w;
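+  // If the CAS succeeded we have just overwritten `expected` with the TRec,
+  // so push the old value to the update remembered set. Pushing it when the
+  // CAS failed is unnecessary but harmless.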
+  IF_NONMOVING_WRITE_BARRIER_ENABLED {
+      if (result)
+          updateRemembSetPushClosure(cap, expected);
+  }
   TRACE("%p : %s", trec, result ? "success" : "failure");
   return (result == expected);
 }
@@ -525,7 +543,7 @@ static void build_watch_queue_entries_for_trec(Capability *cap,
     }
     s -> first_watch_queue_entry = q;
     e -> new_value = (StgClosure *) q;
-    dirty_TVAR(cap,s); // we modified first_watch_queue_entry
+    dirty_TVAR(cap, s, (StgClosure *) fq); // we modified first_watch_queue_entry
   });
 }
 
@@ -545,7 +563,7 @@ static void remove_watch_queue_entries_for_trec(Capability *cap,
     StgTVarWatchQueue *q;
     StgClosure *saw;
     s = e -> tvar;
-    saw = lock_tvar(trec, s);
+    saw = lock_tvar(cap, trec, s);
     q = (StgTVarWatchQueue *) (e -> new_value);
     TRACE("%p : removing tso=%p from watch queue for tvar=%p",
           trec,
@@ -562,7 +580,7 @@ static void remove_watch_queue_entries_for_trec(Capability *cap,
     } else {
       ASSERT(s -> first_watch_queue_entry == q);
       s -> first_watch_queue_entry = nq;
-      dirty_TVAR(cap,s); // we modified first_watch_queue_entry
+      dirty_TVAR(cap, s, (StgClosure *) q); // we modified first_watch_queue_entry
     }
     free_stg_tvar_watch_queue(cap, q);
     unlock_tvar(cap, trec, s, saw, false);
@@ -773,7 +791,7 @@ static StgBool validate_and_acquire_ownership (Capability *cap,
       s = e -> tvar;
       if (acquire_all || entry_is_update(e)) {
         TRACE("%p : trying to acquire %p", trec, s);
-        if (!cond_lock_tvar(trec, s, e -> expected_value)) {
+        if (!cond_lock_tvar(cap, trec, s, e -> expected_value)) {
           TRACE("%p : failed to acquire %p", trec, s);
           result = false;
           BREAK_FOR_EACH;
diff --git a/rts/Schedule.c b/rts/Schedule.c
index eced4d4fb6484a247cf866de2d226eecbfeebeba..9323915dfefdc2bf939d28d4c7a045fa74eb16d9 100644
--- a/rts/Schedule.c
+++ b/rts/Schedule.c
@@ -44,6 +44,8 @@
 #include "StablePtr.h"
 #include "StableName.h"
 #include "TopHandler.h"
+#include "sm/NonMoving.h"
+#include "sm/NonMovingMark.h"
 
 #if defined(HAVE_SYS_TYPES_H)
 #include <sys/types.h>
@@ -110,6 +112,19 @@ Mutex sched_mutex;
 #define FORKPROCESS_PRIMOP_SUPPORTED
 #endif
 
+/*
+ * sync_finished_cond allows threads which do not own any capability (e.g. the
+ * concurrent mark thread) to participate in the sync protocol. In particular,
+ * if such a thread requests a sync while sync is already in progress it will
+ * block on sync_finished_cond, which will be signalled when the sync is
+ * finished (by releaseAllCapabilities).
+ */
+#if defined(THREADED_RTS)
+static Condition sync_finished_cond;
+static Mutex sync_finished_mutex;
+#endif
+
 /* -----------------------------------------------------------------------------
  * static function prototypes
  * -------------------------------------------------------------------------- */
@@ -130,7 +145,6 @@ static void scheduleYield (Capability **pcap, Task *task);
 static bool requestSync (Capability **pcap, Task *task,
                          PendingSync *sync_type, SyncType *prev_sync_type);
 static void acquireAllCapabilities(Capability *cap, Task *task);
-static void releaseAllCapabilities(uint32_t n, Capability *cap, Task *task);
 static void startWorkerTasks (uint32_t from USED_IF_THREADS,
                               uint32_t to USED_IF_THREADS);
 #endif
@@ -150,7 +164,8 @@ static void scheduleHandleThreadBlocked( StgTSO *t );
 static bool scheduleHandleThreadFinished( Capability *cap, Task *task,
                                           StgTSO *t );
 static bool scheduleNeedHeapProfile(bool ready_to_gc);
-static void scheduleDoGC(Capability **pcap, Task *task, bool force_major);
+static void scheduleDoGC( Capability **pcap, Task *task,
+                          bool force_major, bool deadlock_detect );
 
 static void deleteThread (StgTSO *tso);
 static void deleteAllThreads (void);
@@ -250,7 +265,7 @@ schedule (Capability *initialCapability, Task *task)
     case SCHED_INTERRUPTING:
         debugTrace(DEBUG_sched, "SCHED_INTERRUPTING");
         /* scheduleDoGC() deletes all the threads */
-        scheduleDoGC(&cap,task,true);
+        scheduleDoGC(&cap,task,true,false);
 
         // after scheduleDoGC(), we must be shutting down.  Either some
         // other Capability did the final GC, or we did it above,
@@ -547,7 +562,7 @@ run_thread:
     }
 
     if (ready_to_gc || scheduleNeedHeapProfile(ready_to_gc)) {
-      scheduleDoGC(&cap,task,false);
+      scheduleDoGC(&cap,task,false,false);
     }
   } /* end of while() */
 }
@@ -921,7 +936,7 @@ scheduleDetectDeadlock (Capability **pcap, Task *task)
         // they are unreachable and will therefore be sent an
         // exception.  Any threads thus released will be immediately
         // runnable.
-        scheduleDoGC (pcap, task, true/*force major GC*/);
+        scheduleDoGC (pcap, task, true/*force major GC*/, true/*deadlock detection*/);
         cap = *pcap;
         // when force_major == true. scheduleDoGC sets
         // recent_activity to ACTIVITY_DONE_GC and turns off the timer
@@ -991,7 +1006,7 @@ scheduleProcessInbox (Capability **pcap USED_IF_THREADS)
     while (!emptyInbox(cap)) {
         // Executing messages might use heap, so we should check for GC.
         if (doYouWantToGC(cap)) {
-            scheduleDoGC(pcap, cap->running_task, false);
+            scheduleDoGC(pcap, cap->running_task, false, false);
             cap = *pcap;
         }
 
@@ -1368,17 +1383,24 @@ scheduleNeedHeapProfile( bool ready_to_gc )
  * change to the system, such as altering the number of capabilities, or
  * forking.
  *
+ * pCap may be NULL in the event that the caller doesn't yet own a capability.
+ *
  * To resume after stopAllCapabilities(), use releaseAllCapabilities().
  * -------------------------------------------------------------------------- */
 
 #if defined(THREADED_RTS)
-static void stopAllCapabilities (Capability **pCap, Task *task)
+void stopAllCapabilities (Capability **pCap, Task *task)
+{
+    stopAllCapabilitiesWith(pCap, task, SYNC_OTHER);
+}
+
+void stopAllCapabilitiesWith (Capability **pCap, Task *task, SyncType sync_type)
 {
     bool was_syncing;
     SyncType prev_sync_type;
 
     PendingSync sync = {
-        .type = SYNC_OTHER,
+        .type = sync_type,
         .idle = NULL,
         .task = task
     };
@@ -1387,9 +1409,10 @@ static void stopAllCapabilities (Capability **pCap, Task *task)
         was_syncing = requestSync(pCap, task, &sync, &prev_sync_type);
     } while (was_syncing);
 
-    acquireAllCapabilities(*pCap,task);
+    acquireAllCapabilities(pCap ? *pCap : NULL, task);
 
     pending_sync = 0;
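+    // Wake any thread blocked in requestSync without a capability (e.g. the
+    // concurrent mark thread) that is waiting for pending_sync to clear.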
+    signalCondition(&sync_finished_cond);
 }
 #endif
 
@@ -1400,6 +1423,16 @@ static void stopAllCapabilities (Capability **pCap, Task *task)
  * directly, instead use stopAllCapabilities().  This is used by the GC, which
  * has some special synchronisation requirements.
  *
+ * Note that this can be called in two ways:
+ *
+ * - where *pcap points to a capability owned by the caller: in this case
+ *   *prev_sync_type will reflect the in-progress sync type on return, if one
+ *   was found.
+ *
+ * - where pcap == NULL: in this case the caller doesn't hold a capability.
+ *   We only return whether or not a pending sync was found, and
+ *   prev_sync_type is unchanged.
+ *
  * Returns:
  *    false if we successfully got a sync
  *    true  if there was another sync request in progress,
@@ -1424,13 +1457,25 @@ static bool requestSync (
         // After the sync is completed, we cannot read that struct any
         // more because it has been freed.
         *prev_sync_type = sync->type;
-        do {
-            debugTrace(DEBUG_sched, "someone else is trying to sync (%d)...",
-                       sync->type);
-            ASSERT(*pcap);
-            yieldCapability(pcap,task,true);
-            sync = pending_sync;
-        } while (sync != NULL);
+        if (pcap == NULL) {
+            // The caller does not hold a capability (e.g. it may be the
+            // concurrent mark thread). Consequently we must wait on
+            // sync_finished_cond until the pending sync has finished before
+            // proceeding, to ensure that we don't loop.
+            ACQUIRE_LOCK(&sync_finished_mutex);
+            while (pending_sync) {
+                waitCondition(&sync_finished_cond, &sync_finished_mutex);
+            }
+            RELEASE_LOCK(&sync_finished_mutex);
+        } else {
+            do {
+                debugTrace(DEBUG_sched, "someone else is trying to sync (%d)...",
+                          sync->type);
+                ASSERT(*pcap);
+                yieldCapability(pcap,task,true);
+                sync = pending_sync;
+            } while (sync != NULL);
+        }
 
         // NOTE: task->cap might have changed now
         return true;
@@ -1445,9 +1490,9 @@ static bool requestSync (
 /* -----------------------------------------------------------------------------
  * acquireAllCapabilities()
  *
- * Grab all the capabilities except the one we already hold.  Used
- * when synchronising before a single-threaded GC (SYNC_SEQ_GC), and
- * before a fork (SYNC_OTHER).
+ * Grab all the capabilities except the one we already hold (cap may be NULL if
+ * the caller does not currently hold a capability). Used when synchronising
+ * before a single-threaded GC (SYNC_SEQ_GC), and before a fork (SYNC_OTHER).
  *
  * Only call this after requestSync(), otherwise a deadlock might
  * ensue if another thread is trying to synchronise.
@@ -1477,29 +1522,30 @@ static void acquireAllCapabilities(Capability *cap, Task *task)
             }
         }
     }
-    task->cap = cap;
+    task->cap = cap == NULL ? tmpcap : cap;
 }
 #endif
 
 /* -----------------------------------------------------------------------------
- * releaseAllcapabilities()
+ * releaseAllCapabilities()
  *
- * Assuming this thread holds all the capabilities, release them all except for
- * the one passed in as cap.
+ * Assuming this thread holds all the capabilities, release them all (except for
+ * the one passed in as keep_cap, if non-NULL).
  * -------------------------------------------------------------------------- */
 
 #if defined(THREADED_RTS)
-static void releaseAllCapabilities(uint32_t n, Capability *cap, Task *task)
+void releaseAllCapabilities(uint32_t n, Capability *keep_cap, Task *task)
 {
     uint32_t i;
 
     for (i = 0; i < n; i++) {
-        if (cap->no != i) {
-            task->cap = capabilities[i];
-            releaseCapability(capabilities[i]);
+        Capability *tmpcap = capabilities[i];
+        if (keep_cap != tmpcap) {
+            task->cap = tmpcap;
+            releaseCapability(tmpcap);
         }
     }
-    task->cap = cap;
+    task->cap = keep_cap;
 }
 #endif
 
@@ -1507,9 +1553,11 @@ static void releaseAllCapabilities(uint32_t n, Capability *cap, Task *task)
  * Perform a garbage collection if necessary
  * -------------------------------------------------------------------------- */
 
+// N.B. See Note [Deadlock detection under nonmoving collector] for rationale
+// behind deadlock_detect argument.
 static void
 scheduleDoGC (Capability **pcap, Task *task USED_IF_THREADS,
-              bool force_major)
+              bool force_major, bool deadlock_detect)
 {
     Capability *cap = *pcap;
     bool heap_census;
@@ -1801,9 +1849,10 @@ delete_threads_and_gc:
     // reset pending_sync *before* GC, so that when the GC threads
     // emerge they don't immediately re-enter the GC.
     pending_sync = 0;
-    GarbageCollect(collect_gen, heap_census, gc_type, cap, idle_cap);
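+    // Also wake any thread blocked in requestSync waiting for pending_sync to
+    // clear (see stopAllCapabilitiesWith).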
+    signalCondition(&sync_finished_cond);
+    GarbageCollect(collect_gen, heap_census, deadlock_detect, gc_type, cap, idle_cap);
 #else
-    GarbageCollect(collect_gen, heap_census, 0, cap, NULL);
+    GarbageCollect(collect_gen, heap_census, deadlock_detect, 0, cap, NULL);
 #endif
 
     // If we're shutting down, don't leave any idle GC work to do.
@@ -2453,7 +2502,11 @@ resumeThread (void *task_)
     tso = incall->suspended_tso;
     incall->suspended_tso = NULL;
     incall->suspended_cap = NULL;
-    tso->_link = END_TSO_QUEUE; // no write barrier reqd
+    // we will modify tso->_link
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {
+        updateRemembSetPushClosure(cap, (StgClosure *)tso->_link);
+    }
+    tso->_link = END_TSO_QUEUE;
 
     traceEventRunThread(cap, tso);
 
@@ -2627,6 +2680,8 @@ initScheduler(void)
   /* Initialise the mutex and condition variables used by
    * the scheduler. */
   initMutex(&sched_mutex);
+  initMutex(&sync_finished_mutex);
+  initCondition(&sync_finished_cond);
 #endif
 
   ACQUIRE_LOCK(&sched_mutex);
@@ -2662,9 +2717,10 @@ exitScheduler (bool wait_foreign USED_IF_THREADS)
     // If we haven't killed all the threads yet, do it now.
     if (sched_state < SCHED_SHUTTING_DOWN) {
         sched_state = SCHED_INTERRUPTING;
+        nonmovingStop();
         Capability *cap = task->cap;
         waitForCapability(&cap,task);
-        scheduleDoGC(&cap,task,true);
+        scheduleDoGC(&cap,task,true,false);
         ASSERT(task->incall->tso == NULL);
         releaseCapability(cap);
     }
@@ -2732,7 +2788,7 @@ performGC_(bool force_major)
     // TODO: do we need to traceTask*() here?
 
     waitForCapability(&cap,task);
-    scheduleDoGC(&cap,task,force_major);
+    scheduleDoGC(&cap,task,force_major,false);
     releaseCapability(cap);
     boundTaskExiting(task);
 }
diff --git a/rts/Schedule.h b/rts/Schedule.h
index 31979800411eb7f0c02d28d4b9b5ea1422e09218..6434515604e22be28be314850fc51d734fb90c5b 100644
--- a/rts/Schedule.h
+++ b/rts/Schedule.h
@@ -52,6 +52,12 @@ StgWord findAtomicallyFrameHelper (Capability *cap, StgTSO *tso);
 /* Entry point for a new worker */
 void scheduleWorker (Capability *cap, Task *task);
 
+#if defined(THREADED_RTS)
+void stopAllCapabilitiesWith (Capability **pCap, Task *task, SyncType sync_type);
+void stopAllCapabilities (Capability **pCap, Task *task);
+void releaseAllCapabilities(uint32_t n, Capability *keep_cap, Task *task);
+#endif
+
 /* The state of the scheduler.  This is used to control the sequence
  * of events during shutdown.  See Note [shutdown] in Schedule.c.
  */
diff --git a/rts/StableName.c b/rts/StableName.c
index 757eb59180e72cebc1030e24725fbf72a4eae5ab..4b26fee396d31a1e8ff9ecf05fc28721ec906d9f 100644
--- a/rts/StableName.c
+++ b/rts/StableName.c
@@ -21,7 +21,7 @@
 
 snEntry *stable_name_table = NULL;
 static snEntry *stable_name_free = NULL;
-static unsigned int SNT_size = 0;
+unsigned int SNT_size = 0;
 #define INIT_SNT_SIZE 64
 
 #if defined(THREADED_RTS)
@@ -128,7 +128,7 @@ exitStableNameTable(void)
 #endif
 }
 
-STATIC_INLINE void
+void
 freeSnEntry(snEntry *sn)
 {
   ASSERT(sn->sn_obj == NULL);
@@ -218,27 +218,6 @@ lookupStableName (StgPtr p)
  * Remember old stable name addresses
  * -------------------------------------------------------------------------- */
 
-#define FOR_EACH_STABLE_NAME(p, CODE)                                   \
-    do {                                                                \
-        snEntry *p;                                                     \
-        snEntry *__end_ptr = &stable_name_table[SNT_size];              \
-        for (p = stable_name_table + 1; p < __end_ptr; p++) {           \
-            /* Internal pointers are free slots.  */                    \
-            /* If p->addr == NULL, it's a */                            \
-            /* stable name where the object has been GC'd, but the */   \
-            /* StableName object (sn_obj) is still alive. */            \
-            if ((p->addr < (P_)stable_name_table ||                     \
-                 p->addr >= (P_)__end_ptr))                             \
-            {                                                           \
-                /* NOTE: There is an ambiguity here if p->addr == NULL */ \
-                /* it is either the last item in the free list or it */ \
-                /* is a stable name whose pointee died. sn_obj == NULL */ \
-                /* disambiguates as last free list item. */             \
-                do { CODE } while(0);                                   \
-            }                                                           \
-        }                                                               \
-    } while(0)
-
 void
 rememberOldStableNameAddresses(void)
 {
@@ -284,6 +263,9 @@ threadStableNameTable( evac_fn evac, void *user )
 void
 gcStableNameTable( void )
 {
+    // We must take the stable name lock lest we race with the nonmoving
+    // collector (namely nonmovingSweepStableNameTable).
+    stableNameLock();
     FOR_EACH_STABLE_NAME(
         p, {
             // FOR_EACH_STABLE_NAME traverses free entries too, so
@@ -307,6 +289,7 @@ gcStableNameTable( void )
                 }
             }
         });
+    stableNameUnlock();
 }
 
 /* -----------------------------------------------------------------------------
diff --git a/rts/StableName.h b/rts/StableName.h
index 6b5e551add3cd20d6947b60504aa9f6d7dbb88c6..e5903bb3b5e87a803a6dc4d599b4221eb5fb7f59 100644
--- a/rts/StableName.h
+++ b/rts/StableName.h
@@ -11,7 +11,8 @@
 #include "BeginPrivate.h"
 
 void    initStableNameTable   ( void );
-void    exitStableNameTable      ( void );
+void    freeSnEntry           ( snEntry *sn );
+void    exitStableNameTable   ( void );
 StgWord lookupStableName      ( StgPtr p );
 
 void    rememberOldStableNameAddresses ( void );
@@ -23,6 +24,29 @@ void    updateStableNameTable ( bool full );
 void    stableNameLock            ( void );
 void    stableNameUnlock          ( void );
 
+extern unsigned int SNT_size;
+
+#define FOR_EACH_STABLE_NAME(p, CODE)                                   \
+    do {                                                                \
+        snEntry *p;                                                     \
+        snEntry *__end_ptr = &stable_name_table[SNT_size];              \
+        for (p = stable_name_table + 1; p < __end_ptr; p++) {           \
+            /* Internal pointers are free slots.  */                    \
+            /* If p->addr == NULL, it's a */                            \
+            /* stable name where the object has been GC'd, but the */   \
+            /* StableName object (sn_obj) is still alive. */            \
+            if ((p->addr < (P_)stable_name_table ||                     \
+                 p->addr >= (P_)__end_ptr))                             \
+            {                                                           \
+                /* NOTE: There is an ambiguity here if p->addr == NULL */ \
+                /* it is either the last item in the free list or it */ \
+                /* is a stable name whose pointee died. sn_obj == NULL */ \
+                /* disambiguates as last free list item. */             \
+                do { CODE } while(0);                                   \
+            }                                                           \
+        }                                                               \
+    } while(0)
+
 #if defined(THREADED_RTS)
 // needed by Schedule.c:forkProcess()
 extern Mutex stable_name_mutex;
diff --git a/rts/ThreadPaused.c b/rts/ThreadPaused.c
index cccc7ad0b0a241be52eba7f8fca97163b80a5e41..83c621e386baf72090f5a196bbd9ee4a1756a77c 100644
--- a/rts/ThreadPaused.c
+++ b/rts/ThreadPaused.c
@@ -15,6 +15,7 @@
 #include "RaiseAsync.h"
 #include "Trace.h"
 #include "Threads.h"
+#include "sm/NonMovingMark.h"
 
 #include <string.h> // for memmove()
 
@@ -243,6 +244,9 @@ threadPaused(Capability *cap, StgTSO *tso)
 
             bh = ((StgUpdateFrame *)frame)->updatee;
             bh_info = bh->header.info;
+            IF_NONMOVING_WRITE_BARRIER_ENABLED {
+                updateRemembSetPushClosure(cap, (StgClosure *) bh);
+            }
 
 #if defined(THREADED_RTS)
         retry:
@@ -334,6 +338,18 @@ threadPaused(Capability *cap, StgTSO *tso)
             }
 #endif
 
+            IF_NONMOVING_WRITE_BARRIER_ENABLED {
+                if (ip_THUNK(INFO_PTR_TO_STRUCT(bh_info))) {
+                    // We are about to replace a thunk with a blackhole.
+                    // Add the free variables of the closure we are about to
+                    // overwrite to the update remembered set.
+                    // N.B. We caught the WHITEHOLE case above.
+                    updateRemembSetPushThunkEager(cap,
+                                                 THUNK_INFO_PTR_TO_STRUCT(bh_info),
+                                                 (StgThunk *) bh);
+                }
+            }
+
             // The payload of the BLACKHOLE points to the TSO
             ((StgInd *)bh)->indirectee = (StgClosure *)tso;
             write_barrier();
diff --git a/rts/Threads.c b/rts/Threads.c
index 2bdcea1c0056c0f96597b1d8d1abdce339321d1a..8334c5a5acc1e5d54366aa09aa8ce67f28524a35 100644
--- a/rts/Threads.c
+++ b/rts/Threads.c
@@ -85,7 +85,8 @@ createThread(Capability *cap, W_ size)
     SET_HDR(stack, &stg_STACK_info, cap->r.rCCCS);
     stack->stack_size   = stack_size - sizeofW(StgStack);
     stack->sp           = stack->stack + stack->stack_size;
-    stack->dirty        = 1;
+    stack->dirty        = STACK_DIRTY;
+    stack->marking      = 0;
 
     tso = (StgTSO *)allocate(cap, sizeofW(StgTSO));
     TICK_ALLOC_TSO();
@@ -611,6 +612,7 @@ threadStackOverflow (Capability *cap, StgTSO *tso)
     TICK_ALLOC_STACK(chunk_size);
 
     new_stack->dirty = 0; // begin clean, we'll mark it dirty below
+    new_stack->marking = 0;
     new_stack->stack_size = chunk_size - sizeofW(StgStack);
     new_stack->sp = new_stack->stack + new_stack->stack_size;
 
@@ -721,9 +723,17 @@ threadStackUnderflow (Capability *cap, StgTSO *tso)
             barf("threadStackUnderflow: not enough space for return values");
         }
 
-        new_stack->sp -= retvals;
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
+            // ensure that values that we copy into the new stack are marked
+            // for the nonmoving collector. Note that these values won't
+            // necessarily form a full closure so we need to handle them
+            // specially.
+            for (unsigned int i = 0; i < retvals; i++) {
+                updateRemembSetPushClosure(cap, (StgClosure *) old_stack->sp[i]);
+            }
+        }
 
-        memcpy(/* dest */ new_stack->sp,
+        memcpy(/* dest */ new_stack->sp - retvals,
                /* src  */ old_stack->sp,
                /* size */ retvals * sizeof(W_));
     }
@@ -735,8 +745,12 @@ threadStackUnderflow (Capability *cap, StgTSO *tso)
     // restore the stack parameters, and update tot_stack_size
     tso->tot_stack_size -= old_stack->stack_size;
 
-    // we're about to run it, better mark it dirty
+    // we're about to run it, better mark it dirty.
+    //
+    // N.B. the nonmoving collector may mark the stack, meaning that sp must
+    // point at a valid stack frame.
     dirty_STACK(cap, new_stack);
+    new_stack->sp -= retvals;
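+    // Only now do we expose the copied return values below sp; on their own
+    // they do not form a valid stack frame (see the memcpy above).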
 
     return retvals;
 }
@@ -768,7 +782,7 @@ loop:
     if (q == (StgMVarTSOQueue*)&stg_END_TSO_QUEUE_closure) {
         /* No further takes, the MVar is now full. */
         if (info == &stg_MVAR_CLEAN_info) {
-            dirty_MVAR(&cap->r, (StgClosure*)mvar);
+            dirty_MVAR(&cap->r, (StgClosure*)mvar, mvar->value);
         }
 
         mvar->value = value;
@@ -804,7 +818,7 @@ loop:
     // indicate that the MVar operation has now completed.
     tso->_link = (StgTSO*)&stg_END_TSO_QUEUE_closure;
 
-    if (stack->dirty == 0) {
+    if ((stack->dirty & STACK_DIRTY) == 0) {
         dirty_STACK(cap, stack);
     }
 
diff --git a/rts/Trace.c b/rts/Trace.c
index c8a951a51056ded6e4102ce248b35b12a96dd3c7..8e44716eb0e603abdc5f5f1e1afc571815700eaf 100644
--- a/rts/Trace.c
+++ b/rts/Trace.c
@@ -30,6 +30,7 @@
 // events
 int TRACE_sched;
 int TRACE_gc;
+int TRACE_nonmoving_gc;
 int TRACE_spark_sampled;
 int TRACE_spark_full;
 int TRACE_user;
@@ -72,6 +73,9 @@ void initTracing (void)
         RtsFlags.GcFlags.giveStats = COLLECT_GC_STATS;
     }
 
+    TRACE_nonmoving_gc =
+        RtsFlags.TraceFlags.nonmoving_gc;
+
     TRACE_spark_sampled =
         RtsFlags.TraceFlags.sparks_sampled;
 
@@ -818,6 +822,55 @@ void traceThreadLabel_(Capability *cap,
     }
 }
 
+void traceConcMarkBegin()
+{
+    if (eventlog_enabled)
+        postEventNoCap(EVENT_CONC_MARK_BEGIN);
+}
+
+void traceConcMarkEnd(StgWord32 marked_obj_count)
+{
+    if (eventlog_enabled)
+        postConcMarkEnd(marked_obj_count);
+}
+
+void traceConcSyncBegin()
+{
+    if (eventlog_enabled)
+        postEventNoCap(EVENT_CONC_SYNC_BEGIN);
+}
+
+void traceConcSyncEnd()
+{
+    if (eventlog_enabled)
+        postEventNoCap(EVENT_CONC_SYNC_END);
+}
+
+void traceConcSweepBegin()
+{
+    if (eventlog_enabled)
+        postEventNoCap(EVENT_CONC_SWEEP_BEGIN);
+}
+
+void traceConcSweepEnd()
+{
+    if (eventlog_enabled)
+        postEventNoCap(EVENT_CONC_SWEEP_END);
+}
+
+void traceConcUpdRemSetFlush(Capability *cap)
+{
+    if (eventlog_enabled)
+        postConcUpdRemSetFlush(cap);
+}
+
+void traceNonmovingHeapCensus(uint32_t log_blk_size,
+                              const struct NonmovingAllocCensus *census)
+{
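+    // Unlike the other concurrent-GC trace functions above, the heap census
+    // is only emitted when non-moving GC tracing (TRACE_nonmoving_gc) is
+    // enabled.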
+    if (eventlog_enabled && TRACE_nonmoving_gc)
+        postNonmovingHeapCensus(log_blk_size, census);
+}
+
 void traceThreadStatus_ (StgTSO *tso USED_IF_DEBUG)
 {
 #if defined(DEBUG)
diff --git a/rts/Trace.h b/rts/Trace.h
index b7db0ca912f0696eeb158331a4dd502b3ff6d201..ec25a09d7b14eac8b39ce053f86e9b348ea5da75 100644
--- a/rts/Trace.h
+++ b/rts/Trace.h
@@ -9,6 +9,7 @@
 #pragma once
 
 #include "rts/EventLogFormat.h"
+#include "sm/NonMovingCensus.h"
 #include "Capability.h"
 
 #if defined(DTRACE)
@@ -50,6 +51,7 @@ enum CapsetType { CapsetTypeCustom = CAPSET_TYPE_CUSTOM,
 #define DEBUG_weak        RtsFlags.DebugFlags.weak
 #define DEBUG_gccafs      RtsFlags.DebugFlags.gccafs
 #define DEBUG_gc          RtsFlags.DebugFlags.gc
+#define DEBUG_nonmoving_gc RtsFlags.DebugFlags.nonmoving_gc
 #define DEBUG_block_alloc RtsFlags.DebugFlags.alloc
 #define DEBUG_sanity      RtsFlags.DebugFlags.sanity
 #define DEBUG_zero_on_gc  RtsFlags.DebugFlags.zero_on_gc
@@ -71,6 +73,7 @@ extern int TRACE_spark_sampled;
 extern int TRACE_spark_full;
 /* extern int TRACE_user; */  // only used in Trace.c
 extern int TRACE_cap;
+extern int TRACE_nonmoving_gc;
 
 // -----------------------------------------------------------------------------
 // Posting events
@@ -307,6 +310,16 @@ void traceProfSampleCostCentre(Capability *cap,
 void traceProfBegin(void);
 #endif /* PROFILING */
 
+void traceConcMarkBegin(void);
+void traceConcMarkEnd(StgWord32 marked_obj_count);
+void traceConcSyncBegin(void);
+void traceConcSyncEnd(void);
+void traceConcSweepBegin(void);
+void traceConcSweepEnd(void);
+void traceConcUpdRemSetFlush(Capability *cap);
+void traceNonmovingHeapCensus(uint32_t log_blk_size,
+                              const struct NonmovingAllocCensus *census);
+
 void flushTrace(void);
 
 #else /* !TRACING */
@@ -347,6 +360,15 @@ void flushTrace(void);
 #define traceHeapProfSampleCostCentre(profile_id, stack, residency) /* nothing */
 #define traceHeapProfSampleString(profile_id, label, residency) /* nothing */
 
+#define traceConcMarkBegin() /* nothing */
+#define traceConcMarkEnd(marked_obj_count) /* nothing */
+#define traceConcSyncBegin() /* nothing */
+#define traceConcSyncEnd() /* nothing */
+#define traceConcSweepBegin() /* nothing */
+#define traceConcSweepEnd() /* nothing */
+#define traceConcUpdRemSetFlush(cap) /* nothing */
+#define traceNonmovingHeapCensus(blk_size, census) /* nothing */
+
 #define flushTrace() /* nothing */
 
 #endif /* TRACING */
diff --git a/rts/Updates.h b/rts/Updates.h
index 1bd3e065afbb11348f03d805b0857f71526f36ce..91d1b0b1cbd683075eeae9fcab23808ee60f7104 100644
--- a/rts/Updates.h
+++ b/rts/Updates.h
@@ -50,6 +50,9 @@
                                                                 \
     prim_write_barrier;                                         \
     OVERWRITING_CLOSURE(p1);                                    \
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {                        \
+      ccall updateRemembSetPushThunk_(BaseReg, p1 "ptr");       \
+    }                                                           \
     StgInd_indirectee(p1) = p2;                                 \
     prim_write_barrier;                                         \
     SET_INFO(p1, stg_BLACKHOLE_info);                           \
@@ -62,7 +65,7 @@
     } else {                                                    \
       TICK_UPD_NEW_IND();                                       \
       and_then;                                                 \
-  }
+    }
 
 #else /* !CMINUSMINUS */
 
@@ -78,6 +81,9 @@ INLINE_HEADER void updateWithIndirection (Capability *cap,
     /* See Note [Heap memory barriers] in SMP.h */
     write_barrier();
     OVERWRITING_CLOSURE(p1);
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {
+        updateRemembSetPushThunk(cap, (StgThunk*)p1);
+    }
     ((StgInd *)p1)->indirectee = p2;
     write_barrier();
     SET_INFO(p1, &stg_BLACKHOLE_info);
diff --git a/rts/Weak.c b/rts/Weak.c
index ec998c214f46e9194ea02fa11d7cf183392b61e3..fe4516794ab5fff11fcb69695fdb7c3bc994bd69 100644
--- a/rts/Weak.c
+++ b/rts/Weak.c
@@ -93,9 +93,19 @@ scheduleFinalizers(Capability *cap, StgWeak *list)
     StgWord size;
     uint32_t n, i;
 
-    ASSERT(n_finalizers == 0);
-
-    finalizer_list = list;
+    // This assertion does not hold with non-moving collection because the
+    // non-moving collector does not wait for the list to be consumed (by
+    // doIdleGcWork()) before more finalizers are appended to it.
+    ASSERT(RtsFlags.GcFlags.useNonmoving || n_finalizers == 0);
+
+    // Append the new list to finalizer_list. TODO: Perhaps cache the tail of
+    // the list for faster appends. NOTE: We must append `list` to
+    // finalizer_list rather than the other way around; otherwise the loops
+    // below would end up traversing already-visited weaks.
+    StgWeak **tl = &finalizer_list;
+    while (*tl) {
+        tl = &(*tl)->link;
+    }
+    *tl = list;
 
     // Traverse the list and
     //  * count the number of Haskell finalizers
@@ -130,7 +140,7 @@ scheduleFinalizers(Capability *cap, StgWeak *list)
         SET_HDR(w, &stg_DEAD_WEAK_info, w->header.prof.ccs);
     }
 
-    n_finalizers = i;
+    n_finalizers += i;
 
     // No Haskell finalizers to run?
     if (n == 0) return;
diff --git a/rts/eventlog/EventLog.c b/rts/eventlog/EventLog.c
index 6d7b4871521fe8e5a7f85817d89fd22a4fe33f71..5f22af5bfc33c81a70d5096161a6cfe9f880418a 100644
--- a/rts/eventlog/EventLog.c
+++ b/rts/eventlog/EventLog.c
@@ -109,7 +109,15 @@ char *EventDesc[] = {
   [EVENT_HEAP_PROF_SAMPLE_COST_CENTRE] = "Heap profile cost-centre sample",
   [EVENT_PROF_SAMPLE_COST_CENTRE] = "Time profile cost-centre stack",
   [EVENT_PROF_BEGIN] = "Start of a time profile",
-  [EVENT_USER_BINARY_MSG]     = "User binary message"
+  [EVENT_USER_BINARY_MSG]     = "User binary message",
+  [EVENT_CONC_MARK_BEGIN]        = "Begin concurrent mark phase",
+  [EVENT_CONC_MARK_END]          = "End concurrent mark phase",
+  [EVENT_CONC_SYNC_BEGIN]        = "Begin concurrent GC synchronisation",
+  [EVENT_CONC_SYNC_END]          = "End concurrent GC synchronisation",
+  [EVENT_CONC_SWEEP_BEGIN]       = "Begin concurrent sweep",
+  [EVENT_CONC_SWEEP_END]         = "End concurrent sweep",
+  [EVENT_CONC_UPD_REM_SET_FLUSH] = "Update remembered set flushed",
+  [EVENT_NONMOVING_HEAP_CENSUS]  = "Nonmoving heap census"
 };
 
 // Event type.
@@ -456,6 +464,27 @@ init_event_types(void)
             eventTypes[t].size = EVENT_SIZE_DYNAMIC;
             break;
 
+        case EVENT_CONC_MARK_BEGIN:
+        case EVENT_CONC_SYNC_BEGIN:
+        case EVENT_CONC_SYNC_END:
+        case EVENT_CONC_SWEEP_BEGIN:
+        case EVENT_CONC_SWEEP_END:
+            eventTypes[t].size = 0;
+            break;
+
+        case EVENT_CONC_MARK_END:
+            eventTypes[t].size = 4;
+            break;
+
+        case EVENT_CONC_UPD_REM_SET_FLUSH: // (cap)
+            eventTypes[t].size =
+                sizeof(EventCapNo);
+            break;
+
+        case EVENT_NONMOVING_HEAP_CENSUS: // (blk_size, active_segs, filled_segs, live_blks)
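+            // 13 bytes: one Word8 for the log2 block size plus three Word32
+            // counts (see postNonmovingHeapCensus).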
+            eventTypes[t].size = 13;
+            break;
+
         default:
             continue; /* ignore deprecated events */
         }
@@ -497,8 +526,10 @@ initEventLogging(const EventLogWriter *ev_writer)
     event_log_writer = ev_writer;
     initEventLogWriter();
 
-    if (sizeof(EventDesc) / sizeof(char*) != NUM_GHC_EVENT_TAGS) {
-        barf("EventDesc array has the wrong number of elements");
+    int num_descs = sizeof(EventDesc) / sizeof(char*);
+    if (num_descs != NUM_GHC_EVENT_TAGS) {
+        barf("EventDesc array has the wrong number of elements (%d, NUM_GHC_EVENT_TAGS=%d)",
+             num_descs, NUM_GHC_EVENT_TAGS);
     }
 
     /*
@@ -1014,6 +1045,15 @@ void postTaskDeleteEvent (EventTaskId taskId)
     RELEASE_LOCK(&eventBufMutex);
 }
 
+void
+postEventNoCap (EventTypeNum tag)
+{
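+    // Post an event with no payload from a context that may not hold a
+    // capability (e.g. the concurrent non-moving mark thread); hence we write
+    // to the global event buffer under eventBufMutex rather than to a
+    // capability-local buffer.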
+    ACQUIRE_LOCK(&eventBufMutex);
+    ensureRoomForEvent(&eventBuf, tag);
+    postEventHeader(&eventBuf, tag);
+    RELEASE_LOCK(&eventBufMutex);
+}
+
 void
 postEvent (Capability *cap, EventTypeNum tag)
 {
@@ -1140,6 +1180,35 @@ void postThreadLabel(Capability    *cap,
     postBuf(eb, (StgWord8*) label, strsize);
 }
 
+void postConcUpdRemSetFlush(Capability *cap)
+{
+    EventsBuf *eb = &capEventBuf[cap->no];
+    ensureRoomForEvent(eb, EVENT_CONC_UPD_REM_SET_FLUSH);
+    postEventHeader(eb, EVENT_CONC_UPD_REM_SET_FLUSH);
+    postCapNo(eb, cap->no);
+}
+
+void postConcMarkEnd(StgWord32 marked_obj_count)
+{
+    ACQUIRE_LOCK(&eventBufMutex);
+    ensureRoomForEvent(&eventBuf, EVENT_CONC_MARK_END);
+    postEventHeader(&eventBuf, EVENT_CONC_MARK_END);
+    postWord32(&eventBuf, marked_obj_count);
+    RELEASE_LOCK(&eventBufMutex);
+}
+
+void postNonmovingHeapCensus(int log_blk_size,
+                             const struct NonmovingAllocCensus *census)
+{
+    ACQUIRE_LOCK(&eventBufMutex);
+    ensureRoomForEvent(&eventBuf, EVENT_NONMOVING_HEAP_CENSUS);
+    postEventHeader(&eventBuf, EVENT_NONMOVING_HEAP_CENSUS);
+    postWord8(&eventBuf, log_blk_size);
+    postWord32(&eventBuf, census->n_active_segs);
+    postWord32(&eventBuf, census->n_filled_segs);
+    postWord32(&eventBuf, census->n_live_blocks);
+    RELEASE_LOCK(&eventBufMutex);
+}
+
 void closeBlockMarker (EventsBuf *ebuf)
 {
     if (ebuf->marker)
diff --git a/rts/eventlog/EventLog.h b/rts/eventlog/EventLog.h
index ec9a5f34e391553110eb86e36a2d8bbf806fa7c7..5bd3b5dadb40cd8245310de1589a9ce7042c0a14 100644
--- a/rts/eventlog/EventLog.h
+++ b/rts/eventlog/EventLog.h
@@ -11,6 +11,7 @@
 #include "rts/EventLogFormat.h"
 #include "rts/EventLogWriter.h"
 #include "Capability.h"
+#include "sm/NonMovingCensus.h"
 
 #include "BeginPrivate.h"
 
@@ -39,6 +40,7 @@ void postSchedEvent(Capability *cap, EventTypeNum tag,
  * Post a nullary event.
  */
 void postEvent(Capability *cap, EventTypeNum tag);
+void postEventNoCap(EventTypeNum tag);
 
 void postEventAtTimestamp (Capability *cap, EventTimestamp ts,
                            EventTypeNum tag);
@@ -164,6 +166,11 @@ void postProfSampleCostCentre(Capability *cap,
 void postProfBegin(void);
 #endif /* PROFILING */
 
+void postConcUpdRemSetFlush(Capability *cap);
+void postConcMarkEnd(StgWord32 marked_obj_count);
+void postNonmovingHeapCensus(int log_blk_size,
+                             const struct NonmovingAllocCensus *census);
+
 #else /* !TRACING */
 
 INLINE_HEADER void postSchedEvent (Capability *cap  STG_UNUSED,
@@ -177,6 +184,9 @@ INLINE_HEADER void postEvent (Capability *cap  STG_UNUSED,
                               EventTypeNum tag STG_UNUSED)
 { /* nothing */ }
 
+INLINE_HEADER void postEventNoCap (EventTypeNum tag STG_UNUSED)
+{ /* nothing */ }
+
 INLINE_HEADER void postMsg (char *msg STG_UNUSED,
                             va_list ap STG_UNUSED)
 { /* nothing */ }
diff --git a/rts/ghc.mk b/rts/ghc.mk
index dca22fb7338796a5152f55b25fa74dae074525cc..59d5994147aece51cf89c0163a285758729620be 100644
--- a/rts/ghc.mk
+++ b/rts/ghc.mk
@@ -340,6 +340,8 @@ WARNING_OPTS += -Wredundant-decls
 ifeq "$(GccLT46)" "NO"
 WARNING_OPTS += -Wundef
 endif
+# Some gccs annoyingly enable this archaic specimen by default
+WARNING_OPTS += -Wno-aggregate-return
 
 # These ones are hard to avoid:
 #WARNING_OPTS += -Wconversion
diff --git a/rts/rts.cabal.in b/rts/rts.cabal.in
index 30c829ad42ab33f54d219b4323dce13754a71eb0..4b5d837c3a78072ffa227af4a0fd248bd65923f7 100644
--- a/rts/rts.cabal.in
+++ b/rts/rts.cabal.in
@@ -139,6 +139,7 @@ library
                       rts/Linker.h
                       rts/Main.h
                       rts/Messages.h
+                      rts/NonMoving.h
                       rts/OSThreads.h
                       rts/Parallel.h
                       rts/PrimFloat.h
@@ -465,6 +466,12 @@ library
                sm/GCUtils.c
                sm/MBlock.c
                sm/MarkWeak.c
+               sm/NonMoving.c
+               sm/NonMovingCensus.c
+               sm/NonMovingMark.c
+               sm/NonMovingScav.c
+               sm/NonMovingShortcut.c
+               sm/NonMovingSweep.c
                sm/Sanity.c
                sm/Scav.c
                sm/Scav_thr.c
diff --git a/rts/sm/BlockAlloc.c b/rts/sm/BlockAlloc.c
index f9e3d11407bd79f4607c28f198e0c92186c9e505..b3e1e2ce75121d043c4d283fbe86b3b48e51f470 100644
--- a/rts/sm/BlockAlloc.c
+++ b/rts/sm/BlockAlloc.c
@@ -310,7 +310,7 @@ setup_tail (bdescr *bd)
 // Take a free block group bd, and split off a group of size n from
 // it.  Adjust the free list as necessary, and return the new group.
 static bdescr *
-split_free_block (bdescr *bd, uint32_t node, W_ n, uint32_t ln)
+split_free_block (bdescr *bd, uint32_t node, W_ n, uint32_t ln /* log_2_ceil(n) */)
 {
     bdescr *fg; // free group
 
@@ -325,6 +325,46 @@ split_free_block (bdescr *bd, uint32_t node, W_ n, uint32_t ln)
     return fg;
 }
 
+// Take N blocks off the end, free the rest.
+static bdescr *
+split_block_high (bdescr *bd, W_ n)
+{
+    ASSERT(bd->blocks > n);
+
+    bdescr* ret = bd + bd->blocks - n; // take n blocks off the end
+    ret->blocks = n;
+    ret->start = ret->free = bd->start + (bd->blocks - n)*BLOCK_SIZE_W;
+    ret->link = NULL;
+
+    bd->blocks -= n;
+
+    setup_tail(ret);
+    setup_tail(bd);
+    freeGroup(bd);
+
+    return ret;
+}
+
+// Like `split_block_high`, but takes n blocks off the beginning rather
+// than the end.
+static bdescr *
+split_block_low (bdescr *bd, W_ n)
+{
+    ASSERT(bd->blocks > n);
+
+    bdescr* bd_ = bd + n;
+    bd_->blocks = bd->blocks - n;
+    bd_->start = bd_->free = bd->start + n*BLOCK_SIZE_W;
+
+    bd->blocks = n;
+
+    setup_tail(bd_);
+    setup_tail(bd);
+    freeGroup(bd_);
+
+    return bd;
+}
+
 /* Only initializes the start pointers on the first megablock and the
  * blocks field of the first bdescr; callers are responsible for calling
  * initGroup afterwards.
@@ -461,6 +501,108 @@ finish:
     return bd;
 }
 
+// Allocate `n` blocks aligned to `n` blocks, e.g. when n = 8, the blocks will
+// be aligned at `8 * BLOCK_SIZE`. For a group with `n` blocks this makes it
+// easy to recover the beginning of the group from a location p inside the
+// group, namely
+//
+//     p - (p % (BLOCK_SIZE*n))
+//
+// Used by the non-moving collector for allocating segments.
+//
+// Because the storage manager does not support aligned allocations, we have to
+// allocate `2*n - 1` blocks here to make sure we'll be able to find an aligned
+// region in the allocated blocks. After finding the aligned area we want to
+// free the slop on the low and high sides, but the block allocator doesn't
+// support freeing only some portion of a megablock (we can only free whole
+// megablocks).
+// So we disallow allocating megablocks here, and allow allocating at most
+// `BLOCKS_PER_MBLOCK / 2` blocks.
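+//
+// For example, with n = 8 we request 2*8 - 1 = 15 blocks: any such group must
+// contain an address aligned to 8*BLOCK_SIZE, and (ignoring that
+// allocLargeChunkOnNode may hand us more than we asked for) the remaining
+// blocks of slop below and above the aligned 8-block group are returned to
+// the free list via split_block_high/split_block_low below.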
+bdescr *
+allocAlignedGroupOnNode (uint32_t node, W_ n)
+{
+    // allocate enough blocks to have enough space aligned at n-block boundary
+    // free any slops on the low and high side of this space
+
+    // number of blocks to allocate to make sure we have enough aligned space
+    W_ num_blocks = 2*n - 1;
+
+    if (num_blocks >= BLOCKS_PER_MBLOCK) {
+        barf("allocAlignedGroupOnNode: allocating megablocks is not supported\n"
+             "    requested blocks: %" FMT_Word "\n"
+             "    required for alignment: %" FMT_Word "\n"
+             "    megablock size (in blocks): %" FMT_Word,
+             n, num_blocks, (W_) BLOCKS_PER_MBLOCK);
+    }
+
+    W_ group_size = n * BLOCK_SIZE;
+
+    // To reduce splitting and fragmentation we use allocLargeChunkOnNode here.
+    // Tweak the max allocation to avoid allocating megablocks. Splitting slop
+    // below doesn't work with megablocks (freeGroup can't free only a portion
+    // of a megablock so we can't allocate megablocks and free some parts of
+    // them).
+    W_ max_blocks = stg_min(num_blocks * 3, BLOCKS_PER_MBLOCK - 1);
+    bdescr *bd = allocLargeChunkOnNode(node, num_blocks, max_blocks);
+    // We may allocate more than num_blocks, so update it
+    num_blocks = bd->blocks;
+
+    // slop on the low side
+    W_ slop_low = 0;
+    if ((uintptr_t)bd->start % group_size != 0) {
+        slop_low = group_size - ((uintptr_t)bd->start % group_size);
+    }
+
+    W_ slop_high = (num_blocks * BLOCK_SIZE) - group_size - slop_low;
+
+    ASSERT((slop_low % BLOCK_SIZE) == 0);
+    ASSERT((slop_high % BLOCK_SIZE) == 0);
+
+    W_ slop_low_blocks = slop_low / BLOCK_SIZE;
+    W_ slop_high_blocks = slop_high / BLOCK_SIZE;
+
+    ASSERT(slop_low_blocks + slop_high_blocks + n == num_blocks);
+
+#if defined(DEBUG)
+    checkFreeListSanity();
+    W_ free_before = countFreeList();
+#endif
+
+    if (slop_low_blocks != 0) {
+        bd = split_block_high(bd, num_blocks - slop_low_blocks);
+        ASSERT(countBlocks(bd) == num_blocks - slop_low_blocks);
+    }
+
+#if defined(DEBUG)
+    ASSERT(countFreeList() == free_before + slop_low_blocks);
+    checkFreeListSanity();
+#endif
+
+    // At this point the bd should be aligned, but we may have slop on the high side
+    ASSERT((uintptr_t)bd->start % group_size == 0);
+
+#if defined(DEBUG)
+    free_before = countFreeList();
+#endif
+
+    if (slop_high_blocks != 0) {
+        bd = split_block_low(bd, n);
+        ASSERT(bd->blocks == n);
+    }
+
+#if defined(DEBUG)
+    ASSERT(countFreeList() == free_before + slop_high_blocks);
+    checkFreeListSanity();
+#endif
+
+    // Should still be aligned
+    ASSERT((uintptr_t)bd->start % group_size == 0);
+
+    // Sanity check: bd really is the descriptor of its own start address
+    ASSERT(Bdescr(bd->start) == bd);
+
+    return bd;
+}
+
 STATIC_INLINE
 uint32_t nodeWithLeastBlocks (void)
 {
diff --git a/rts/sm/CNF.c b/rts/sm/CNF.c
index f85b3904145e1c24bf342f4b906bb328890437d9..87d1d84f50ad71395a7ba059bb4840e52469b452 100644
--- a/rts/sm/CNF.c
+++ b/rts/sm/CNF.c
@@ -276,7 +276,10 @@ compactFree(StgCompactNFData *str)
     for ( ; block; block = next) {
         next = block->next;
         bd = Bdescr((StgPtr)block);
-        ASSERT((bd->flags & BF_EVACUATED) == 0);
+        ASSERT(RtsFlags.GcFlags.useNonmoving || ((bd->flags & BF_EVACUATED) == 0));
+            // When using the non-moving collector we leave compact objects
+            // evacuated to the oldest gen marked BF_EVACUATED to avoid
+            // evacuating objects in the non-moving heap.
         freeGroup(bd);
     }
 }
diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c
index 53a473d26cf8d86a7ea42b9749e4bbed3fed0edd..521fd4eef4540def56c91390f89a0240d5343eb3 100644
--- a/rts/sm/Evac.c
+++ b/rts/sm/Evac.c
@@ -27,6 +27,7 @@
 #include "LdvProfile.h"
 #include "CNF.h"
 #include "Scav.h"
+#include "NonMoving.h"
 
 #if defined(THREADED_RTS) && !defined(PARALLEL_GC)
 #define evacuate(p) evacuate1(p)
@@ -39,7 +40,19 @@
         copy_tag(p, info, src, size, stp, tag)
 #endif
 
-/* Used to avoid long recursion due to selector thunks
+/* Note [Selector optimisation depth limit]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * MAX_THUNK_SELECTOR_DEPTH is used to avoid long recursion of
+ * eval_thunk_selector due to nested selector thunks. Note that this *only*
+ * counts nested selector thunks, e.g. `fst (fst (... (fst x)))`. The collector
+ * will traverse interleaved selector-constructor pairs without limit, e.g.
+ *
+ *     a = (fst b, _)
+ *     b = (fst c, _)
+ *     c = (fst d, _)
+ *     d = (x, _)
+ *
  */
 #define MAX_THUNK_SELECTOR_DEPTH 16
 
@@ -50,9 +63,12 @@ STATIC_INLINE void evacuate_large(StgPtr p);
    Allocate some space in which to copy an object.
    -------------------------------------------------------------------------- */
 
+/* size is in words */
 STATIC_INLINE StgPtr
 alloc_for_copy (uint32_t size, uint32_t gen_no)
 {
+    ASSERT(gen_no < RtsFlags.GcFlags.generations);
+
     StgPtr to;
     gen_workspace *ws;
 
@@ -69,6 +85,36 @@ alloc_for_copy (uint32_t size, uint32_t gen_no)
         }
     }
 
+    if (RTS_UNLIKELY(RtsFlags.GcFlags.useNonmoving)) {
+        /* See Note [Deadlock detection under nonmoving collector]. */
+        if (deadlock_detect_gc)
+            gen_no = oldest_gen->no;
+
+        if (gen_no == oldest_gen->no) {
+            gct->copied += size;
+            to = nonmovingAllocate(gct->cap, size);
+
+            // Add segment to the todo list unless it's already there
+            // current->todo_link == NULL means not in todo list
+            struct NonmovingSegment *seg = nonmovingGetSegment(to);
+            if (!seg->todo_link) {
+                gen_workspace *ws = &gct->gens[oldest_gen->no];
+                seg->todo_link = ws->todo_seg;
+                ws->todo_seg = seg;
+            }
+
+            // The object which refers to this closure may have been aged (i.e.
+            // retained in a younger generation). Consequently, we must add the
+            // closure to the mark queue to ensure that it will be marked.
+            //
+            // However, if we are in a deadlock detection GC then we disable aging
+            // so there is no need.
+            if (major_gc && !deadlock_detect_gc)
+                markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, (StgClosure *) to);
+            return to;
+        }
+    }
+
     ws = &gct->gens[gen_no];  // zero memory references here
 
     /* chain a new block onto the to-space for the destination gen if
@@ -88,6 +134,7 @@ alloc_for_copy (uint32_t size, uint32_t gen_no)
    The evacuate() code
    -------------------------------------------------------------------------- */
 
+/* size is in words */
 STATIC_INLINE GNUC_ATTR_HOT void
 copy_tag(StgClosure **p, const StgInfoTable *info,
          StgClosure *src, uint32_t size, uint32_t gen_no, StgWord tag)
@@ -284,7 +331,10 @@ evacuate_large(StgPtr p)
    */
   new_gen_no = bd->dest_no;
 
-  if (new_gen_no < gct->evac_gen_no) {
+  if (RTS_UNLIKELY(deadlock_detect_gc)) {
+      /* See Note [Deadlock detection under nonmoving collector]. */
+      new_gen_no = oldest_gen->no;
+  } else if (new_gen_no < gct->evac_gen_no) {
       if (gct->eager_promotion) {
           new_gen_no = gct->evac_gen_no;
       } else {
@@ -296,6 +346,9 @@ evacuate_large(StgPtr p)
   new_gen = &generations[new_gen_no];
 
   bd->flags |= BF_EVACUATED;
+  if (RTS_UNLIKELY(RtsFlags.GcFlags.useNonmoving && new_gen == oldest_gen)) {
+      bd->flags |= BF_NONMOVING;
+  }
   initBdescr(bd, new_gen, new_gen->to);
 
   // If this is a block of pinned or compact objects, we don't have to scan
@@ -330,6 +383,13 @@ evacuate_large(StgPtr p)
 STATIC_INLINE void
 evacuate_static_object (StgClosure **link_field, StgClosure *q)
 {
+    if (RTS_UNLIKELY(RtsFlags.GcFlags.useNonmoving)) {
+        // See Note [Static objects under the nonmoving collector] in Storage.c.
+        if (major_gc && !deadlock_detect_gc)
+            markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q);
+        return;
+    }
+
     StgWord link = (StgWord)*link_field;
 
     // See Note [STATIC_LINK fields] for how the link field bits work
@@ -376,12 +436,22 @@ evacuate_compact (StgPtr p)
     bd = Bdescr((StgPtr)str);
     gen_no = bd->gen_no;
 
+    if (bd->flags & BF_NONMOVING) {
+        // We may have evacuated the block to the nonmoving generation. If so
+        // we need to make sure it is added to the mark queue since the only
+        // reference to it may be from the moving heap.
+        if (major_gc && !deadlock_detect_gc)
+            markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, (StgClosure *) str);
+        return;
+    }
+
     // already evacuated? (we're about to do the same check,
     // but we avoid taking the spin-lock)
     if (bd->flags & BF_EVACUATED) {
         /* Don't forget to set the gct->failed_to_evac flag if we didn't get
          * the desired destination (see comments in evacuate()).
          */
+        debugTrace(DEBUG_compact, "Compact %p already evacuated", str);
         if (gen_no < gct->evac_gen_no) {
             gct->failed_to_evac = true;
             TICK_GC_FAILED_PROMOTION();
@@ -430,9 +500,15 @@ evacuate_compact (StgPtr p)
     // for that - the only code touching the generation of the block is
     // in the GC, and that should never see blocks other than the first)
     bd->flags |= BF_EVACUATED;
+    if (RTS_UNLIKELY(RtsFlags.GcFlags.useNonmoving && new_gen == oldest_gen)) {
+        bd->flags |= BF_NONMOVING;
+    }
     initBdescr(bd, new_gen, new_gen->to);
 
     if (str->hash) {
+        // If there is a hash table for sharing preservation then we need to
+        // add the compact to the scavenging work list to ensure that the hash
+        // table is scavenged.
         gen_workspace *ws = &gct->gens[new_gen_no];
         bd->link = ws->todo_large_objects;
         ws->todo_large_objects = bd;
@@ -563,7 +639,18 @@ loop:
 
   bd = Bdescr((P_)q);
 
-  if ((bd->flags & (BF_LARGE | BF_MARKED | BF_EVACUATED | BF_COMPACT)) != 0) {
+  if ((bd->flags & (BF_LARGE | BF_MARKED | BF_EVACUATED | BF_COMPACT | BF_NONMOVING)) != 0) {
+      // Pointer into the non-moving heap. The non-moving heap is collected by
+      // mark-sweep, so this object must be marked and then retained by sweep.
+      if (RTS_UNLIKELY(bd->flags & BF_NONMOVING)) {
+          // NOTE: large objects in the nonmoving heap are also marked with
+          // BF_NONMOVING. Those are moved to the scavenged_large_objects list
+          // in the mark phase.
+          if (major_gc && !deadlock_detect_gc)
+              markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q);
+          return;
+      }
+
       // pointer into to-space: just return it.  It might be a pointer
       // into a generation that we aren't collecting (> N), or it
       // might just be a pointer into to-space.  The latter doesn't
@@ -594,6 +681,13 @@ loop:
        */
       if (bd->flags & BF_LARGE) {
           evacuate_large((P_)q);
+
+          // We may have evacuated the block to the nonmoving generation. If so
+          // we need to make sure it is added to the mark queue since the only
+          // reference to it may be from the moving heap.
+          if (major_gc && bd->flags & BF_NONMOVING && !deadlock_detect_gc) {
+              markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q);
+          }
           return;
       }
 
@@ -894,6 +988,12 @@ evacuate_BLACKHOLE(StgClosure **p)
     // blackholes can't be in a compact
     ASSERT((bd->flags & BF_COMPACT) == 0);
 
+    if (RTS_UNLIKELY(bd->flags & BF_NONMOVING)) {
+        if (major_gc && !deadlock_detect_gc)
+            markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q);
+        return;
+    }
+
     // blackholes *can* be in a large object: when raiseAsync() creates an
     // AP_STACK the payload might be large enough to create a large object.
     // See #14497.
@@ -1044,7 +1144,7 @@ selector_chain:
         // save any space in any case, and updating with an indirection is
         // trickier in a non-collected gen: we would have to update the
         // mutable list.
-        if (bd->flags & BF_EVACUATED) {
+        if (bd->flags & (BF_EVACUATED | BF_NONMOVING)) {
             unchain_thunk_selectors(prev_thunk_selector, (StgClosure *)p);
             *q = (StgClosure *)p;
             // shortcut, behave as for:  if (evac) evacuate(q);
@@ -1257,6 +1357,7 @@ selector_loop:
 
           // recursively evaluate this selector.  We don't want to
           // recurse indefinitely, so we impose a depth bound.
+          // See Note [Selector optimisation depth limit].
           if (gct->thunk_selector_depth >= MAX_THUNK_SELECTOR_DEPTH) {
               goto bale_out;
           }
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 76237f35c2d32ed8deaf31507e92d93fe5d286bc..83e9c97bd90ea87ea5cea41778d2b0d443422a5f 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -51,6 +51,7 @@
 #include "CheckUnload.h"
 #include "CNF.h"
 #include "RtsFlags.h"
+#include "NonMoving.h"
 
 #include <string.h> // for memset()
 #include <unistd.h>
@@ -103,6 +104,7 @@
  */
 uint32_t N;
 bool major_gc;
+bool deadlock_detect_gc;
 
 /* Data used for allocation area sizing.
  */
@@ -159,7 +161,6 @@ static void mark_root               (void *user, StgClosure **root);
 static void prepare_collected_gen   (generation *gen);
 static void prepare_uncollected_gen (generation *gen);
 static void init_gc_thread          (gc_thread *t);
-static void resize_generations      (void);
 static void resize_nursery          (void);
 static void start_gc_threads        (void);
 static void scavenge_until_all_done (void);
@@ -193,7 +194,8 @@ StgPtr mark_sp;            // pointer to the next unallocated mark stack entry
 
 void
 GarbageCollect (uint32_t collect_gen,
-                bool do_heap_census,
+                const bool do_heap_census,
+                const bool deadlock_detect,
                 uint32_t gc_type USED_IF_THREADS,
                 Capability *cap,
                 bool idle_cap[])
@@ -263,7 +265,25 @@ GarbageCollect (uint32_t collect_gen,
   N = collect_gen;
   major_gc = (N == RtsFlags.GcFlags.generations-1);
 
-  if (major_gc) {
+  /* See Note [Deadlock detection under nonmoving collector]. */
+  deadlock_detect_gc = deadlock_detect;
+
+#if defined(THREADED_RTS)
+  if (major_gc && RtsFlags.GcFlags.useNonmoving && concurrent_coll_running) {
+      /* If there is already a concurrent major collection running then
+       * there is no benefit to starting another.
+       * TODO: Catch heap-size runaway.
+       */
+      N--;
+      collect_gen--;
+      major_gc = false;
+  }
+#endif
+
+  /* N.B. The nonmoving collector works a bit differently. See
+   * Note [Static objects under the nonmoving collector].
+   */
+  if (major_gc && !RtsFlags.GcFlags.useNonmoving) {
       prev_static_flag = static_flag;
       static_flag =
           static_flag == STATIC_FLAG_A ? STATIC_FLAG_B : STATIC_FLAG_A;
@@ -572,7 +592,7 @@ GarbageCollect (uint32_t collect_gen,
     gen = &generations[g];
 
     // for generations we collected...
-    if (g <= N) {
+    if (g <= N && !(RtsFlags.GcFlags.useNonmoving && gen == oldest_gen)) {
 
         /* free old memory and shift to-space into from-space for all
          * the collected generations (except the allocation area).  These
@@ -710,8 +730,55 @@ GarbageCollect (uint32_t collect_gen,
     }
   } // for all generations
 
-  // update the max size of older generations after a major GC
-  resize_generations();
+  // Flush the update remembered set. See Note [Eager update remembered set
+  // flushing] in NonMovingMark.c
+  if (RtsFlags.GcFlags.useNonmoving) {
+      RELEASE_SM_LOCK;
+      nonmovingAddUpdRemSetBlocks(&gct->cap->upd_rem_set.queue);
+      ACQUIRE_SM_LOCK;
+  }
+
+  // Mark and sweep the oldest generation.
+  // N.B. This can only happen after we've moved
+  // oldest_gen->scavenged_large_objects back to oldest_gen->large_objects.
+  ASSERT(oldest_gen->scavenged_large_objects == NULL);
+  if (RtsFlags.GcFlags.useNonmoving && major_gc) {
+      // All threads in the non-moving heap should be found to be alive, because
+      // threads in the non-moving generation's list should live in the
+      // non-moving heap, and we consider non-moving objects alive during
+      // preparation.
+      ASSERT(oldest_gen->old_threads == END_TSO_QUEUE);
+      // For weaks, remember that we evacuated all weaks to the non-moving heap
+      // in markWeakPtrList(), and then moved weak_ptr_list to
+      // old_weak_ptr_list. We then moved weaks with live keys to the
+      // weak_ptr_list again. Then, in collectDeadWeakPtrs() we moved weaks in
+      // old_weak_ptr_list to dead_weak_ptr_list. So at this point
+      // old_weak_ptr_list should be empty.
+      ASSERT(oldest_gen->old_weak_ptr_list == NULL);
+
+      // we may need to take the lock to allocate mark queue blocks
+      RELEASE_SM_LOCK;
+      // dead_weak_ptr_list contains weak pointers with dead keys. Those need
+      // to be kept alive because we'll use them in scheduleFinalizers().
+      // Similarly, resurrected_threads will be used in resurrectThreads(),
+      // so we need to mark those too.
+      // Note that in the sequential case these lists will be appended with
+      // more weaks and threads found to be dead during mark.
+#if !defined(THREADED_RTS)
+      // In the non-threaded runtime this is the only time we push to the
+      // upd_rem_set
+      nonmovingAddUpdRemSetBlocks(&gct->cap->upd_rem_set.queue);
+#endif
+      nonmovingCollect(&dead_weak_ptr_list, &resurrected_threads);
+      ACQUIRE_SM_LOCK;
+  }
+
+  // Update the max size of older generations after a major GC:
+  // We can't resize here in the case of the concurrent collector since we
+  // don't yet know how much live data we have. This will instead be done
+  // once we finish marking.
+  if (major_gc && RtsFlags.GcFlags.generations > 1 && ! RtsFlags.GcFlags.useNonmoving)
+      resizeGenerations();
 
   // Free the mark stack.
   if (mark_stack_top_bd != NULL) {
@@ -735,7 +802,7 @@ GarbageCollect (uint32_t collect_gen,
 
  // mark the garbage collected CAFs as dead
 #if defined(DEBUG)
-  if (major_gc) { gcCAFs(); }
+  if (major_gc && !RtsFlags.GcFlags.useNonmoving) { gcCAFs(); }
 #endif
 
   // Update the stable name hash table
@@ -768,8 +835,9 @@ GarbageCollect (uint32_t collect_gen,
   // check sanity after GC
   // before resurrectThreads(), because that might overwrite some
   // closures, which will cause problems with THREADED where we don't
-  // fill slop.
-  IF_DEBUG(sanity, checkSanity(true /* after GC */, major_gc));
+  // fill slop. If we are using the nonmoving collector then we can't claim to
+  // be *after* the major GC; it's now running concurrently.
+  IF_DEBUG(sanity, checkSanity(true /* after GC */, major_gc && !RtsFlags.GcFlags.useNonmoving));
 
   // If a heap census is due, we need to do it before
   // resurrectThreads(), for the same reason as checkSanity above:
@@ -942,6 +1010,7 @@ new_gc_thread (uint32_t n, gc_thread *t)
         ws->todo_overflow = NULL;
         ws->n_todo_overflow = 0;
         ws->todo_large_objects = NULL;
+        ws->todo_seg = END_NONMOVING_TODO_LIST;
 
         ws->part_list = NULL;
         ws->n_part_blocks = 0;
@@ -1320,6 +1389,18 @@ releaseGCThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
 }
 #endif
 
+/* ----------------------------------------------------------------------------
+   Save the mutable lists in saved_mut_lists where they will be scavenged
+   during GC
+   ------------------------------------------------------------------------- */
+
+static void
+stash_mut_list (Capability *cap, uint32_t gen_no)
+{
+    cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no];
+    cap->mut_lists[gen_no] = allocBlockOnNode_sync(cap->node);
+}
+
 /* ----------------------------------------------------------------------------
    Initialise a generation that is to be collected
    ------------------------------------------------------------------------- */
@@ -1331,11 +1412,17 @@ prepare_collected_gen (generation *gen)
     gen_workspace *ws;
     bdescr *bd, *next;
 
-    // Throw away the current mutable list.  Invariant: the mutable
-    // list always has at least one block; this means we can avoid a
-    // check for NULL in recordMutable().
     g = gen->no;
-    if (g != 0) {
+
+    if (RtsFlags.GcFlags.useNonmoving && g == oldest_gen->no) {
+        // Nonmoving heap's mutable list is always a root.
+        for (i = 0; i < n_capabilities; i++) {
+            stash_mut_list(capabilities[i], g);
+        }
+    } else if (g != 0) {
+        // Otherwise throw away the current mutable list. Invariant: the
+        // mutable list always has at least one block; this means we can avoid
+        // a check for NULL in recordMutable().
         for (i = 0; i < n_capabilities; i++) {
             freeChain(capabilities[i]->mut_lists[g]);
             capabilities[i]->mut_lists[g] =
@@ -1351,13 +1438,17 @@ prepare_collected_gen (generation *gen)
     gen->old_threads = gen->threads;
     gen->threads = END_TSO_QUEUE;
 
-    // deprecate the existing blocks
-    gen->old_blocks   = gen->blocks;
-    gen->n_old_blocks = gen->n_blocks;
-    gen->blocks       = NULL;
-    gen->n_blocks     = 0;
-    gen->n_words      = 0;
-    gen->live_estimate = 0;
+    // deprecate the existing blocks (except in the case of the nonmoving
+    // collector since these will be preserved in nonmovingCollect for the
+    // concurrent GC).
+    if (!(RtsFlags.GcFlags.useNonmoving && g == oldest_gen->no)) {
+        gen->old_blocks   = gen->blocks;
+        gen->n_old_blocks = gen->n_blocks;
+        gen->blocks       = NULL;
+        gen->n_blocks     = 0;
+        gen->n_words      = 0;
+        gen->live_estimate = 0;
+    }
 
     // initialise the large object queues.
     ASSERT(gen->scavenged_large_objects == NULL);
@@ -1451,18 +1542,6 @@ prepare_collected_gen (generation *gen)
     }
 }
 
-
-/* ----------------------------------------------------------------------------
-   Save the mutable lists in saved_mut_lists
-   ------------------------------------------------------------------------- */
-
-static void
-stash_mut_list (Capability *cap, uint32_t gen_no)
-{
-    cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no];
-    cap->mut_lists[gen_no] = allocBlockOnNode_sync(cap->node);
-}
-
 /* ----------------------------------------------------------------------------
    Initialise a generation that is *not* to be collected
    ------------------------------------------------------------------------- */
@@ -1531,31 +1610,57 @@ collect_gct_blocks (void)
 }
 
 /* -----------------------------------------------------------------------------
-   During mutation, any blocks that are filled by allocatePinned() are
-   stashed on the local pinned_object_blocks list, to avoid needing to
-   take a global lock.  Here we collect those blocks from the
-   cap->pinned_object_blocks lists and put them on the
-   main g0->large_object list.
+   During mutation, any blocks that are filled by allocatePinned() are stashed
+   on the local pinned_object_blocks list, to avoid needing to take a global
+   lock.  Here we collect those blocks from the cap->pinned_object_blocks lists
+   and put them on the g0->large_objects or oldest_gen->large_objects list.
+
+   How to decide which list to put them on?
+
+   - When the non-moving heap is enabled and this is a major GC, we put them on
+     oldest_gen. This is because after preparation we really want no
+     old-to-young references, and we want to be able to reset mut_lists. For
+     this we need to promote every potentially live object to the oldest gen.
+
+   - Otherwise we put them on g0.
    -------------------------------------------------------------------------- */
 
 static void
 collect_pinned_object_blocks (void)
 {
-    uint32_t n;
-    bdescr *bd, *prev;
+    generation *gen;
+    const bool use_nonmoving = RtsFlags.GcFlags.useNonmoving;
+    if (use_nonmoving && major_gc) {
+        gen = oldest_gen;
+    } else {
+        gen = g0;
+    }
 
-    for (n = 0; n < n_capabilities; n++) {
-        prev = NULL;
-        for (bd = capabilities[n]->pinned_object_blocks; bd != NULL; bd = bd->link) {
-            prev = bd;
+    for (uint32_t n = 0; n < n_capabilities; n++) {
+        bdescr *last = NULL;
+        if (use_nonmoving && gen == oldest_gen) {
+            // Mark objects as belonging to the nonmoving heap
+            for (bdescr *bd = capabilities[n]->pinned_object_blocks; bd != NULL; bd = bd->link) {
+                bd->flags |= BF_NONMOVING;
+                bd->gen = oldest_gen;
+                bd->gen_no = oldest_gen->no;
+                oldest_gen->n_large_words += bd->free - bd->start;
+                oldest_gen->n_large_blocks += bd->blocks;
+                last = bd;
+            }
+        } else {
+            for (bdescr *bd = capabilities[n]->pinned_object_blocks; bd != NULL; bd = bd->link) {
+                last = bd;
+            }
         }
-        if (prev != NULL) {
-            prev->link = g0->large_objects;
-            if (g0->large_objects != NULL) {
-                g0->large_objects->u.back = prev;
+
+        if (last != NULL) {
+            last->link = gen->large_objects;
+            if (gen->large_objects != NULL) {
+                gen->large_objects->u.back = last;
             }
-            g0->large_objects = capabilities[n]->pinned_object_blocks;
-            capabilities[n]->pinned_object_blocks = 0;
+            gen->large_objects = capabilities[n]->pinned_object_blocks;
+            capabilities[n]->pinned_object_blocks = NULL;
         }
     }
 }
@@ -1614,98 +1719,100 @@ mark_root(void *user USED_IF_THREADS, StgClosure **root)
    percentage of the maximum heap size available to allocate into.
    ------------------------------------------------------------------------- */
 
-static void
-resize_generations (void)
+void
+resizeGenerations (void)
 {
     uint32_t g;
+    W_ live, size, min_alloc, words;
+    const W_ max  = RtsFlags.GcFlags.maxHeapSize;
+    const W_ gens = RtsFlags.GcFlags.generations;
 
-    if (major_gc && RtsFlags.GcFlags.generations > 1) {
-        W_ live, size, min_alloc, words;
-        const W_ max  = RtsFlags.GcFlags.maxHeapSize;
-        const W_ gens = RtsFlags.GcFlags.generations;
-
-        // live in the oldest generations
-        if (oldest_gen->live_estimate != 0) {
-            words = oldest_gen->live_estimate;
-        } else {
-            words = oldest_gen->n_words;
-        }
-        live = (words + BLOCK_SIZE_W - 1) / BLOCK_SIZE_W +
-            oldest_gen->n_large_blocks +
-            oldest_gen->n_compact_blocks;
+    // live in the oldest generations
+    if (oldest_gen->live_estimate != 0) {
+        words = oldest_gen->live_estimate;
+    } else {
+        words = oldest_gen->n_words;
+    }
+    live = (words + BLOCK_SIZE_W - 1) / BLOCK_SIZE_W +
+        oldest_gen->n_large_blocks +
+        oldest_gen->n_compact_blocks;
 
-        // default max size for all generations except zero
-        size = stg_max(live * RtsFlags.GcFlags.oldGenFactor,
-                       RtsFlags.GcFlags.minOldGenSize);
+    // default max size for all generations except zero
+    size = stg_max(live * RtsFlags.GcFlags.oldGenFactor,
+                   RtsFlags.GcFlags.minOldGenSize);
 
-        if (RtsFlags.GcFlags.heapSizeSuggestionAuto) {
-            if (max > 0) {
-                RtsFlags.GcFlags.heapSizeSuggestion = stg_min(max, size);
-            } else {
-                RtsFlags.GcFlags.heapSizeSuggestion = size;
-            }
+    if (RtsFlags.GcFlags.heapSizeSuggestionAuto) {
+        if (max > 0) {
+            RtsFlags.GcFlags.heapSizeSuggestion = stg_min(max, size);
+        } else {
+            RtsFlags.GcFlags.heapSizeSuggestion = size;
         }
+    }
 
-        // minimum size for generation zero
-        min_alloc = stg_max((RtsFlags.GcFlags.pcFreeHeap * max) / 200,
-                            RtsFlags.GcFlags.minAllocAreaSize
-                            * (W_)n_capabilities);
-
-        // Auto-enable compaction when the residency reaches a
-        // certain percentage of the maximum heap size (default: 30%).
-        if (RtsFlags.GcFlags.compact ||
-            (max > 0 &&
-             oldest_gen->n_blocks >
-             (RtsFlags.GcFlags.compactThreshold * max) / 100)) {
-            oldest_gen->mark = 1;
-            oldest_gen->compact = 1;
+    // minimum size for generation zero
+    min_alloc = stg_max((RtsFlags.GcFlags.pcFreeHeap * max) / 200,
+                        RtsFlags.GcFlags.minAllocAreaSize
+                        * (W_)n_capabilities);
+
+    // Auto-enable compaction when the residency reaches a
+    // certain percentage of the maximum heap size (default: 30%).
+    // Except when non-moving GC is enabled.
+    if (!RtsFlags.GcFlags.useNonmoving &&
+        (RtsFlags.GcFlags.compact ||
+         (max > 0 &&
+          oldest_gen->n_blocks >
+          (RtsFlags.GcFlags.compactThreshold * max) / 100))) {
+        oldest_gen->mark = 1;
+        oldest_gen->compact = 1;
 //        debugBelch("compaction: on\n", live);
-        } else {
-            oldest_gen->mark = 0;
-            oldest_gen->compact = 0;
+    } else {
+        oldest_gen->mark = 0;
+        oldest_gen->compact = 0;
 //        debugBelch("compaction: off\n", live);
-        }
+    }
 
-        if (RtsFlags.GcFlags.sweep) {
-            oldest_gen->mark = 1;
-        }
+    if (RtsFlags.GcFlags.sweep) {
+        oldest_gen->mark = 1;
+    }
 
-        // if we're going to go over the maximum heap size, reduce the
-        // size of the generations accordingly.  The calculation is
-        // different if compaction is turned on, because we don't need
-        // to double the space required to collect the old generation.
-        if (max != 0) {
+    // if we're going to go over the maximum heap size, reduce the
+    // size of the generations accordingly.  The calculation is
+    // different if compaction is turned on, because we don't need
+    // to double the space required to collect the old generation.
+    if (max != 0) {
+
+        // this test is necessary to ensure that the calculations
+        // below don't have any negative results - we're working
+        // with unsigned values here.
+        if (max < min_alloc) {
+            heapOverflow();
+        }
 
-            // this test is necessary to ensure that the calculations
-            // below don't have any negative results - we're working
-            // with unsigned values here.
-            if (max < min_alloc) {
-                heapOverflow();
+        if (oldest_gen->compact) {
+            if ( (size + (size - 1) * (gens - 2) * 2) + min_alloc > max ) {
+                size = (max - min_alloc) / ((gens - 1) * 2 - 1);
             }
-
-            if (oldest_gen->compact) {
-                if ( (size + (size - 1) * (gens - 2) * 2) + min_alloc > max ) {
-                    size = (max - min_alloc) / ((gens - 1) * 2 - 1);
-                }
-            } else {
-                if ( (size * (gens - 1) * 2) + min_alloc > max ) {
-                    size = (max - min_alloc) / ((gens - 1) * 2);
-                }
+        } else {
+            if ( (size * (gens - 1) * 2) + min_alloc > max ) {
+                size = (max - min_alloc) / ((gens - 1) * 2);
             }
+        }
 
-            if (size < live) {
-                heapOverflow();
-            }
+        if (size < live) {
+            heapOverflow();
         }
+    }
 
 #if 0
-        debugBelch("live: %d, min_alloc: %d, size : %d, max = %d\n", live,
-                   min_alloc, size, max);
+    debugBelch("live: %d, min_alloc: %d, size : %d, max = %d\n", live,
+               min_alloc, size, max);
+    debugBelch("resize_gen: n_blocks: %lu, n_large_block: %lu, n_compact_blocks: %lu\n",
+               oldest_gen->n_blocks, oldest_gen->n_large_blocks, oldest_gen->n_compact_blocks);
+    debugBelch("resize_gen: max_blocks: %lu -> %lu\n", oldest_gen->max_blocks, oldest_gen->n_blocks);
 #endif
 
-        for (g = 0; g < gens; g++) {
-            generations[g].max_blocks = size;
-        }
+    for (g = 0; g < gens; g++) {
+        generations[g].max_blocks = size;
     }
 }
 
@@ -1841,21 +1948,16 @@ resize_nursery (void)
 
 #if defined(DEBUG)
 
-static void gcCAFs(void)
+void gcCAFs(void)
 {
-    StgIndStatic *p, *prev;
+    uint32_t i = 0;
+    StgIndStatic *prev = NULL;
 
-    const StgInfoTable *info;
-    uint32_t i;
-
-    i = 0;
-    p = debug_caf_list;
-    prev = NULL;
-
-    for (p = debug_caf_list; p != (StgIndStatic*)END_OF_CAF_LIST;
-         p = (StgIndStatic*)p->saved_info) {
-
-        info = get_itbl((StgClosure*)p);
+    for (StgIndStatic *p = debug_caf_list;
+         p != (StgIndStatic*) END_OF_CAF_LIST;
+         p = (StgIndStatic*) p->saved_info)
+    {
+        const StgInfoTable *info = get_itbl((StgClosure*)p);
         ASSERT(info->type == IND_STATIC);
 
         // See Note [STATIC_LINK fields] in Storage.h
diff --git a/rts/sm/GC.h b/rts/sm/GC.h
index 43cc4ca8a16ac3357ff1d2a08e8255b89618f6fd..bde006913b83059a6c5d67912fd2bc417730a062 100644
--- a/rts/sm/GC.h
+++ b/rts/sm/GC.h
@@ -17,9 +17,12 @@
 
 #include "HeapAlloc.h"
 
-void GarbageCollect (uint32_t force_major_gc,
+void GarbageCollect (uint32_t collect_gen,
                      bool do_heap_census,
-                     uint32_t gc_type, Capability *cap, bool idle_cap[]);
+                     bool deadlock_detect,
+                     uint32_t gc_type,
+                     Capability *cap,
+                     bool idle_cap[]);
 
 typedef void (*evac_fn)(void *user, StgClosure **root);
 
@@ -30,6 +33,8 @@ bool doIdleGCWork(Capability *cap, bool all);
 
 extern uint32_t N;
 extern bool major_gc;
+/* See Note [Deadlock detection under nonmoving collector]. */
+extern bool deadlock_detect_gc;
 
 extern bdescr *mark_stack_bd;
 extern bdescr *mark_stack_top_bd;
@@ -55,6 +60,8 @@ void gcWorkerThread (Capability *cap);
 void initGcThreads (uint32_t from, uint32_t to);
 void freeGcThreads (void);
 
+void resizeGenerations (void);
+
 #if defined(THREADED_RTS)
 void waitForGcThreads (Capability *cap, bool idle_cap[]);
 void releaseGCThreads (Capability *cap, bool idle_cap[]);
diff --git a/rts/sm/GCAux.c b/rts/sm/GCAux.c
index 650dc2c1dfbab2bc0d6c103a27522a502e2a1d2a..11080c1f22cf4aaf244e70b6e3b0d11357bc3a8f 100644
--- a/rts/sm/GCAux.c
+++ b/rts/sm/GCAux.c
@@ -60,6 +60,14 @@ isAlive(StgClosure *p)
     // ignore closures in generations that we're not collecting.
     bd = Bdescr((P_)q);
 
+    // isAlive is used when scavenging moving generations, before the mark
+    // phase. Because we don't know the liveness of objects before the mark
+    // phase we have to conservatively treat objects in the non-moving
+    // generation as alive here.
+    if (bd->flags & BF_NONMOVING) {
+        return p;
+    }
+
     // if it's a pointer into to-space, then we're done
     if (bd->flags & BF_EVACUATED) {
         return p;
@@ -140,14 +148,14 @@ markCAFs (evac_fn evac, void *user)
     StgIndStatic *c;
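+    // N.B. the list pointers may carry static-flag tag bits, so set the flag
+    // bits before comparing against the END_OF_CAF_LIST sentinel below; this
+    // makes the end-of-list check insensitive to the tag.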
 
     for (c = dyn_caf_list;
-         c != (StgIndStatic*)END_OF_CAF_LIST;
+         ((StgWord) c | STATIC_FLAG_LIST) != (StgWord)END_OF_CAF_LIST;
          c = (StgIndStatic *)c->static_link)
     {
         c = (StgIndStatic *)UNTAG_STATIC_LIST_PTR(c);
         evac(user, &c->indirectee);
     }
     for (c = revertible_caf_list;
-         c != (StgIndStatic*)END_OF_CAF_LIST;
+         ((StgWord) c | STATIC_FLAG_LIST) != (StgWord)END_OF_CAF_LIST;
          c = (StgIndStatic *)c->static_link)
     {
         c = (StgIndStatic *)UNTAG_STATIC_LIST_PTR(c);
diff --git a/rts/sm/GCThread.h b/rts/sm/GCThread.h
index 66f7a7f84fb7bd5855a0800599532ece48c6e1e9..3012f52f2822b4aff7f5edc74de0e000fd36a4bc 100644
--- a/rts/sm/GCThread.h
+++ b/rts/sm/GCThread.h
@@ -83,6 +83,7 @@ typedef struct gen_workspace_ {
     bdescr *     todo_bd;
     StgPtr       todo_free;            // free ptr for todo_bd
     StgPtr       todo_lim;             // lim for todo_bd
+    struct NonmovingSegment *todo_seg; // only available for oldest gen workspace
 
     WSDeque *    todo_q;
     bdescr *     todo_overflow;
@@ -100,9 +101,6 @@ typedef struct gen_workspace_ {
     bdescr *     part_list;
     StgWord      n_part_blocks;      // count of above
     StgWord      n_part_words;
-
-    StgWord pad[1];
-
 } gen_workspace ATTRIBUTE_ALIGNED(64);
 // align so that computing gct->gens[n] is a shift, not a multiply
 // fails if the size is <64, which is why we need the pad above
diff --git a/rts/sm/NonMoving.c b/rts/sm/NonMoving.c
new file mode 100644
index 0000000000000000000000000000000000000000..50cf784aab61f35cb06e96b3aceec6ca785e656a
--- /dev/null
+++ b/rts/sm/NonMoving.c
@@ -0,0 +1,1390 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2018
+ *
+ * Non-moving garbage collector and allocator
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "Capability.h"
+#include "Printer.h"
+#include "Storage.h"
+// We call evacuate, which expects the thread-local gc_thread to be valid;
+// this is sometimes declared as a register variable, so it is necessary to
+// to include the declaration so that the compiler doesn't clobber the register.
+#include "GCThread.h"
+#include "GCTDecl.h"
+
+#include "NonMoving.h"
+#include "NonMovingMark.h"
+#include "NonMovingSweep.h"
+#include "NonMovingCensus.h"
+#include "StablePtr.h" // markStablePtrTable
+#include "Schedule.h" // markScheduler
+#include "Weak.h" // dead_weak_ptr_list
+
+struct NonmovingHeap nonmovingHeap;
+
+uint8_t nonmovingMarkEpoch = 1;
+
+static void nonmovingBumpEpoch(void) {
+    nonmovingMarkEpoch = nonmovingMarkEpoch == 1 ? 2 : 1;
+}
+
+#if defined(THREADED_RTS)
+/*
+ * This mutex ensures that only one non-moving collection is active at a time.
+ */
+Mutex nonmoving_collection_mutex;
+
+OSThreadId mark_thread;
+bool concurrent_coll_running = false;
+Condition concurrent_coll_finished;
+Mutex concurrent_coll_finished_lock;
+#endif
+
+/*
+ * Note [Non-moving garbage collector]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * The sources rts/NonMoving*.c implement GHC's non-moving garbage collector
+ * for the oldest generation. In contrast to the throughput-oriented moving
+ * collector, the non-moving collector is designed to achieve low GC latencies
+ * on large heaps. It accomplishes low-latencies by way of a concurrent
+ * mark-and-sweep collection strategy on a specially-designed heap structure.
+ * While the design is described in detail in the design document found in
+ * docs/storage/nonmoving-gc, we briefly summarize the structure here.
+ *
+ *
+ * === Heap Structure ===
+ *
+ * The nonmoving heap (embodied by struct NonmovingHeap) consists of a family
+ * of allocators, each serving a range of allocation sizes. Each allocator
+ * consists of a set of *segments*, each of which contains fixed-size *blocks*
+ * (not to be confused with "blocks" provided by GHC's block allocator; this is
+ * admittedly an unfortunate overlap in terminology).  These blocks are the
+ * backing store for the allocator. In addition to blocks, the segment also
+ * contains some header information (see struct NonmovingSegment in
+ * NonMoving.h). This header contains a *bitmap* encoding one byte per block
+ * (used by the collector to record liveness), as well as the index of the next
+ * unallocated block (and a *snapshot* of this field which will be described in
+ * the next section).
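+ *
+ * For orientation, the segment header has roughly the following shape (the
+ * authoritative definition lives in NonMoving.h; this is an illustrative
+ * sketch only, and note that the block size and the next_free snapshot live
+ * in the segment's block descriptor, bd->nonmoving_segment):
+ *
+ *     struct NonmovingSegment {
+ *         struct NonmovingSegment *link;      // link in filled/active/free lists
+ *         struct NonmovingSegment *todo_link; // link in a GC thread's todo list
+ *         nonmoving_block_idx next_free;      // index of next unallocated block
+ *         uint8_t bitmap[];                   // one mark byte per block
+ *     };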
+ *
+ * Each allocator maintains three sets of segments:
+ *
+ *  - A *current* segment for each capability; this is the segment which that
+ *    capability will allocate into.
+ *
+ *  - A pool of *active* segments, each of which contains at least one
+ *    unallocated block. The allocator will take a segment from this pool when
+ *    it fills its *current* segment.
+ *
+ *  - A set of *filled* segments, which contain no unallocated blocks and will
+ *    be collected during the next major GC cycle.
+ *
+ * Storage for segments is allocated from the block allocator as an aligned
+ * group of NONMOVING_SEGMENT_BLOCKS blocks. This makes locating the segment
+ * header for a closure a simple matter of bit-masking (as implemented by
+ * nonmovingGetSegment).
+ *
+ * In addition, to relieve pressure on the block allocator we keep a small pool
+ * of free segments around (nonmovingHeap.free) which can be pushed/popped
+ * to/from in a lock-free manner.
+ *
+ *
+ * === Allocation ===
+ *
+ * The allocator (as implemented by nonmovingAllocate) starts by identifying
+ * which allocator the request should be made against. It then allocates into
+ * its local current segment and bumps the next_free pointer to point to the
+ * next unallocated block (as indicated by the bitmap). If it finds the current
+ * segment is now full it moves it to the filled list and looks for a new
+ * segment to make current from a few sources:
+ *
+ *  1. the allocator's active list (see pop_active_segment)
+ *  2. the nonmoving heap's free segment pool (see nonmovingPopFreeSegment)
+ *  3. allocate a new segment from the block allocator (see
+ *     nonmovingAllocSegment)
+ *
+ * Note that allocation does *not* involve modifying the bitmap. The bitmap is
+ * only modified by the collector.
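+ *
+ * Sketched in terms of the helpers defined later in this file (simplified;
+ * see nonmovingAllocate for the real code, which also updates the live data
+ * estimate and initialises freshly allocated segments):
+ *
+ *     if (advance_next_free(current, block_count)) {   // current is now full
+ *         nonmovingPushFilledSegment(current);
+ *         current = pop_active_segment(alloca);            // source 1
+ *         if (current == NULL)
+ *             current = nonmovingAllocSegment(cap->node);  // sources 2 and 3
+ *     }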
+ *
+ *
+ * === Snapshot invariant ===
+ *
+ * To safely collect in a concurrent setting, the collector relies on the
+ * notion of a *snapshot*. The snapshot is a hypothetical frozen state of the
+ * heap topology taken at the beginning of the major collection cycle.
+ * With this definition we require the following property of the mark phase,
+ * which we call the *snapshot invariant*,
+ *
+ *     All objects that were reachable at the time the snapshot was taken
+ *     must have their mark bits set at the end of the mark phase.
+ *
+ * As the mutator might change the topology of the heap while we are marking,
+ * maintaining this property requires some cooperation from the mutator.
+ * Specifically, we rely on a write barrier as described in Note [Update
+ * remembered set].
+ *
+ * To determine which objects existed when the snapshot was taken we record a
+ * snapshot of each segment's next_free pointer at the beginning of collection.
+ *
+ *
+ * === Collection ===
+ *
+ * Collection happens in a few phases some of which occur during a
+ * stop-the-world period (marked with [STW]) and others which can occur
+ * concurrently with mutation and minor collection (marked with [CONC]):
+ *
+ *  1. [STW] Preparatory GC: Here we do a standard minor collection of the
+ *     younger generations (which may evacuate things to the nonmoving heap).
+ *     References from younger generations into the nonmoving heap are recorded
+ *     in the mark queue (see Note [Aging under the non-moving collector] in
+ *     this file).
+ *
+ *  2. [STW] Snapshot update: Here we update the segment snapshot metadata
+ *     (see nonmovingPrepareMark) and move the filled segments to
+ *     nonmovingHeap.sweep_list, which is the set of segments which we will
+ *     sweep this GC cycle.
+ *
+ *  3. [STW] Root collection: Here we walk over a variety of root sources
+ *     and add them to the mark queue (see nonmovingCollect).
+ *
+ *  4. [CONC] Concurrent marking: Here we do the majority of marking concurrently
+ *     with mutator execution (but with the write barrier enabled; see
+ *     Note [Update remembered set]).
+ *
+ *  5. [STW] Final sync: Here we interrupt the mutators, ask them to
+ *     flush their final update remembered sets, and mark any new references
+ *     we find.
+ *
+ *  6. [CONC] Sweep: Here we walk over the nonmoving segments on sweep_list
+ *     and place them back on either the active, current, or filled list,
+ *     depending upon how much live data they contain.
+ *
+ *
+ * === Marking ===
+ *
+ * Ignoring large and static objects, marking a closure is fairly
+ * straightforward (implemented in NonMovingMark.c:mark_closure):
+ *
+ *  1. Check whether the closure is in the non-moving generation; if not then
+ *     we ignore it.
+ *  2. Find the segment containing the closure's block.
+ *  3. Check whether the closure's block is above seg->next_free_snap; if so
+ *     then the block was not allocated when we took the snapshot and therefore
+ *     we don't need to mark it.
+ *  4. Check whether the block's bitmap entry is equal to nonmovingMarkEpoch.
+ *     If so then we can stop as we have already marked it.
+ *  5. Push the closure's pointers to the mark queue.
+ *  6. Set the block's bitmap entry to nonmovingMarkEpoch.
+ *
+ * Note that the ordering of (5) and (6) is rather important, as described in
+ * Note [StgStack dirtiness flags and concurrent marking].
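+ *
+ * For illustration, steps (3) and (4) amount to roughly the following (a
+ * sketch only; the block-index helper is hypothetical and the real code
+ * lives in NonMovingMark.c:mark_closure):
+ *
+ *     nonmoving_block_idx i = block_idx_of(seg, p);         // hypothetical
+ *     if (i >= nonmovingSegmentInfo(seg)->next_free_snap)
+ *         return;                  // (3) not allocated at snapshot time
+ *     if (seg->bitmap[i] == nonmovingMarkEpoch)
+ *         return;                  // (4) already marked in this cycle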
+ *
+ *
+ * === Other references ===
+ *
+ * Apart from the design document in docs/storage/nonmoving-gc and the Ueno
+ * 2016 paper (TODO citation) from which it drew inspiration, there are a
+ * variety of other relevant Notes scattered throughout the tree:
+ *
+ *  - Note [Concurrent non-moving collection] (NonMoving.c) describes
+ *    concurrency control of the nonmoving collector
+ *
+ *  - Note [Live data accounting in nonmoving collector] (NonMoving.c)
+ *    describes how we track the quantity of live data in the nonmoving
+ *    generation.
+ *
+ *  - Note [Aging under the non-moving collector] (NonMoving.c) describes how
+ *    we accommodate aging
+ *
+ *  - Note [Large objects in the non-moving collector] (NonMovingMark.c)
+ *    describes how we track large objects.
+ *
+ *  - Note [Update remembered set] (NonMovingMark.c) describes the function and
+ *    implementation of the update remembered set used to realize the concurrent
+ *    write barrier.
+ *
+ *  - Note [Concurrent read barrier on deRefWeak#] (NonMovingMark.c) describes
+ *    the read barrier on Weak# objects.
+ *
+ *  - Note [Unintentional marking in resurrectThreads] (NonMovingMark.c) describes
+ *    a tricky interaction between the update remembered set flush and weak
+ *    finalization.
+ *
+ *  - Note [Origin references in the nonmoving collector] (NonMovingMark.h)
+ *    describes how we implement indirection short-cutting and the selector
+ *    optimisation.
+ *
+ *  - Note [StgStack dirtiness flags and concurrent marking] (TSO.h) describes
+ *    the protocol for concurrent marking of stacks.
+ *
+ *  - Note [Static objects under the nonmoving collector] (Storage.c) describes
+ *    treatment of static objects.
+ *
+ *
+ * Note [Concurrent non-moving collection]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * Concurrency-control of non-moving garbage collection is a bit tricky. There
+ * are a few things to keep in mind:
+ *
+ *  - Only one non-moving collection may be active at a time. This is enforced by the
+ *    concurrent_coll_running flag, which is set when a collection is on-going. If
+ *    we attempt to initiate a new collection while this is set we wait on the
+ *    concurrent_coll_finished condition variable, which signals when the
+ *    active collection finishes.
+ *
+ *  - In between the mark and sweep phases the non-moving collector must synchronize
+ *    with mutator threads to collect and mark their final update remembered
+ *    sets. This is accomplished using
+ *    stopAllCapabilitiesWith(SYNC_FLUSH_UPD_REM_SET). Capabilities are held
+ *    until the final mark has concluded.
+ *
+ * Note that the possibility of concurrent minor and non-moving collections
+ * requires that we handle static objects a bit specially. See
+ * Note [Static objects under the nonmoving collector] in Storage.c
+ * for details.
+ *
+ *
+ * Note [Aging under the non-moving collector]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * The initial design of the non-moving collector mandated that all live data
+ * be evacuated to the non-moving heap prior to a major collection. This
+ * simplified certain bits of implementation and eased reasoning. However, it
+ * was (unsurprisingly) also found to result in significant amounts of
+ * unnecessary copying.
+ *
+ * Consequently, we now allow aging. Aging allows the preparatory GC leading up
+ * to a major collection to evacuate some objects into the young generation.
+ * However, this introduces the following tricky case that might arise after
+ * we have finished the preparatory GC:
+ *
+ *       moving heap  ┆  non-moving heap
+ *     ───────────────┆──────────────────
+ *                    ┆
+ *      B ←────────────── A ←─────────────── root
+ *      │             ┆     ↖─────────────── gen1 mut_list
+ *      ╰───────────────→ C
+ *                    ┆
+ *
+ * In this case C is clearly live, but the non-moving collector can only see
+ * this by walking through B, which lives in the moving heap. However, doing so
+ * would require that we synchronize with the mutator/minor GC to ensure that it
+ * isn't in the middle of moving B. What to do?
+ *
+ * The solution we use here is to teach the preparatory moving collector to
+ * "evacuate" objects it encounters in the non-moving heap by adding them to
+ * the mark queue. This is implemented by pushing the object to the update
+ * remembered set of the capability held by the evacuating gc_thread
+ * (implemented by markQueuePushClosureGC).
+ *
+ * Consequently collection of the case above would proceed as follows:
+ *
+ *  1. Initial state:
+ *      * A lives in the non-moving heap and is reachable from the root set
+ *      * A is on the oldest generation's mut_list, since it contains a pointer
+ *        to B, which lives in a younger generation
+ *      * B lives in the moving collector's from space
+ *      * C lives in the non-moving heap
+ *
+ *  2. Preparatory GC: Scavenging mut_lists:
+ *
+ *     The mut_list of the oldest generation is scavenged, resulting in B being
+ *     evacuated (aged) into the moving collector's to-space.
+ *
+ *  3. Preparatory GC: Scavenge B
+ *
+ *     B (now in to-space) is scavenged, resulting in evacuation of C.
+ *     evacuate(C) pushes a reference to C to the mark queue.
+ *
+ *  4. Non-moving GC: C is marked
+ *
+ *     The non-moving collector will come to C in the mark queue and mark it.
+ *
+ *
+ * Note [Deadlock detection under nonmoving collector]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * In GHC the garbage collector is responsible for identifying deadlocked
+ * programs. Providing for this responsibility is slightly tricky in the
+ * non-moving collector due to the existence of aging. In particular, the
+ * non-moving collector cannot traverse objects living in a young generation
+ * but reachable from the non-moving generation, as described in Note [Aging
+ * under the non-moving collector].
+ *
+ * However, this can pose trouble for deadlock detection since it means that we
+ * may conservatively mark dead closures as live. Consider this case:
+ *
+ *       moving heap  ┆  non-moving heap
+ *     ───────────────┆──────────────────
+ *                    ┆
+ *      MVAR_QUEUE ←───── TSO ←───────────── gen1 mut_list
+ *        ↑  │  ╰────────↗  │
+ *        │  │        ┆     │
+ *        │  │        ┆     ↓
+ *        │  ╰──────────→ MVAR
+ *        ╰─────────────────╯
+ *                    ┆
+ *
+ * In this case we have a TSO blocked on a dead MVar. Because the MVAR_TSO_QUEUE on
+ * which it is blocked lives in the moving heap, the TSO is necessarily on the
+ * oldest generation's mut_list. As in Note [Aging under the non-moving
+ * collector], the MVAR_TSO_QUEUE will be evacuated. If MVAR_TSO_QUEUE is aged
+ * (e.g. evacuated to the young generation) then the MVAR will be added to the
+ * mark queue. Consequently, we will falsely conclude that the MVAR is still
+ * alive and fail to spot the deadlock.
+ *
+ * To avoid this sort of situation we disable aging when we are starting a
+ * major GC specifically for deadlock detection (as done by
+ * scheduleDetectDeadlock). This condition is recorded by the
+ * deadlock_detect_gc global variable declared in GC.h. Setting this has a few
+ * effects on the preparatory GC:
+ *
+ *  - Evac.c:alloc_for_copy forces evacuation to the non-moving generation.
+ *
+ *  - The evacuation logic usually responsible for pushing objects living in
+ *    the non-moving heap to the mark queue is disabled. This is safe because
+ *    we know that all live objects will be in the non-moving heap by the end
+ *    of the preparatory moving collection.
+ *
+ *
+ * Note [Live data accounting in nonmoving collector]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * The nonmoving collector uses an approximate heuristic for reporting live
+ * data quantity. Specifically, during mark we record how much live data we
+ * find in nonmoving_live_words. At the end of mark we declare this amount to
+ * be the amount of live data in the nonmoving heap (by setting
+ * oldest_gen->live_estimate).
+ *
+ * In addition, we update oldest_gen->live_estimate every time we fill a
+ * segment. This, as well, is quite approximate: we assume that all blocks
+ * above next_free_snap are newly-allocated. In principle we could refer to the
+ * bitmap to count how many blocks we actually allocated but this too would be
+ * approximate due to concurrent collection and ultimately seems more costly
+ * than the problem demands.
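+ *
+ * Concretely, when a segment fills up the allocator does roughly the
+ * following (see nonmovingAllocate):
+ *
+ *     new_blocks     = block_count - next_free_snap;
+ *     live_estimate += new_blocks * block_size / sizeof(W_);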
+ *
+ */
+
+memcount nonmoving_live_words = 0;
+
+#if defined(THREADED_RTS)
+static void* nonmovingConcurrentMark(void *mark_queue);
+#endif
+static void nonmovingClearBitmap(struct NonmovingSegment *seg);
+static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO **resurrected_threads);
+
+static void nonmovingInitSegment(struct NonmovingSegment *seg, uint8_t log_block_size)
+{
+    bdescr *bd = Bdescr((P_) seg);
+    seg->link = NULL;
+    seg->todo_link = NULL;
+    seg->next_free = 0;
+    nonmovingClearBitmap(seg);
+    bd->nonmoving_segment.log_block_size = log_block_size;
+    bd->nonmoving_segment.next_free_snap = 0;
+    bd->u.scan = nonmovingSegmentGetBlock(seg, 0);
+}
+
+// Add a segment to the free list.
+void nonmovingPushFreeSegment(struct NonmovingSegment *seg)
+{
+    // See Note [Live data accounting in nonmoving collector].
+    if (nonmovingHeap.n_free > NONMOVING_MAX_FREE) {
+        bdescr *bd = Bdescr((StgPtr) seg);
+        ACQUIRE_SM_LOCK;
+        ASSERT(oldest_gen->n_blocks >= bd->blocks);
+        ASSERT(oldest_gen->n_words >= BLOCK_SIZE_W * bd->blocks);
+        oldest_gen->n_blocks -= bd->blocks;
+        oldest_gen->n_words  -= BLOCK_SIZE_W * bd->blocks;
+        freeGroup(bd);
+        RELEASE_SM_LOCK;
+        return;
+    }
+
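+    // Push onto the lock-free free-segment list using a simple CAS loop.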
+    while (true) {
+        struct NonmovingSegment *old = nonmovingHeap.free;
+        seg->link = old;
+        if (cas((StgVolatilePtr) &nonmovingHeap.free, (StgWord) old, (StgWord) seg) == (StgWord) old)
+            break;
+    }
+    __sync_add_and_fetch(&nonmovingHeap.n_free, 1);
+}
+
+static struct NonmovingSegment *nonmovingPopFreeSegment(void)
+{
+    while (true) {
+        struct NonmovingSegment *seg = nonmovingHeap.free;
+        if (seg == NULL) {
+            return NULL;
+        }
+        if (cas((StgVolatilePtr) &nonmovingHeap.free,
+                (StgWord) seg,
+                (StgWord) seg->link) == (StgWord) seg) {
+            __sync_sub_and_fetch(&nonmovingHeap.n_free, 1);
+            return seg;
+        }
+    }
+}
+
+unsigned int nonmovingBlockCountFromSize(uint8_t log_block_size)
+{
+  // We compute the overwhelmingly common size cases directly to avoid a very
+  // expensive integer division.
+  switch (log_block_size) {
+    case 3:  return nonmovingBlockCount(3);
+    case 4:  return nonmovingBlockCount(4);
+    case 5:  return nonmovingBlockCount(5);
+    case 6:  return nonmovingBlockCount(6);
+    case 7:  return nonmovingBlockCount(7);
+    default: return nonmovingBlockCount(log_block_size);
+  }
+}
+
+/*
+ * Request a fresh segment from the free segment list or allocate one on the
+ * given node.
+ *
+ * Caller must hold SM_MUTEX (although we take the gc_alloc_block_sync spinlock
+ * under the assumption that we are in a GC context).
+ */
+static struct NonmovingSegment *nonmovingAllocSegment(uint32_t node)
+{
+    // First try taking something off of the free list
+    struct NonmovingSegment *ret;
+    ret = nonmovingPopFreeSegment();
+
+    // Nothing in the free list, allocate a new segment...
+    if (ret == NULL) {
+        // Take gc spinlock: another thread may be scavenging a moving
+        // generation and call `todo_block_full`
+        ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
+        bdescr *bd = allocAlignedGroupOnNode(node, NONMOVING_SEGMENT_BLOCKS);
+        // See Note [Live data accounting in nonmoving collector].
+        oldest_gen->n_blocks += bd->blocks;
+        oldest_gen->n_words  += BLOCK_SIZE_W * bd->blocks;
+        RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
+
+        for (StgWord32 i = 0; i < bd->blocks; ++i) {
+            initBdescr(&bd[i], oldest_gen, oldest_gen);
+            bd[i].flags = BF_NONMOVING;
+        }
+        ret = (struct NonmovingSegment *)bd->start;
+    }
+
+    // Check alignment
+    ASSERT(((uintptr_t)ret % NONMOVING_SEGMENT_SIZE) == 0);
+    return ret;
+}
+
+static inline unsigned long log2_floor(unsigned long x)
+{
+    return sizeof(unsigned long)*8 - 1 - __builtin_clzl(x);
+}
+
+static inline unsigned long log2_ceil(unsigned long x)
+{
+    unsigned long log = log2_floor(x);
+    return (x - (1 << log)) ? log + 1 : log;
+}
+
+// Advance a segment's next_free pointer. Returns true if the segment is full.
+static bool advance_next_free(struct NonmovingSegment *seg, const unsigned int blk_count)
+{
+    const uint8_t *bitmap = seg->bitmap;
+    ASSERT(blk_count == nonmovingSegmentBlockCount(seg));
+#if defined(NAIVE_ADVANCE_FREE)
+    // reference implementation
+    for (unsigned int i = seg->next_free+1; i < blk_count; i++) {
+        if (!bitmap[i]) {
+            seg->next_free = i;
+            return false;
+        }
+    }
+    seg->next_free = blk_count;
+    return true;
+#else
+    const uint8_t *c = memchr(&bitmap[seg->next_free+1], 0, blk_count - seg->next_free - 1);
+    if (c == NULL) {
+        seg->next_free = blk_count;
+        return true;
+    } else {
+        seg->next_free = c - bitmap;
+        return false;
+    }
+#endif
+}
+
+static struct NonmovingSegment *pop_active_segment(struct NonmovingAllocator *alloca)
+{
+    while (true) {
+        struct NonmovingSegment *seg = alloca->active;
+        if (seg == NULL) {
+            return NULL;
+        }
+        if (cas((StgVolatilePtr) &alloca->active,
+                (StgWord) seg,
+                (StgWord) seg->link) == (StgWord) seg) {
+            return seg;
+        }
+    }
+}
+
+/* Allocate a block in the nonmoving heap. Caller must hold SM_MUTEX. sz is in words */
+GNUC_ATTR_HOT
+void *nonmovingAllocate(Capability *cap, StgWord sz)
+{
+    unsigned int log_block_size = log2_ceil(sz * sizeof(StgWord));
+    unsigned int block_count = nonmovingBlockCountFromSize(log_block_size);
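+    // For example, on a 64-bit platform a request of sz = 5 words is 40 bytes,
+    // so log_block_size = 6 and the request is served from the 64-byte
+    // allocator (figures here are illustrative only).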
+
+    // The max we ever allocate is 3276 bytes (anything larger is a large
+    // object and not moved) which is covered by allocator 9.
+    ASSERT(log_block_size < NONMOVING_ALLOCA0 + NONMOVING_ALLOCA_CNT);
+
+    struct NonmovingAllocator *alloca = nonmovingHeap.allocators[log_block_size - NONMOVING_ALLOCA0];
+
+    // Allocate into current segment
+    struct NonmovingSegment *current = alloca->current[cap->no];
+    ASSERT(current); // current is never NULL
+    void *ret = nonmovingSegmentGetBlock_(current, log_block_size, current->next_free);
+    ASSERT(GET_CLOSURE_TAG(ret) == 0); // check alignment
+
+    // Advance the current segment's next_free or allocate a new segment if full
+    bool full = advance_next_free(current, block_count);
+    if (full) {
+        // Current segment is full: update the live data estimate, link the
+        // segment onto the filled list, then take an active segment if one
+        // exists, otherwise allocate a new segment.
+
+        // Update live data estimate.
+        // See Note [Live data accounting in nonmoving collector].
+        unsigned int new_blocks = block_count - nonmovingSegmentInfo(current)->next_free_snap;
+        unsigned int block_size = 1 << log_block_size;
+        atomic_inc(&oldest_gen->live_estimate, new_blocks * block_size / sizeof(W_));
+
+        // push the current segment to the filled list
+        nonmovingPushFilledSegment(current);
+
+        // first look for a new segment in the active list
+        struct NonmovingSegment *new_current = pop_active_segment(alloca);
+
+        // there are no active segments, allocate new segment
+        if (new_current == NULL) {
+            new_current = nonmovingAllocSegment(cap->node);
+            nonmovingInitSegment(new_current, log_block_size);
+        }
+
+        // make it current
+        new_current->link = NULL;
+        alloca->current[cap->no] = new_current;
+    }
+
+    return ret;
+}
+
+/* Allocate a nonmovingAllocator */
+static struct NonmovingAllocator *alloc_nonmoving_allocator(uint32_t n_caps)
+{
+    size_t allocator_sz =
+        sizeof(struct NonmovingAllocator) +
+        sizeof(void*) * n_caps; // current segment pointer for each capability
+    struct NonmovingAllocator *alloc =
+        stgMallocBytes(allocator_sz, "nonmovingInit");
+    memset(alloc, 0, allocator_sz);
+    return alloc;
+}
+
+static void free_nonmoving_allocator(struct NonmovingAllocator *alloc)
+{
+    stgFree(alloc);
+}
+
+void nonmovingInit(void)
+{
+    if (! RtsFlags.GcFlags.useNonmoving) return;
+#if defined(THREADED_RTS)
+    initMutex(&nonmoving_collection_mutex);
+    initCondition(&concurrent_coll_finished);
+    initMutex(&concurrent_coll_finished_lock);
+#endif
+    for (unsigned int i = 0; i < NONMOVING_ALLOCA_CNT; i++) {
+        nonmovingHeap.allocators[i] = alloc_nonmoving_allocator(n_capabilities);
+    }
+    nonmovingMarkInitUpdRemSet();
+}
+
+// Stop any nonmoving collection in preparation for RTS shutdown.
+void nonmovingStop(void)
+{
+    if (! RtsFlags.GcFlags.useNonmoving) return;
+#if defined(THREADED_RTS)
+    if (mark_thread) {
+        debugTrace(DEBUG_nonmoving_gc,
+                   "waiting for nonmoving collector thread to terminate");
+        ACQUIRE_LOCK(&concurrent_coll_finished_lock);
+        waitCondition(&concurrent_coll_finished, &concurrent_coll_finished_lock);
+    }
+#endif
+}
+
+void nonmovingExit(void)
+{
+    if (! RtsFlags.GcFlags.useNonmoving) return;
+
+    // First make sure collector is stopped before we tear things down.
+    nonmovingStop();
+
+#if defined(THREADED_RTS)
+    closeMutex(&concurrent_coll_finished_lock);
+    closeCondition(&concurrent_coll_finished);
+    closeMutex(&nonmoving_collection_mutex);
+#endif
+
+    for (unsigned int i = 0; i < NONMOVING_ALLOCA_CNT; i++) {
+        free_nonmoving_allocator(nonmovingHeap.allocators[i]);
+    }
+}
+
+/*
+ * Assumes that no garbage collector or mutator threads are running to safely
+ * resize the nonmoving_allocators.
+ *
+ * Must hold sm_mutex.
+ */
+void nonmovingAddCapabilities(uint32_t new_n_caps)
+{
+    unsigned int old_n_caps = nonmovingHeap.n_caps;
+    struct NonmovingAllocator **allocs = nonmovingHeap.allocators;
+
+    for (unsigned int i = 0; i < NONMOVING_ALLOCA_CNT; i++) {
+        struct NonmovingAllocator *old = allocs[i];
+        allocs[i] = alloc_nonmoving_allocator(new_n_caps);
+
+        // Copy the old state
+        allocs[i]->filled = old->filled;
+        allocs[i]->active = old->active;
+        for (unsigned int j = 0; j < old_n_caps; j++) {
+            allocs[i]->current[j] = old->current[j];
+        }
+        stgFree(old);
+
+        // Initialize current segments for the new capabilities
+        for (unsigned int j = old_n_caps; j < new_n_caps; j++) {
+            allocs[i]->current[j] = nonmovingAllocSegment(capabilities[j]->node);
+            nonmovingInitSegment(allocs[i]->current[j], NONMOVING_ALLOCA0 + i);
+            allocs[i]->current[j]->link = NULL;
+        }
+    }
+    nonmovingHeap.n_caps = new_n_caps;
+}
+
+static inline void nonmovingClearBitmap(struct NonmovingSegment *seg)
+{
+    unsigned int n = nonmovingSegmentBlockCount(seg);
+    memset(seg->bitmap, 0, n);
+}
+
+/* Prepare the heap bitmaps and snapshot metadata for a mark */
+static void nonmovingPrepareMark(void)
+{
+    // See Note [Static objects under the nonmoving collector].
+    prev_static_flag = static_flag;
+    static_flag =
+        static_flag == STATIC_FLAG_A ? STATIC_FLAG_B : STATIC_FLAG_A;
+
+    // Should have been cleared by the last sweep
+    ASSERT(nonmovingHeap.sweep_list == NULL);
+
+    nonmovingBumpEpoch();
+    for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) {
+        struct NonmovingAllocator *alloca = nonmovingHeap.allocators[alloca_idx];
+
+        // Update current segments' snapshot pointers
+        for (uint32_t cap_n = 0; cap_n < n_capabilities; ++cap_n) {
+            struct NonmovingSegment *seg = alloca->current[cap_n];
+            nonmovingSegmentInfo(seg)->next_free_snap = seg->next_free;
+        }
+
+        // Update filled segments' snapshot pointers and move to sweep_list
+        uint32_t n_filled = 0;
+        struct NonmovingSegment *const filled = alloca->filled;
+        alloca->filled = NULL;
+        if (filled) {
+            struct NonmovingSegment *seg = filled;
+            while (true) {
+                n_filled++;
+                prefetchForRead(seg->link);
+                // Clear bitmap
+                prefetchForWrite(seg->link->bitmap);
+                nonmovingClearBitmap(seg);
+                // Set snapshot
+                nonmovingSegmentInfo(seg)->next_free_snap = seg->next_free;
+                if (seg->link)
+                    seg = seg->link;
+                else
+                    break;
+            }
+            // add filled segments to sweep_list
+            seg->link = nonmovingHeap.sweep_list;
+            nonmovingHeap.sweep_list = filled;
+        }
+
+        // N.B. It's not necessary to update snapshot pointers of active segments;
+        // they were set after they were swept and haven't seen any allocation
+        // since.
+    }
+
+    // Clear large object bits of existing large objects
+    for (bdescr *bd = nonmoving_large_objects; bd; bd = bd->link) {
+        bd->flags &= ~BF_MARKED;
+    }
+
+    // Add newly promoted large objects and clear mark bits
+    bdescr *next;
+    ASSERT(oldest_gen->scavenged_large_objects == NULL);
+    for (bdescr *bd = oldest_gen->large_objects; bd; bd = next) {
+        next = bd->link;
+        bd->flags |= BF_NONMOVING_SWEEPING;
+        bd->flags &= ~BF_MARKED;
+        dbl_link_onto(bd, &nonmoving_large_objects);
+    }
+    n_nonmoving_large_blocks += oldest_gen->n_large_blocks;
+    oldest_gen->large_objects = NULL;
+    oldest_gen->n_large_words = 0;
+    oldest_gen->n_large_blocks = 0;
+    nonmoving_live_words = 0;
+
+    // Clear compact object mark bits
+    for (bdescr *bd = nonmoving_compact_objects; bd; bd = bd->link) {
+        bd->flags &= ~BF_MARKED;
+    }
+
+    // Move new compact objects from younger generations to nonmoving_compact_objects
+    for (bdescr *bd = oldest_gen->compact_objects; bd; bd = next) {
+        next = bd->link;
+        bd->flags |= BF_NONMOVING_SWEEPING;
+        bd->flags &= ~BF_MARKED;
+        dbl_link_onto(bd, &nonmoving_compact_objects);
+    }
+    n_nonmoving_compact_blocks += oldest_gen->n_compact_blocks;
+    oldest_gen->n_compact_blocks = 0;
+    oldest_gen->compact_objects = NULL;
+    // TODO (osa): what about "in import" stuff??
+
+#if defined(DEBUG)
+    debug_caf_list_snapshot = debug_caf_list;
+    debug_caf_list = (StgIndStatic*)END_OF_CAF_LIST;
+#endif
+}
+
+// Mark weak pointers in the non-moving heap. They'll either end up in
+// dead_weak_ptr_list or stay in weak_ptr_list. Either way they need to be kept
+// during sweep. See `MarkWeak.c:markWeakPtrList` for the moving heap variant
+// of this.
+static void nonmovingMarkWeakPtrList(MarkQueue *mark_queue, StgWeak *dead_weak_ptr_list)
+{
+    for (StgWeak *w = oldest_gen->weak_ptr_list; w; w = w->link) {
+        markQueuePushClosure_(mark_queue, (StgClosure*)w);
+        // Do not mark finalizers and values here, those fields will be marked
+        // in `nonmovingMarkDeadWeaks` (for dead weaks) or
+        // `nonmovingTidyWeaks` (for live weaks)
+    }
+
+    // We need to mark dead_weak_ptr_list too. This is subtle:
+    //
+    // - By the beginning of this GC we evacuated all weaks to the non-moving
+    //   heap (in `markWeakPtrList`)
+    //
+    // - During the scavenging of the moving heap we discovered that some of
+    //   those weaks are dead and moved them to `dead_weak_ptr_list`. Note that
+    //   because of the fact above _all weaks_ are in the non-moving heap at
+    //   this point.
+    //
+    // - So, to be able to traverse `dead_weak_ptr_list` and run finalizers we
+    //   need to mark it.
+    for (StgWeak *w = dead_weak_ptr_list; w; w = w->link) {
+        markQueuePushClosure_(mark_queue, (StgClosure*)w);
+        nonmovingMarkDeadWeak(mark_queue, w);
+    }
+}
+
+void nonmovingCollect(StgWeak **dead_weaks, StgTSO **resurrected_threads)
+{
+#if defined(THREADED_RTS)
+    // We can't start a new collection until the old one has finished.
+    // We also don't run during the final GC.
+    if (concurrent_coll_running || sched_state > SCHED_RUNNING) {
+        return;
+    }
+#endif
+
+    trace(TRACE_nonmoving_gc, "Starting nonmoving GC preparation");
+    resizeGenerations();
+
+    nonmovingPrepareMark();
+
+    // N.B. These should have been cleared at the end of the last sweep.
+    ASSERT(nonmoving_marked_large_objects == NULL);
+    ASSERT(n_nonmoving_marked_large_blocks == 0);
+    ASSERT(nonmoving_marked_compact_objects == NULL);
+    ASSERT(n_nonmoving_marked_compact_blocks == 0);
+
+    MarkQueue *mark_queue = stgMallocBytes(sizeof(MarkQueue), "mark queue");
+    initMarkQueue(mark_queue);
+    current_mark_queue = mark_queue;
+
+    // Mark roots
+    trace(TRACE_nonmoving_gc, "Marking roots for nonmoving GC");
+    markCAFs((evac_fn)markQueueAddRoot, mark_queue);
+    for (unsigned int n = 0; n < n_capabilities; ++n) {
+        markCapability((evac_fn)markQueueAddRoot, mark_queue,
+                capabilities[n], true/*don't mark sparks*/);
+    }
+    markScheduler((evac_fn)markQueueAddRoot, mark_queue);
+    nonmovingMarkWeakPtrList(mark_queue, *dead_weaks);
+    markStablePtrTable((evac_fn)markQueueAddRoot, mark_queue);
+
+    // Mark threads resurrected during moving heap scavenging
+    for (StgTSO *tso = *resurrected_threads; tso != END_TSO_QUEUE; tso = tso->global_link) {
+        markQueuePushClosure_(mark_queue, (StgClosure*)tso);
+    }
+    trace(TRACE_nonmoving_gc, "Finished marking roots for nonmoving GC");
+
+    // Roots marked, mark threads and weak pointers
+
+    // At this point all threads are moved to threads list (from old_threads)
+    // and all weaks are moved to weak_ptr_list (from old_weak_ptr_list) by
+    // the previous scavenge step, so we need to move them to "old" lists
+    // again.
+
+    // Fine to overwrite old_threads because any live or resurrected threads have
+    // been moved to the threads or resurrected_threads lists.
+    ASSERT(oldest_gen->old_threads == END_TSO_QUEUE);
+    ASSERT(nonmoving_old_threads == END_TSO_QUEUE);
+    nonmoving_old_threads = oldest_gen->threads;
+    oldest_gen->threads = END_TSO_QUEUE;
+
+    // Make sure we don't lose any weak ptrs here. Weaks in old_weak_ptr_list
+    // will either be moved to `dead_weaks` (if dead) or `weak_ptr_list` (if
+    // alive).
+    ASSERT(oldest_gen->old_weak_ptr_list == NULL);
+    ASSERT(nonmoving_old_weak_ptr_list == NULL);
+    nonmoving_old_weak_ptr_list = oldest_gen->weak_ptr_list;
+    oldest_gen->weak_ptr_list = NULL;
+    trace(TRACE_nonmoving_gc, "Finished nonmoving GC preparation");
+
+    // We are now safe to start concurrent marking
+
+    // Note that in concurrent mark we can't use dead_weaks and
+    // resurrected_threads from the preparation to add new weaks and threads as
+    // that would cause races between minor collection and mark. So we only pass
+    // those lists to the mark function in the sequential case; in the concurrent
+    // case we allocate fresh lists.
+
+#if defined(THREADED_RTS)
+    // If we're interrupting or shutting down, do not let this capability go and
+    // run an STW collection. Reason: we won't be able to acquire this capability
+    // again for the sync if we let it go, because it'll immediately start doing
+    // a major GC, because that's what we do when exiting the scheduler (see
+    // exitScheduler()).
+    if (sched_state == SCHED_RUNNING) {
+        concurrent_coll_running = true;
+        nonmoving_write_barrier_enabled = true;
+        debugTrace(DEBUG_nonmoving_gc, "Starting concurrent mark thread");
+        createOSThread(&mark_thread, "non-moving mark thread",
+                       nonmovingConcurrentMark, mark_queue);
+    } else {
+        nonmovingConcurrentMark(mark_queue);
+    }
+#else
+    // Use the weak and thread lists from the preparation for any new weaks and
+    // threads found to be dead in mark.
+    nonmovingMark_(mark_queue, dead_weaks, resurrected_threads);
+#endif
+}
+
+/* Mark the mark queue, threads, and weak pointers until no more weaks have
+ * been resuscitated.
+ */
+static void nonmovingMarkThreadsWeaks(MarkQueue *mark_queue)
+{
+    while (true) {
+        // Propagate marks
+        nonmovingMark(mark_queue);
+
+        // Tidy threads and weaks
+        nonmovingTidyThreads();
+
+        if (! nonmovingTidyWeaks(mark_queue))
+            return;
+    }
+}
+
+#if defined(THREADED_RTS)
+static void* nonmovingConcurrentMark(void *data)
+{
+    MarkQueue *mark_queue = (MarkQueue*)data;
+    StgWeak *dead_weaks = NULL;
+    StgTSO *resurrected_threads = (StgTSO*)&stg_END_TSO_QUEUE_closure;
+    nonmovingMark_(mark_queue, &dead_weaks, &resurrected_threads);
+    return NULL;
+}
+
+// TODO: Not sure where to put this function.
+// Append w2 to the end of w1.
+static void appendWeakList( StgWeak **w1, StgWeak *w2 )
+{
+    while (*w1) {
+        w1 = &(*w1)->link;
+    }
+    *w1 = w2;
+}
+#endif
+
+static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO **resurrected_threads)
+{
+    ACQUIRE_LOCK(&nonmoving_collection_mutex);
+    debugTrace(DEBUG_nonmoving_gc, "Starting mark...");
+
+    // Do concurrent marking; most of the heap will get marked here.
+    nonmovingMarkThreadsWeaks(mark_queue);
+
+#if defined(THREADED_RTS)
+    Task *task = newBoundTask();
+
+    // If we've decided to exit at this point then just return
+    if (sched_state > SCHED_RUNNING) {
+        // Note that we break our invariants here and leave segments in
+        // nonmovingHeap.sweep_list, don't free nonmoving_large_objects etc.
+        // However because we won't be running mark-sweep in the final GC this
+        // is OK.
+
+        // This is an RTS shutdown, so we need to move our copy (snapshot) of
+        // the weaks (nonmoving_old_weak_ptr_list and nonmoving_weak_ptr_list) to
+        // oldest_gen->weak_ptr_list to be able to run C finalizers in hs_exit_.
+        // Note that more weaks may have been added to oldest_gen->weak_ptr_list
+        // since we started the mark, so we need to append our list to its tail.
+        appendWeakList(&nonmoving_old_weak_ptr_list, nonmoving_weak_ptr_list);
+        appendWeakList(&oldest_gen->weak_ptr_list, nonmoving_old_weak_ptr_list);
+        // These lists won't be used again, so resetting them is not strictly
+        // necessary, but we do it anyway for tidiness.
+        nonmoving_old_weak_ptr_list = NULL;
+        nonmoving_weak_ptr_list = NULL;
+
+        goto finish;
+    }
+
+    // We're still running, request a sync
+    nonmovingBeginFlush(task);
+
+    bool all_caps_syncd;
+    do {
+        all_caps_syncd = nonmovingWaitForFlush();
+        nonmovingMarkThreadsWeaks(mark_queue);
+    } while (!all_caps_syncd);
+#endif
+
+    nonmovingResurrectThreads(mark_queue, resurrected_threads);
+
+    // No more resurrecting threads after this point
+
+    // Do last marking of weak pointers
+    while (true) {
+        // Propagate marks
+        nonmovingMark(mark_queue);
+
+        if (!nonmovingTidyWeaks(mark_queue))
+            break;
+    }
+
+    nonmovingMarkDeadWeaks(mark_queue, dead_weaks);
+
+    // Propagate marks
+    nonmovingMark(mark_queue);
+
+    // Now remove all dead objects from the mut_list to ensure that a younger
+    // generation collection doesn't attempt to look at them after we've swept.
+    nonmovingSweepMutLists();
+
+    debugTrace(DEBUG_nonmoving_gc,
+               "Done marking, resurrecting threads before releasing capabilities");
+
+
+    // Schedule finalizers and resurrect threads
+#if defined(THREADED_RTS)
+    // Just pick an arbitrary capability. Not sure if this is a good idea -- we use
+    // only one capability for all finalizers.
+    scheduleFinalizers(capabilities[0], *dead_weaks);
+    // Note that this mutates heap and causes running write barriers.
+    // See Note [Unintentional marking in resurrectThreads] in NonMovingMark.c
+    // for how we deal with this.
+    resurrectThreads(*resurrected_threads);
+#endif
+
+#if defined(DEBUG)
+    // Zap CAFs that we will sweep
+    nonmovingGcCafs();
+#endif
+
+    ASSERT(mark_queue->top->head == 0);
+    ASSERT(mark_queue->blocks->link == NULL);
+
+    // Update oldest_gen thread and weak lists
+    // Note that we need to append these lists as a concurrent minor GC may have
+    // added stuff to them while we're doing mark-sweep concurrently
+    {
+        StgTSO **threads = &oldest_gen->threads;
+        while (*threads != END_TSO_QUEUE) {
+            threads = &(*threads)->global_link;
+        }
+        *threads = nonmoving_threads;
+        nonmoving_threads = END_TSO_QUEUE;
+        nonmoving_old_threads = END_TSO_QUEUE;
+    }
+
+    {
+        StgWeak **weaks = &oldest_gen->weak_ptr_list;
+        while (*weaks) {
+            weaks = &(*weaks)->link;
+        }
+        *weaks = nonmoving_weak_ptr_list;
+        nonmoving_weak_ptr_list = NULL;
+        nonmoving_old_weak_ptr_list = NULL;
+    }
+
+    // Everything has been marked; allow the mutators to proceed
+#if defined(THREADED_RTS)
+    nonmoving_write_barrier_enabled = false;
+    nonmovingFinishFlush(task);
+#endif
+
+    current_mark_queue = NULL;
+    freeMarkQueue(mark_queue);
+    stgFree(mark_queue);
+
+    oldest_gen->live_estimate = nonmoving_live_words;
+    oldest_gen->n_old_blocks = 0;
+    resizeGenerations();
+
+    /****************************************************
+     * Sweep
+     ****************************************************/
+
+    traceConcSweepBegin();
+
+    // Because we can't mark large object blocks (there is no room for a mark
+    // bit) we collect them in a map in the mark queue and use it here to sweep
+    // the large objects.
+    nonmovingSweepLargeObjects();
+    nonmovingSweepCompactObjects();
+    nonmovingSweepStableNameTable();
+
+    nonmovingSweep();
+    ASSERT(nonmovingHeap.sweep_list == NULL);
+    debugTrace(DEBUG_nonmoving_gc, "Finished sweeping.");
+    traceConcSweepEnd();
+#if defined(DEBUG)
+    if (RtsFlags.DebugFlags.nonmoving_gc)
+        nonmovingPrintAllocatorCensus();
+#endif
+
+    // TODO: Remainder of things done by GarbageCollect (update stats)
+
+#if defined(THREADED_RTS)
+finish:
+    boundTaskExiting(task);
+
+    // We are done...
+    mark_thread = 0;
+
+    // Signal that the concurrent collection is finished, allowing the next
+    // non-moving collection to proceed
+    concurrent_coll_running = false;
+    signalCondition(&concurrent_coll_finished);
+    RELEASE_LOCK(&nonmoving_collection_mutex);
+#endif
+}
+
+#if defined(DEBUG)
+
+// Use this with caution: it doesn't work correctly during the scavenge phase
+// when we're doing parallel scavenging. Use it in the mark phase or later
+// (when we no longer allocate).
+void assert_in_nonmoving_heap(StgPtr p)
+{
+    if (!HEAP_ALLOCED_GC(p))
+        return;
+
+    bdescr *bd = Bdescr(p);
+    if (bd->flags & BF_LARGE) {
+        // It should be in a capability (if it's not filled yet) or in the non-moving heap
+        for (uint32_t cap = 0; cap < n_capabilities; ++cap) {
+            if (bd == capabilities[cap]->pinned_object_block) {
+                return;
+            }
+        }
+        ASSERT(bd->flags & BF_NONMOVING);
+        return;
+    }
+
+    // Search snapshot segments
+    for (struct NonmovingSegment *seg = nonmovingHeap.sweep_list; seg; seg = seg->link) {
+        if (p >= (P_)seg && p < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) {
+            return;
+        }
+    }
+
+    for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) {
+        struct NonmovingAllocator *alloca = nonmovingHeap.allocators[alloca_idx];
+        // Search current segments
+        for (uint32_t cap_idx = 0; cap_idx < n_capabilities; ++cap_idx) {
+            struct NonmovingSegment *seg = alloca->current[cap_idx];
+            if (p >= (P_)seg && p < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) {
+                return;
+            }
+        }
+
+        // Search active segments
+        int seg_idx = 0;
+        struct NonmovingSegment *seg = alloca->active;
+        while (seg) {
+            if (p >= (P_)seg && p < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) {
+                return;
+            }
+            seg_idx++;
+            seg = seg->link;
+        }
+
+        // Search filled segments
+        seg_idx = 0;
+        seg = alloca->filled;
+        while (seg) {
+            if (p >= (P_)seg && p < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) {
+                return;
+            }
+            seg_idx++;
+            seg = seg->link;
+        }
+    }
+
+    // We don't search free segments as they're unused
+
+    barf("%p is not in nonmoving heap\n", (void*)p);
+}
+
+void nonmovingPrintSegment(struct NonmovingSegment *seg)
+{
+    int num_blocks = nonmovingSegmentBlockCount(seg);
+    uint8_t log_block_size = nonmovingSegmentLogBlockSize(seg);
+
+    debugBelch("Segment with %d blocks of size 2^%d (%d bytes, %u words, scan: %p)\n",
+               num_blocks,
+               log_block_size,
+               1 << log_block_size,
+               (unsigned int) ROUNDUP_BYTES_TO_WDS(1 << log_block_size),
+               (void*)Bdescr((P_)seg)->u.scan);
+
+    for (nonmoving_block_idx p_idx = 0; p_idx < seg->next_free; ++p_idx) {
+        StgClosure *p = (StgClosure*)nonmovingSegmentGetBlock(seg, p_idx);
+        if (nonmovingGetMark(seg, p_idx) != 0) {
+            debugBelch("%d (%p)* :\t", p_idx, (void*)p);
+        } else {
+            debugBelch("%d (%p)  :\t", p_idx, (void*)p);
+        }
+        printClosure(p);
+    }
+
+    debugBelch("End of segment\n\n");
+}
+
+void nonmovingPrintAllocator(struct NonmovingAllocator *alloc)
+{
+    debugBelch("Allocator at %p\n", (void*)alloc);
+    debugBelch("Filled segments:\n");
+    for (struct NonmovingSegment *seg = alloc->filled; seg != NULL; seg = seg->link) {
+        debugBelch("%p ", (void*)seg);
+    }
+    debugBelch("\nActive segments:\n");
+    for (struct NonmovingSegment *seg = alloc->active; seg != NULL; seg = seg->link) {
+        debugBelch("%p ", (void*)seg);
+    }
+    debugBelch("\nCurrent segments:\n");
+    for (uint32_t i = 0; i < n_capabilities; ++i) {
+        debugBelch("%p ", alloc->current[i]);
+    }
+    debugBelch("\n");
+}
+
+void locate_object(P_ obj)
+{
+    // Search allocators
+    for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) {
+        struct NonmovingAllocator *alloca = nonmovingHeap.allocators[alloca_idx];
+        for (uint32_t cap = 0; cap < n_capabilities; ++cap) {
+            struct NonmovingSegment *seg = alloca->current[cap];
+            if (obj >= (P_)seg && obj < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) {
+                debugBelch("%p is in current segment of capability %d of allocator %d at %p\n", obj, cap, alloca_idx, (void*)seg);
+                return;
+            }
+        }
+        int seg_idx = 0;
+        struct NonmovingSegment *seg = alloca->active;
+        while (seg) {
+            if (obj >= (P_)seg && obj < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) {
+                debugBelch("%p is in active segment %d of allocator %d at %p\n", obj, seg_idx, alloca_idx, (void*)seg);
+                return;
+            }
+            seg_idx++;
+            seg = seg->link;
+        }
+
+        seg_idx = 0;
+        seg = alloca->filled;
+        while (seg) {
+            if (obj >= (P_)seg && obj < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) {
+                debugBelch("%p is in filled segment %d of allocator %d at %p\n", obj, seg_idx, alloca_idx, (void*)seg);
+                return;
+            }
+            seg_idx++;
+            seg = seg->link;
+        }
+    }
+
+    struct NonmovingSegment *seg = nonmovingHeap.free;
+    int seg_idx = 0;
+    while (seg) {
+        if (obj >= (P_)seg && obj < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) {
+            debugBelch("%p is in free segment %d at %p\n", obj, seg_idx, (void*)seg);
+            return;
+        }
+        seg_idx++;
+        seg = seg->link;
+    }
+
+    // Search nurseries
+    for (uint32_t nursery_idx = 0; nursery_idx < n_nurseries; ++nursery_idx) {
+        for (bdescr* nursery_block = nurseries[nursery_idx].blocks; nursery_block; nursery_block = nursery_block->link) {
+            if (obj >= nursery_block->start && obj <= nursery_block->start + nursery_block->blocks*BLOCK_SIZE_W) {
+                debugBelch("%p is in nursery %d\n", obj, nursery_idx);
+                return;
+            }
+        }
+    }
+
+    // Search generations
+    for (uint32_t g = 0; g < RtsFlags.GcFlags.generations - 1; ++g) {
+        generation *gen = &generations[g];
+        for (bdescr *blk = gen->blocks; blk; blk = blk->link) {
+            if (obj >= blk->start && obj < blk->free) {
+                debugBelch("%p is in generation %" FMT_Word32 " blocks\n", obj, g);
+                return;
+            }
+        }
+        for (bdescr *blk = gen->old_blocks; blk; blk = blk->link) {
+            if (obj >= blk->start && obj < blk->free) {
+                debugBelch("%p is in generation %" FMT_Word32 " old blocks\n", obj, g);
+                return;
+            }
+        }
+    }
+
+    // Search large objects
+    for (uint32_t g = 0; g < RtsFlags.GcFlags.generations - 1; ++g) {
+        generation *gen = &generations[g];
+        for (bdescr *large_block = gen->large_objects; large_block; large_block = large_block->link) {
+            if ((P_)large_block->start == obj) {
+                debugBelch("%p is in large blocks of generation %d\n", obj, g);
+                return;
+            }
+        }
+    }
+
+    for (bdescr *large_block = nonmoving_large_objects; large_block; large_block = large_block->link) {
+        if ((P_)large_block->start == obj) {
+            debugBelch("%p is in nonmoving_large_objects\n", obj);
+            return;
+        }
+    }
+
+    for (bdescr *large_block = nonmoving_marked_large_objects; large_block; large_block = large_block->link) {
+        if ((P_)large_block->start == obj) {
+            debugBelch("%p is in nonmoving_marked_large_objects\n", obj);
+            return;
+        }
+    }
+
+    // Search workspaces FIXME only works in non-threaded runtime
+#if !defined(THREADED_RTS)
+    for (uint32_t g = 0; g < RtsFlags.GcFlags.generations - 1; ++ g) {
+        gen_workspace *ws = &gct->gens[g];
+        for (bdescr *blk = ws->todo_bd; blk; blk = blk->link) {
+            if (obj >= blk->start && obj < blk->free) {
+                debugBelch("%p is in generation %" FMT_Word32 " todo bds\n", obj, g);
+                return;
+            }
+        }
+        for (bdescr *blk = ws->scavd_list; blk; blk = blk->link) {
+            if (obj >= blk->start && obj < blk->free) {
+                debugBelch("%p is in generation %" FMT_Word32 " scavd bds\n", obj, g);
+                return;
+            }
+        }
+        for (bdescr *blk = ws->todo_large_objects; blk; blk = blk->link) {
+            if (obj >= blk->start && obj < blk->free) {
+                debugBelch("%p is in generation %" FMT_Word32 " todo large bds\n", obj, g);
+                return;
+            }
+        }
+    }
+#endif
+}
+
+void nonmovingPrintSweepList()
+{
+    debugBelch("==== SWEEP LIST =====\n");
+    int i = 0;
+    for (struct NonmovingSegment *seg = nonmovingHeap.sweep_list; seg; seg = seg->link) {
+        debugBelch("%d: %p\n", i++, (void*)seg);
+    }
+    debugBelch("= END OF SWEEP LIST =\n");
+}
+
+void check_in_mut_list(StgClosure *p)
+{
+    for (uint32_t cap_n = 0; cap_n < n_capabilities; ++cap_n) {
+        for (bdescr *bd = capabilities[cap_n]->mut_lists[oldest_gen->no]; bd; bd = bd->link) {
+            for (StgPtr q = bd->start; q < bd->free; ++q) {
+                if (*((StgPtr**)q) == (StgPtr*)p) {
+                    debugBelch("Object is in mut list of cap %d: %p\n", cap_n, capabilities[cap_n]->mut_lists[oldest_gen->no]);
+                    return;
+                }
+            }
+        }
+    }
+
+    debugBelch("Object is not in a mut list\n");
+}
+
+void print_block_list(bdescr* bd)
+{
+    while (bd) {
+        debugBelch("%p, ", (void*)bd);
+        bd = bd->link;
+    }
+    debugBelch("\n");
+}
+
+void print_thread_list(StgTSO* tso)
+{
+    while (tso != END_TSO_QUEUE) {
+        printClosure((StgClosure*)tso);
+        tso = tso->global_link;
+    }
+}
+
+#endif
diff --git a/rts/sm/NonMoving.h b/rts/sm/NonMoving.h
new file mode 100644
index 0000000000000000000000000000000000000000..b3d4e14065cd718327b2fd16fa84b12a81e9a93b
--- /dev/null
+++ b/rts/sm/NonMoving.h
@@ -0,0 +1,335 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2018
+ *
+ * Non-moving garbage collector and allocator
+ *
+ * ---------------------------------------------------------------------------*/
+
+#pragma once
+
+#if !defined(CMINUSMINUS)
+
+#include <string.h>
+#include "HeapAlloc.h"
+#include "NonMovingMark.h"
+
+#include "BeginPrivate.h"
+
+// Segments
+#define NONMOVING_SEGMENT_BITS 15   // 2^15 = 32kByte
+// Mask to find base of segment
+#define NONMOVING_SEGMENT_MASK ((1 << NONMOVING_SEGMENT_BITS) - 1)
+// In bytes
+#define NONMOVING_SEGMENT_SIZE (1 << NONMOVING_SEGMENT_BITS)
+// In words
+#define NONMOVING_SEGMENT_SIZE_W ((1 << NONMOVING_SEGMENT_BITS) / SIZEOF_VOID_P)
+// In blocks
+#define NONMOVING_SEGMENT_BLOCKS (NONMOVING_SEGMENT_SIZE / BLOCK_SIZE)
+
+_Static_assert(NONMOVING_SEGMENT_SIZE % BLOCK_SIZE == 0,
+               "non-moving segment size must be multiple of block size");
+
+// The index of a block within a segment
+typedef uint16_t nonmoving_block_idx;
+
+// A non-moving heap segment
+struct NonmovingSegment {
+    struct NonmovingSegment *link;     // for linking together segments into lists
+    struct NonmovingSegment *todo_link; // NULL when not in todo list
+    nonmoving_block_idx next_free;      // index of the next unallocated block
+    uint8_t bitmap[];                   // liveness bitmap
+    // After the liveness bitmap come the data blocks. Note that we need to
+    // ensure that the size of this struct (including the bitmap) is a multiple
+    // of the word size since GHC assumes that all object pointers are
+    // so-aligned.
+
+    // N.B. There are also bits of information which are stored in the
+    // NonmovingBlockInfo stored in the segment's block descriptor. Namely:
+    //
+    //  * the block size can be found in nonmovingBlockInfo(seg)->log_block_size.
+    //  * the next_free snapshot can be found in
+    //    nonmovingBlockInfo(seg)->next_free_snap.
+    //
+    // This allows us to mark a nonmoving closure without bringing the
+    // NonmovingSegment header into cache.
+};
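+
+/* Rough layout of a segment in memory (a sketch; exact offsets depend on
+ * padding and word size):
+ *
+ *   segment base (aligned to NONMOVING_SEGMENT_SIZE)
+ *     header:  link, todo_link, next_free   (struct NonmovingSegment)
+ *     bitmap:  one byte per block
+ *     padding: up to a word boundary
+ *     blocks:  block 0, block 1, ..., each 2^log_block_size bytes
+ */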
+
+// This is how we mark the end of todo lists. It is not NULL because
+// todo_link == NULL means the segment is not in a list.
+#define END_NONMOVING_TODO_LIST ((struct NonmovingSegment*)1)
+
+// A non-moving allocator for a particular block size
+struct NonmovingAllocator {
+    struct NonmovingSegment *filled;
+    struct NonmovingSegment *active;
+    // indexed by capability number
+    struct NonmovingSegment *current[];
+};
+
+// first allocator is of size 2^NONMOVING_ALLOCA0 (in bytes)
+#define NONMOVING_ALLOCA0 3
+
+// allocators cover block sizes of 2^NONMOVING_ALLOCA0 to
+// 2^(NONMOVING_ALLOCA0 + NONMOVING_ALLOCA_CNT) (in bytes)
+#define NONMOVING_ALLOCA_CNT 12
+
+// maximum number of free segments to hold on to
+#define NONMOVING_MAX_FREE 16
+
+struct NonmovingHeap {
+    struct NonmovingAllocator *allocators[NONMOVING_ALLOCA_CNT];
+    // free segment list. This is a cache where we keep up to
+    // NONMOVING_MAX_FREE segments to avoid thrashing the block allocator.
+    // Note that segments in this list are still counted towards
+    // oldest_gen->n_blocks.
+    struct NonmovingSegment *free;
+    // How many segments are in the free segment list? Accessed atomically.
+    unsigned int n_free;
+
+    // records the current length of the NonmovingAllocator.current arrays
+    unsigned int n_caps;
+
+    // The set of segments being swept in this GC. Segments are moved here from
+    // the filled list during preparation and moved back to either the filled,
+    // active, or free lists during sweep.  Should be NULL before mark and
+    // after sweep.
+    struct NonmovingSegment *sweep_list;
+};
+
+extern struct NonmovingHeap nonmovingHeap;
+
+extern memcount nonmoving_live_words;
+
+#if defined(THREADED_RTS)
+extern bool concurrent_coll_running;
+#endif
+
+void nonmovingInit(void);
+void nonmovingStop(void);
+void nonmovingExit(void);
+
+
+// dead_weaks and resurrected_threads lists are used for two things:
+//
+// - The weaks and threads in those lists are found to be dead during
+//   preparation, but the weaks will be used for finalization and threads will
+//   be scheduled again (aka. resurrection) so we need to keep them alive in the
+//   non-moving heap as well. So we treat them as roots and mark them.
+//
+// - In non-threaded runtime we add weaks and threads found to be dead in the
+//   non-moving heap to those lists so that they'll be finalized and scheduled
+//   as other weaks and threads. In threaded runtime we can't do this as that'd
+//   cause races between a minor collection and non-moving collection. Instead
+//   in non-moving heap we finalize the weaks and resurrect the threads
+//   directly, but in a pause.
+//
+void nonmovingCollect(StgWeak **dead_weaks,
+                       StgTSO **resurrected_threads);
+
+void *nonmovingAllocate(Capability *cap, StgWord sz);
+void nonmovingAddCapabilities(uint32_t new_n_caps);
+void nonmovingPushFreeSegment(struct NonmovingSegment *seg);
+
+
+INLINE_HEADER struct NonmovingSegmentInfo *nonmovingSegmentInfo(struct NonmovingSegment *seg) {
+    return &Bdescr((StgPtr) seg)->nonmoving_segment;
+}
+
+INLINE_HEADER uint8_t nonmovingSegmentLogBlockSize(struct NonmovingSegment *seg) {
+    return nonmovingSegmentInfo(seg)->log_block_size;
+}
+
+// Add a segment to the appropriate active list.
+INLINE_HEADER void nonmovingPushActiveSegment(struct NonmovingSegment *seg)
+{
+    struct NonmovingAllocator *alloc =
+        nonmovingHeap.allocators[nonmovingSegmentLogBlockSize(seg) - NONMOVING_ALLOCA0];
+    while (true) {
+        struct NonmovingSegment *current_active = (struct NonmovingSegment*)VOLATILE_LOAD(&alloc->active);
+        seg->link = current_active;
+        if (cas((StgVolatilePtr) &alloc->active, (StgWord) current_active, (StgWord) seg) == (StgWord) current_active) {
+            break;
+        }
+    }
+}
+
+// Add a segment to the appropriate filled list.
+INLINE_HEADER void nonmovingPushFilledSegment(struct NonmovingSegment *seg)
+{
+    struct NonmovingAllocator *alloc =
+        nonmovingHeap.allocators[nonmovingSegmentLogBlockSize(seg) - NONMOVING_ALLOCA0];
+    while (true) {
+        struct NonmovingSegment *current_filled = (struct NonmovingSegment*)VOLATILE_LOAD(&alloc->filled);
+        seg->link = current_filled;
+        if (cas((StgVolatilePtr) &alloc->filled, (StgWord) current_filled, (StgWord) seg) == (StgWord) current_filled) {
+            break;
+        }
+    }
+}
+// Assert that the pointer can be traced by the non-moving collector (e.g. in
+// mark phase). This means one of the following:
+//
+// - A static object
+// - A large object
+// - An object in the non-moving heap (e.g. in one of the segments)
+//
+void assert_in_nonmoving_heap(StgPtr p);
+
+// The block size of a given segment in bytes.
+INLINE_HEADER unsigned int nonmovingSegmentBlockSize(struct NonmovingSegment *seg)
+{
+    return 1 << nonmovingSegmentLogBlockSize(seg);
+}
+
+// How many blocks does a segment with the given block size have?
+INLINE_HEADER unsigned int nonmovingBlockCount(uint8_t log_block_size)
+{
+  unsigned int segment_data_size = NONMOVING_SEGMENT_SIZE - sizeof(struct NonmovingSegment);
+  segment_data_size -= segment_data_size % SIZEOF_VOID_P;
+  unsigned int blk_size = 1 << log_block_size;
+  // N.B. +1 accounts for the byte in the mark bitmap.
+  return segment_data_size / (blk_size + 1);
+}
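+
+/* Worked example (a sketch, assuming a 64-bit machine where
+ * sizeof(struct NonmovingSegment) == 24):
+ *
+ *     segment_data_size = 32768 - 24 = 32744 bytes (already word-aligned)
+ *     for log_block_size == 3 (8-byte blocks):
+ *         nonmovingBlockCount(3) = 32744 / (8 + 1) = 3638
+ *     i.e. 3638 bitmap bytes plus 3638 * 8 data bytes fit in one segment.
+ */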
+
+unsigned int nonmovingBlockCountFromSize(uint8_t log_block_size);
+
+// How many blocks does the given segment contain? Also the size of the bitmap.
+INLINE_HEADER unsigned int nonmovingSegmentBlockCount(struct NonmovingSegment *seg)
+{
+  return nonmovingBlockCountFromSize(nonmovingSegmentLogBlockSize(seg));
+}
+
+// Get a pointer to the given block index assuming that the block size is as
+// given (avoiding a potential cache miss when this information is already
+// available). The log_block_size argument must be equal to
+// nonmovingSegmentLogBlockSize(seg).
+INLINE_HEADER void *nonmovingSegmentGetBlock_(struct NonmovingSegment *seg, uint8_t log_block_size, nonmoving_block_idx i)
+{
+  ASSERT(log_block_size == nonmovingSegmentLogBlockSize(seg));
+  // Block size in bytes
+  unsigned int blk_size = 1 << log_block_size;
+  // Bitmap size in bytes
+  W_ bitmap_size = nonmovingBlockCountFromSize(log_block_size) * sizeof(uint8_t);
+  // Where the actual data starts (address of the first block).
+  // Use ROUNDUP_BYTES_TO_WDS to align to word size. Note that
+  // ROUNDUP_BYTES_TO_WDS returns a size in _words_, not in _bytes_, so convert
+  // it back to bytes by multiplying by the word size.
+  W_ data = ROUNDUP_BYTES_TO_WDS(((W_)seg) + sizeof(struct NonmovingSegment) + bitmap_size) * sizeof(W_);
+  return (void*)(data + i*blk_size);
+}
+
+// Get a pointer to the given block index.
+INLINE_HEADER void *nonmovingSegmentGetBlock(struct NonmovingSegment *seg, nonmoving_block_idx i)
+{
+  return nonmovingSegmentGetBlock_(seg, nonmovingSegmentLogBlockSize(seg), i);
+}
+
+// Get the segment which a closure resides in. Assumes that pointer points into
+// non-moving heap.
+INLINE_HEADER struct NonmovingSegment *nonmovingGetSegment_unchecked(StgPtr p)
+{
+    const uintptr_t mask = ~NONMOVING_SEGMENT_MASK;
+    return (struct NonmovingSegment *) (((uintptr_t) p) & mask);
+}
+
+INLINE_HEADER struct NonmovingSegment *nonmovingGetSegment(StgPtr p)
+{
+    ASSERT(HEAP_ALLOCED_GC(p) && (Bdescr(p)->flags & BF_NONMOVING));
+    return nonmovingGetSegment_unchecked(p);
+}
+
+INLINE_HEADER nonmoving_block_idx nonmovingGetBlockIdx(StgPtr p)
+{
+    ASSERT(HEAP_ALLOCED_GC(p) && (Bdescr(p)->flags & BF_NONMOVING));
+    struct NonmovingSegment *seg = nonmovingGetSegment(p);
+    ptrdiff_t blk0 = (ptrdiff_t)nonmovingSegmentGetBlock(seg, 0);
+    ptrdiff_t offset = (ptrdiff_t)p - blk0;
+    return (nonmoving_block_idx) (offset >> nonmovingSegmentLogBlockSize(seg));
+}
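+
+/* For illustration (a sketch, assuming 256-byte blocks, i.e. log_block_size == 8):
+ * if p points 1024 bytes past block 0 of its segment, then
+ *
+ *     nonmovingGetSegment_unchecked(p)
+ *         == (struct NonmovingSegment *) ((uintptr_t) p & ~NONMOVING_SEGMENT_MASK)
+ *     nonmovingGetBlockIdx(p) == 1024 >> 8 == 4
+ */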
+
+// TODO: Eliminate this
+extern uint8_t nonmovingMarkEpoch;
+
+INLINE_HEADER void nonmovingSetMark(struct NonmovingSegment *seg, nonmoving_block_idx i)
+{
+    seg->bitmap[i] = nonmovingMarkEpoch;
+}
+
+INLINE_HEADER uint8_t nonmovingGetMark(struct NonmovingSegment *seg, nonmoving_block_idx i)
+{
+    return seg->bitmap[i];
+}
+
+INLINE_HEADER void nonmovingSetClosureMark(StgPtr p)
+{
+    nonmovingSetMark(nonmovingGetSegment(p), nonmovingGetBlockIdx(p));
+}
+
+// TODO: Audit the uses of these
+/* Was the given closure marked this major GC cycle? */
+INLINE_HEADER bool nonmovingClosureMarkedThisCycle(StgPtr p)
+{
+    struct NonmovingSegment *seg = nonmovingGetSegment(p);
+    nonmoving_block_idx blk_idx = nonmovingGetBlockIdx(p);
+    return nonmovingGetMark(seg, blk_idx) == nonmovingMarkEpoch;
+}
+
+INLINE_HEADER bool nonmovingClosureMarked(StgPtr p)
+{
+    struct NonmovingSegment *seg = nonmovingGetSegment(p);
+    nonmoving_block_idx blk_idx = nonmovingGetBlockIdx(p);
+    return nonmovingGetMark(seg, blk_idx) != 0;
+}
+
+// Can be called during a major collection to determine whether a particular
+// segment is in the set of segments that will be swept this collection cycle.
+INLINE_HEADER bool nonmovingSegmentBeingSwept(struct NonmovingSegment *seg)
+{
+    struct NonmovingSegmentInfo *seginfo = nonmovingSegmentInfo(seg);
+    unsigned int n = nonmovingBlockCountFromSize(seginfo->log_block_size);
+    return seginfo->next_free_snap >= n;
+}
+
+// Can be called during a major collection to determine whether a particular
+// closure lives in a segment that will be swept this collection cycle.
+// Note that this returns true for both large and normal objects.
+INLINE_HEADER bool nonmovingClosureBeingSwept(StgClosure *p)
+{
+    bdescr *bd = Bdescr((StgPtr) p);
+    if (HEAP_ALLOCED_GC(p)) {
+        if (bd->flags & BF_NONMOVING_SWEEPING) {
+            return true;
+        } else if (bd->flags & BF_NONMOVING) {
+            struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) p);
+            return nonmovingSegmentBeingSwept(seg);
+        } else {
+            // outside of the nonmoving heap
+            return false;
+        }
+    } else {
+        // a static object
+        return true;
+    }
+}
+
+INLINE_HEADER bool isNonmovingClosure(StgClosure *p)
+{
+    return !HEAP_ALLOCED_GC(p) || Bdescr((P_)p)->flags & BF_NONMOVING;
+}
+
+#if defined(DEBUG)
+
+void nonmovingPrintSegment(struct NonmovingSegment *seg);
+void nonmovingPrintAllocator(struct NonmovingAllocator *alloc);
+void locate_object(P_ obj);
+void nonmovingPrintSweepList(void);
+// Check if the object is in one of non-moving heap mut_lists
+void check_in_mut_list(StgClosure *p);
+void print_block_list(bdescr *bd);
+void print_thread_list(StgTSO* tso);
+
+#endif
+
+#include "EndPrivate.h"
+
+#endif // CMINUSMINUS
diff --git a/rts/sm/NonMovingCensus.c b/rts/sm/NonMovingCensus.c
new file mode 100644
index 0000000000000000000000000000000000000000..670d51263cf8898f779bfc5e1464a17c9c3cf4a4
--- /dev/null
+++ b/rts/sm/NonMovingCensus.c
@@ -0,0 +1,129 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2018
+ *
+ * Non-moving garbage collector and allocator: Accounting census
+ *
+ * This is a simple space accounting census useful for characterising
+ * fragmentation in the nonmoving heap.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+#include "NonMoving.h"
+#include "Trace.h"
+#include "NonMovingCensus.h"
+
+// N.B. This may miss segments in the event of concurrent mutation (e.g. if a
+// mutator retires its current segment to the filled list).
+//
+// collect_live_words should only be set when we can guarantee that all mutators
+// and minor GCs are stopped; only then is it safe to look at active and current
+// segments, allowing us to also collect statistics on live words.
+static inline struct NonmovingAllocCensus
+nonmovingAllocatorCensus_(struct NonmovingAllocator *alloc, bool collect_live_words)
+{
+    struct NonmovingAllocCensus census = {0, 0, 0, 0};
+
+    for (struct NonmovingSegment *seg = alloc->filled;
+         seg != NULL;
+         seg = seg->link)
+    {
+        unsigned int n = nonmovingSegmentBlockCount(seg);
+        census.n_filled_segs++;
+        census.n_live_blocks += n;
+        if (collect_live_words) {
+            for (unsigned int i=0; i < n; i++) {
+                StgClosure *c = (StgClosure *) nonmovingSegmentGetBlock(seg, i);
+                census.n_live_words += closure_sizeW(c);
+            }
+        }
+    }
+
+    for (struct NonmovingSegment *seg = alloc->active;
+         seg != NULL;
+         seg = seg->link)
+    {
+        census.n_active_segs++;
+        unsigned int n = nonmovingSegmentBlockCount(seg);
+        for (unsigned int i=0; i < n; i++) {
+            if (nonmovingGetMark(seg, i)) {
+                StgClosure *c = (StgClosure *) nonmovingSegmentGetBlock(seg, i);
+                if (collect_live_words)
+                    census.n_live_words += closure_sizeW(c);
+                census.n_live_blocks++;
+            }
+        }
+    }
+
+    for (unsigned int cap=0; cap < n_capabilities; cap++)
+    {
+        struct NonmovingSegment *seg = alloc->current[cap];
+        unsigned int n = nonmovingSegmentBlockCount(seg);
+        for (unsigned int i=0; i < n; i++) {
+            if (nonmovingGetMark(seg, i)) {
+                StgClosure *c = (StgClosure *) nonmovingSegmentGetBlock(seg, i);
+                if (collect_live_words)
+                    census.n_live_words += closure_sizeW(c);
+                census.n_live_blocks++;
+            }
+        }
+    }
+    return census;
+}
+
+/* This must not be used when mutators are active since it assumes that
+ * all blocks in the nonmoving heap are valid closures.
+ */
+struct NonmovingAllocCensus
+nonmovingAllocatorCensusWithWords(struct NonmovingAllocator *alloc)
+{
+    return nonmovingAllocatorCensus_(alloc, true);
+}
+
+struct NonmovingAllocCensus
+nonmovingAllocatorCensus(struct NonmovingAllocator *alloc)
+{
+    return nonmovingAllocatorCensus_(alloc, false);
+}
+
+
+void nonmovingPrintAllocatorCensus()
+{
+    if (!RtsFlags.GcFlags.useNonmoving)
+        return;
+
+    for (int i=0; i < NONMOVING_ALLOCA_CNT; i++) {
+        struct NonmovingAllocCensus census =
+            nonmovingAllocatorCensus(nonmovingHeap.allocators[i]);
+
+        uint32_t blk_size = 1 << (i + NONMOVING_ALLOCA0);
+        // We define occupancy as the fraction of space that is used for useful
+        // data (that is, live and not slop).
+        // Avoid a division by zero when the allocator holds no live blocks.
+        double occupancy = 100.0;
+        if (census.n_live_blocks > 0) {
+            occupancy = 100.0 * census.n_live_words * sizeof(W_)
+                / (census.n_live_blocks * blk_size);
+        }
+        (void) occupancy; // silence warning if !DEBUG
+        debugTrace(DEBUG_nonmoving_gc, "Allocator %d (%d bytes - %d bytes): "
+                   "%d active segs, %d filled segs, %d live blocks, %d live words "
+                   "(%2.1f%% occupancy)",
+                   i, 1 << (i + NONMOVING_ALLOCA0 - 1), 1 << (i + NONMOVING_ALLOCA0),
+                   census.n_active_segs, census.n_filled_segs, census.n_live_blocks, census.n_live_words,
+                   occupancy);
+    }
+}
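+
+/* Occupancy worked example (a sketch, assuming a 64-bit word size): for the
+ * 64-byte allocator, 100 live blocks holding 600 live words give
+ *
+ *     occupancy = 100.0 * 600 * 8 / (100 * 64) = 75.0%
+ */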
+
+void nonmovingTraceAllocatorCensus()
+{
+#if defined(TRACING)
+    if (!RtsFlags.GcFlags.useNonmoving && !TRACE_nonmoving_gc)
+        return;
+
+    for (int i=0; i < NONMOVING_ALLOCA_CNT; i++) {
+        const struct NonmovingAllocCensus census =
+            nonmovingAllocatorCensus(nonmovingHeap.allocators[i]);
+        const uint32_t log_blk_size = i + NONMOVING_ALLOCA0;
+        traceNonmovingHeapCensus(log_blk_size, &census);
+    }
+#endif
+}
diff --git a/rts/sm/NonMovingCensus.h b/rts/sm/NonMovingCensus.h
new file mode 100644
index 0000000000000000000000000000000000000000..7a66dc9b699064b86a212d0b54da8063e76a428b
--- /dev/null
+++ b/rts/sm/NonMovingCensus.h
@@ -0,0 +1,28 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2018
+ *
+ * Non-moving garbage collector and allocator: Accounting census
+ *
+ * ---------------------------------------------------------------------------*/
+
+#pragma once
+
+#include "NonMoving.h"
+
+struct NonmovingAllocCensus {
+    uint32_t n_active_segs;
+    uint32_t n_filled_segs;
+    uint32_t n_live_blocks;
+    uint32_t n_live_words;
+};
+
+
+struct NonmovingAllocCensus
+nonmovingAllocatorCensusWithWords(struct NonmovingAllocator *alloc);
+
+struct NonmovingAllocCensus
+nonmovingAllocatorCensus(struct NonmovingAllocator *alloc);
+
+void nonmovingPrintAllocatorCensus(void);
+void nonmovingTraceAllocatorCensus(void);
diff --git a/rts/sm/NonMovingMark.c b/rts/sm/NonMovingMark.c
new file mode 100644
index 0000000000000000000000000000000000000000..03e342806a6786ba42de5e430943ef08f0eb4c1b
--- /dev/null
+++ b/rts/sm/NonMovingMark.c
@@ -0,0 +1,1958 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2018
+ *
+ * Non-moving garbage collector and allocator: Mark phase
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+// We call evacuate, which expects the thread-local gc_thread to be valid;
+// this is sometimes declared as a register variable, therefore it is necessary
+// to include the declaration so that the compiler doesn't clobber the register.
+#include "NonMovingMark.h"
+#include "NonMovingShortcut.h"
+#include "NonMoving.h"
+#include "BlockAlloc.h"  /* for countBlocks */
+#include "HeapAlloc.h"
+#include "Task.h"
+#include "Trace.h"
+#include "HeapUtils.h"
+#include "Printer.h"
+#include "Schedule.h"
+#include "Weak.h"
+#include "STM.h"
+#include "MarkWeak.h"
+#include "sm/Storage.h"
+#include "CNF.h"
+
+static void mark_closure (MarkQueue *queue, const StgClosure *p, StgClosure **origin);
+static void mark_tso (MarkQueue *queue, StgTSO *tso);
+static void mark_stack (MarkQueue *queue, StgStack *stack);
+static void mark_PAP_payload (MarkQueue *queue,
+                              StgClosure *fun,
+                              StgClosure **payload,
+                              StgWord size);
+
+// How many Array# entries to add to the mark queue at once?
+#define MARK_ARRAY_CHUNK_LENGTH 128
+
+/* Note [Large objects in the non-moving collector]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * The nonmoving collector keeps a separate list of its large objects, apart from
+ * oldest_gen->large_objects. There are two reasons for this:
+ *
+ *  1. oldest_gen is mutated by minor collections, which happen concurrently with
+ *     marking
+ *  2. the non-moving collector needs a consistent picture
+ *
+ * At the beginning of a major collection, nonmovingCollect takes the objects in
+ * oldest_gen->large_objects (which includes all large objects evacuated by the
+ * moving collector) and adds them to nonmoving_large_objects. This is the set
+ * of large objects that will be collected in the current major GC cycle.
+ *
+ * As the concurrent mark phase proceeds, the large objects in
+ * nonmoving_large_objects that are found to be live are moved to
+ * nonmoving_marked_large_objects. During sweep we discard all objects that remain
+ * in nonmoving_large_objects and move everything in nonmoving_marked_large_objects
+ * back to nonmoving_large_objects.
+ *
+ * During minor collections large objects will accumulate on
+ * oldest_gen->large_objects, where they will be picked up by the nonmoving
+ * collector and moved to nonmoving_large_objects during the next major GC.
+ * When this happens the block gets its BF_NONMOVING_SWEEPING flag set to
+ * indicate that it is part of the snapshot and consequently should be marked by
+ * the nonmoving mark phase.
+ */
+
+bdescr *nonmoving_large_objects = NULL;
+bdescr *nonmoving_marked_large_objects = NULL;
+memcount n_nonmoving_large_blocks = 0;
+memcount n_nonmoving_marked_large_blocks = 0;
+
+bdescr *nonmoving_compact_objects = NULL;
+bdescr *nonmoving_marked_compact_objects = NULL;
+memcount n_nonmoving_compact_blocks = 0;
+memcount n_nonmoving_marked_compact_blocks = 0;
+
+#if defined(THREADED_RTS)
+/* Protects everything above. Furthermore, we only set the BF_MARKED bit of
+ * large object blocks when this is held. This ensures that the write barrier
+ * (e.g. finish_upd_rem_set_mark) and the collector (mark_closure) don't try to
+ * move the same large object to nonmoving_marked_large_objects more than once.
+ */
+static Mutex nonmoving_large_objects_mutex;
+// Note that we don't need a similar lock for compact objects because we never
+// mark a compact object eagerly in a write barrier; all compact objects are
+// marked by the mark thread, so there can't be any races here.
+#endif
+
+/*
+ * Where we keep our threads during collection: to resurrect threads safely we
+ * must retain a snapshot of the threads that lived in the nonmoving heap at
+ * the time the snapshot was taken.
+ */
+StgTSO *nonmoving_old_threads = END_TSO_QUEUE;
+/* Same for weak pointers */
+StgWeak *nonmoving_old_weak_ptr_list = NULL;
+/* Because we can "tidy" thread and weak lists concurrently with a minor GC we
+ * need to move marked threads and weaks to these lists until we pause for sync.
+ * Then we move them to oldest_gen lists. */
+StgTSO *nonmoving_threads = END_TSO_QUEUE;
+StgWeak *nonmoving_weak_ptr_list = NULL;
+
+#if defined(DEBUG)
+// TODO (osa): Document
+StgIndStatic *debug_caf_list_snapshot = (StgIndStatic*)END_OF_CAF_LIST;
+#endif
+
+/* Note [Update remembered set]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * The concurrent non-moving collector uses a remembered set to ensure
+ * that its marking is consistent with the snapshot invariant defined in
+ * the design. This remembered set, known as the update remembered set,
+ * records all pointers that have been overwritten since the beginning
+ * of the concurrent mark. This ensures that concurrent mutation cannot hide
+ * pointers to live objects from the nonmoving garbage collector.
+ *
+ * The update remembered set is maintained via a write barrier that
+ * is enabled whenever a concurrent mark is active. This write barrier
+ * can be found in a number of places:
+ *
+ *  - In rts/Primops.cmm in primops responsible for modifying mutable closures
+ *    (e.g. MVARs, MUT_VARs, etc.)
+ *
+ *  - In rts/STM.c, where TVars and their watch queues are mutated
+ *
+ *  - In the dirty_* functions found in rts/Storage.c where we dirty MVARs,
+ *    MUT_VARs, TSOs and STACKs. STACK is a somewhat special case, as described
+ *    in Note [StgStack dirtiness flags and concurrent marking] in TSO.h.
+ *
+ *  - In the code generated by the STG code generator for pointer array writes
+ *
+ *  - In thunk updates (e.g. updateWithIndirection) to ensure that the free
+ *    variables of the original thunk remain reachable.
+ *
+ * There is also a read barrier to handle weak references, as described in
+ * Note [Concurrent read barrier on deRefWeak#].
+ *
+ * The representation of the update remembered set is the same as that of
+ * the mark queue. For efficiency, each capability maintains its own local
+ * accumulator of remembered set entries. When a capability fills its
+ * accumulator it is linked in to the global remembered set
+ * (upd_rem_set_block_list), where it is consumed by the mark phase.
+ *
+ * The mark phase is responsible for freeing update remembered set block
+ * allocations.
+ *
+ * Note that we eagerly flush update remembered sets during minor GCs as
+ * described in Note [Eager update remembered set flushing].
+ *
+ *
+ * Note [Eager update remembered set flushing]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * We eagerly flush update remembered sets during minor GCs to avoid scenarios
+ * like the following which could result in long sync pauses:
+ *
+ *  1. We start a major GC, all thread stacks are added to the mark queue.
+ *  2. The concurrent mark thread starts.
+ *  3. The mutator is allowed to resume. One mutator thread T is scheduled and marks its
+ *     stack to its local update remembered set.
+ *  4. The mark thread eventually encounters the mutator thread's stack but
+ *     sees that it has already been marked; skips it.
+ *  5. Thread T continues running but does not push enough to its update
+ *     remembered set to require a flush.
+ *  6. Eventually the mark thread finishes marking and requests a final sync.
+ *  7. The thread T flushes its update remembered set.
+ *  8. We find that a large fraction of the heap (namely the subset that is
+ *     only reachable from the thread T's stack) needs to be marked, incurring
+ *     a large sync pause
+ *
+ * We avoid this by periodically (during minor GC) forcing a flush of the
+ * update remembered set.
+ *
+ * A better (but more complex) approach that would be worthwhile trying in the
+ * future would be to rather do the following:
+ *
+ *  1. Concurrent mark phase is on-going
+ *  2. Mark thread runs out of things to mark
+ *  3. Mark thread sends a signal to capabilities requesting that they send
+ *     their update remembered sets without suspending their execution
+ *  4. The mark thread marks everything it was sent; runs out of things to mark
+ *  5. Mark thread initiates a sync
+ *  6. Capabilities send their final update remembered sets and suspend execution
+ *  7. Mark thread marks everything it was sent
+ *  8. Mark thread allows capabilities to resume.
+ *
+ * However, this is obviously a fair amount of complexity and so far the
+ * periodic eager flushing approach has been sufficient.
+ *
+ *
+ * Note [Concurrent read barrier on deRefWeak#]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * In general the non-moving GC assumes that all pointers reachable from a
+ * marked object are themselves marked (or in the mark queue). However,
+ * weak pointers are an obvious exception to this rule. In particular,
+ * deRefWeakPtr# allows the mutator to turn a weak reference into a strong
+ * reference. This interacts badly with concurrent collection. For
+ * instance, consider this program:
+ *
+ *     f :: a -> b -> IO b
+ *     f k v = do
+ *         -- assume that k and v are the only references to the
+ *         -- closures to which they refer.
+ *         weak <- mkWeakPtr k v Nothing
+ *
+ *         -- N.B. k is now technically dead since the only reference to it is
+ *         -- weak, but we've not yet had a chance to tombstone the WeakPtr
+ *         -- (which will happen in the course of major GC).
+ *         performMajorGC
+ *         -- Now we are running concurrently with the mark...
+ *
+ *         Just x <- deRefWeak weak
+ *         -- We have now introduced a reference to `v`, which will not be
+ *         -- marked, since the only reference to `v` at the time the snapshot
+ *         -- was taken was via a WeakPtr.
+ *         return x
+ *
+ */
+static Mutex upd_rem_set_lock;
+bdescr *upd_rem_set_block_list = NULL;
+
+#if defined(THREADED_RTS)
+/* Used during the mark/sweep phase transition to track how many capabilities
+ * have pushed their update remembered sets. Protected by upd_rem_set_lock.
+ */
+static volatile StgWord upd_rem_set_flush_count = 0;
+#endif
+
+
+/* Signaled by each capability when it has flushed its update remembered set */
+static Condition upd_rem_set_flushed_cond;
+
+/* Indicates to mutators that the write barrier must be respected. Set while
+ * concurrent mark is running.
+ */
+StgWord nonmoving_write_barrier_enabled = false;
+
+/* Used to provide the current mark queue to the young generation
+ * collector for scavenging.
+ */
+MarkQueue *current_mark_queue = NULL;
+
+/* Initialise update remembered set data structures */
+void nonmovingMarkInitUpdRemSet() {
+    initMutex(&upd_rem_set_lock);
+    initCondition(&upd_rem_set_flushed_cond);
+#if defined(THREADED_RTS)
+    initMutex(&nonmoving_large_objects_mutex);
+#endif
+}
+
+#if defined(THREADED_RTS) && defined(DEBUG)
+static uint32_t markQueueLength(MarkQueue *q);
+#endif
+static void init_mark_queue_(MarkQueue *queue);
+
+/* Transfers the given capability's update-remembered set to the global
+ * remembered set.
+ *
+ * Really the argument type should be UpdRemSet* but this would be rather
+ * inconvenient without polymorphism.
+ */
+void nonmovingAddUpdRemSetBlocks(MarkQueue *rset)
+{
+    if (markQueueIsEmpty(rset)) return;
+
+    // find the tail of the queue
+    bdescr *start = rset->blocks;
+    bdescr *end = start;
+    while (end->link != NULL)
+        end = end->link;
+
+    // add the blocks to the global remembered set
+    ACQUIRE_LOCK(&upd_rem_set_lock);
+    end->link = upd_rem_set_block_list;
+    upd_rem_set_block_list = start;
+    RELEASE_LOCK(&upd_rem_set_lock);
+
+    // Reset remembered set
+    ACQUIRE_SM_LOCK;
+    init_mark_queue_(rset);
+    rset->is_upd_rem_set = true;
+    RELEASE_SM_LOCK;
+}
+
+#if defined(THREADED_RTS)
+/* Called by capabilities to flush their update remembered sets when
+ * synchronising with the non-moving collector as it transitions from mark to
+ * sweep phase.
+ */
+void nonmovingFlushCapUpdRemSetBlocks(Capability *cap)
+{
+    debugTrace(DEBUG_nonmoving_gc,
+               "Capability %d flushing update remembered set: %d",
+               cap->no, markQueueLength(&cap->upd_rem_set.queue));
+    traceConcUpdRemSetFlush(cap);
+    nonmovingAddUpdRemSetBlocks(&cap->upd_rem_set.queue);
+    atomic_inc(&upd_rem_set_flush_count, 1);
+    signalCondition(&upd_rem_set_flushed_cond);
+    // After this, mutation will remain suspended until nonmovingFinishFlush
+    // releases the capabilities.
+}
+
+/* Request that all capabilities flush their update remembered sets and suspend
+ * execution until further notice.
+ */
+void nonmovingBeginFlush(Task *task)
+{
+    debugTrace(DEBUG_nonmoving_gc, "Starting update remembered set flush...");
+    traceConcSyncBegin();
+    upd_rem_set_flush_count = 0;
+    stopAllCapabilitiesWith(NULL, task, SYNC_FLUSH_UPD_REM_SET);
+
+    // XXX: We may have been given a capability via releaseCapability (i.e. a
+    // task suspended due to a foreign call) in which case our requestSync
+    // logic won't have been hit. Make sure that everyone so far has flushed.
+    // Ideally we want to mark asynchronously with syncing.
+    for (uint32_t i = 0; i < n_capabilities; i++) {
+        nonmovingFlushCapUpdRemSetBlocks(capabilities[i]);
+    }
+}
+
+/* Wait until a capability has flushed its update remembered set. Returns true
+ * if all capabilities have flushed.
+ */
+bool nonmovingWaitForFlush()
+{
+    ACQUIRE_LOCK(&upd_rem_set_lock);
+    debugTrace(DEBUG_nonmoving_gc, "Flush count %d", upd_rem_set_flush_count);
+    bool finished = upd_rem_set_flush_count == n_capabilities;
+    if (!finished) {
+        waitCondition(&upd_rem_set_flushed_cond, &upd_rem_set_lock);
+    }
+    RELEASE_LOCK(&upd_rem_set_lock);
+    return finished;
+}
+
+/* Note [Unintentional marking in resurrectThreads]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * In both moving and non-moving collectors threads found to be unreachable are
+ * evacuated/marked and then resurrected with resurrectThreads. resurrectThreads
+ * raises an exception in the unreachable thread via raiseAsync, which does
+ * mutations on the heap. These mutations cause adding stuff to UpdRemSet of the
+ * thread's capability. Here's an example backtrace where this happens:
+ *
+ *     #0  updateRemembSetPushClosure
+ *     #1  0x000000000072b363 in dirty_TVAR
+ *     #2  0x00000000007162e5 in remove_watch_queue_entries_for_trec
+ *     #3  0x0000000000717098 in stmAbortTransaction
+ *     #4  0x000000000070c6eb in raiseAsync
+ *     #5  0x000000000070b473 in throwToSingleThreaded__
+ *     #6  0x000000000070b4ab in throwToSingleThreaded
+ *     #7  0x00000000006fce82 in resurrectThreads
+ *     #8  0x00000000007215db in nonmovingMark_
+ *     #9  0x0000000000721438 in nonmovingConcurrentMark
+ *     #10 0x00007f1ee81cd6db in start_thread
+ *     #11 0x00007f1ee850688f in clone
+ *
+ * However we don't really want to run write barriers when calling
+ * resurrectThreads here, because we're in a GC pause, and overwritten values
+ * are definitely gone forever (as opposed to being inserted in a marked object
+ * or kept in registers and used later).
+ *
+ * When this happens, if we don't reset the UpdRemSets, what happens is in the
+ * next mark we see these objects that were added in previous mark's
+ * resurrectThreads in UpdRemSets, and mark those. This causes keeping
+ * unreachable objects alive, and effects weak finalization and thread resurrect
+ * (which rely on things become unreachable). As an example, stm048 fails when
+ * we get this wrong, because when we do raiseAsync on a thread that was blocked
+ * on an STM transaction we mutate a TVAR_WATCH_QUEUE, which has a reference to
+ * the TSO that was running the STM transaction. If the TSO becomes unreachable
+ * again in the next GC we don't realize this, because it was added to an
+ * UpdRemSet in the previous GC's mark phase, because of raiseAsync.
+ *
+ * To fix this we clear all UpdRemSets in nonmovingFinishFlush, right before
+ * releasing capabilities. This is somewhat inefficient (we allow adding objects
+ * to UpdRemSets, only to later reset them), but the only case where we add to
+ * UpdRemSets during mark is resurrectThreads, and I don't think we do so many
+ * resurrection in a thread that we fill UpdRemSets and allocate new blocks. So
+ * pushing an UpdRemSet in this case is really fast, and resetting is even
+ * faster (we just update a pointer).
+ *
+ * TODO (osa): What if we actually marked UpdRemSets in this case, in the mark
+ * loop? Would that work? Or what would break?
+ */
+
+/* Notify capabilities that the synchronisation is finished; they may resume
+ * execution.
+ */
+void nonmovingFinishFlush(Task *task)
+{
+    // See Note [Unintentional marking in resurrectThreads]
+    for (uint32_t i = 0; i < n_capabilities; i++) {
+        reset_upd_rem_set(&capabilities[i]->upd_rem_set);
+    }
+    // Also reset upd_rem_set_block_list in case some of the UpdRemSets were
+    // filled and we flushed them.
+    freeChain_lock(upd_rem_set_block_list);
+    upd_rem_set_block_list = NULL;
+
+    debugTrace(DEBUG_nonmoving_gc, "Finished update remembered set flush...");
+    traceConcSyncEnd();
+    releaseAllCapabilities(n_capabilities, NULL, task);
+}
+#endif
+
+/*********************************************************
+ * Pushing to either the mark queue or remembered set
+ *********************************************************/
+
+STATIC_INLINE void
+push (MarkQueue *q, const MarkQueueEnt *ent)
+{
+    // Are we at the end of the block?
+    if (q->top->head == MARK_QUEUE_BLOCK_ENTRIES) {
+        // Yes, this block is full.
+        if (q->is_upd_rem_set) {
+            nonmovingAddUpdRemSetBlocks(q);
+        } else {
+            // allocate a fresh block.
+            ACQUIRE_SM_LOCK;
+            bdescr *bd = allocGroup(MARK_QUEUE_BLOCKS);
+            bd->link = q->blocks;
+            q->blocks = bd;
+            q->top = (MarkQueueBlock *) bd->start;
+            q->top->head = 0;
+            RELEASE_SM_LOCK;
+        }
+    }
+
+    q->top->entries[q->top->head] = *ent;
+    q->top->head++;
+}
+
+/* A variant of push to be used by the minor GC when it encounters a reference
+ * to an object in the non-moving heap. In contrast to the other push
+ * operations this uses the gc_alloc_block_sync spinlock instead of the
+ * SM_LOCK to allocate new blocks in the event that the mark queue is full.
+ */
+void
+markQueuePushClosureGC (MarkQueue *q, StgClosure *p)
+{
+    /* We should not make it here if we are doing a deadlock detect GC.
+     * See Note [Deadlock detection under nonmoving collector].
+     */
+    ASSERT(!deadlock_detect_gc);
+
+    // Are we at the end of the block?
+    if (q->top->head == MARK_QUEUE_BLOCK_ENTRIES) {
+        // Yes, this block is full.
+        // allocate a fresh block.
+        ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
+        bdescr *bd = allocGroup(MARK_QUEUE_BLOCKS);
+        bd->link = q->blocks;
+        q->blocks = bd;
+        q->top = (MarkQueueBlock *) bd->start;
+        q->top->head = 0;
+        RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
+    }
+
+    MarkQueueEnt ent = {
+        .mark_closure = {
+            .p = UNTAG_CLOSURE(p),
+            .origin = NULL,
+        }
+    };
+    q->top->entries[q->top->head] = ent;
+    q->top->head++;
+}
+
+static inline
+void push_closure (MarkQueue *q,
+                   StgClosure *p,
+                   StgClosure **origin)
+{
+#if defined(DEBUG)
+    ASSERT(!HEAP_ALLOCED_GC(p) || (Bdescr((StgPtr) p)->gen == oldest_gen));
+    ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
+    // Commenting out: too slow
+    // if (RtsFlags.DebugFlags.sanity) {
+    //     assert_in_nonmoving_heap((P_)p);
+    //     if (origin)
+    //         assert_in_nonmoving_heap((P_)origin);
+    // }
+#endif
+
+    // This must be true as origin points to a closure field and is therefore
+    // word-aligned. We nevertheless check it, since a misaligned origin would
+    // be mistaken for a mark_array entry.
+    ASSERT(((uintptr_t) origin & 0x3) == 0);
+
+    MarkQueueEnt ent = {
+        .mark_closure = {
+            .p = p,
+            .origin = origin,
+        }
+    };
+    push(q, &ent);
+}
+
+static
+void push_array (MarkQueue *q,
+                 const StgMutArrPtrs *array,
+                 StgWord start_index)
+{
+    // TODO: Push this into callers where they already have the Bdescr
+    if (HEAP_ALLOCED_GC(array) && (Bdescr((StgPtr) array)->gen != oldest_gen))
+        return;
+
+    MarkQueueEnt ent = {
+        .mark_array = {
+            .array = array,
+            .start_index = (start_index << 16) | 0x3,
+        }
+    };
+    push(q, &ent);
+}
+
+static
+void push_thunk_srt (MarkQueue *q, const StgInfoTable *info)
+{
+    const StgThunkInfoTable *thunk_info = itbl_to_thunk_itbl(info);
+    if (thunk_info->i.srt) {
+        push_closure(q, (StgClosure*)GET_SRT(thunk_info), NULL);
+    }
+}
+
+static
+void push_fun_srt (MarkQueue *q, const StgInfoTable *info)
+{
+    const StgFunInfoTable *fun_info = itbl_to_fun_itbl(info);
+    if (fun_info->i.srt) {
+        push_closure(q, (StgClosure*)GET_FUN_SRT(fun_info), NULL);
+    }
+}
+
+/*********************************************************
+ * Pushing to the update remembered set
+ *
+ * upd_rem_set_push_* functions are directly called by
+ * mutators and need to check whether the value is in
+ * non-moving heap.
+ *********************************************************/
+
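+// Illustrative sketch, not part of this patch: a mutator-side write barrier
+// such as dirty_TVAR is expected to push the value it is about to overwrite,
+// roughly
+//
+//     if (nonmoving_write_barrier_enabled)
+//         updateRemembSetPushClosure(cap, (StgClosure *) old_value);
+//
+// before performing the actual write (old_value is a placeholder name). The
+// helpers below then discard values that live outside the non-moving heap.
+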
+// Check if the object is traced by the non-moving collector. This holds in any
+// of the following cases:
+//
+// - Object is in the non-moving heap
+// - Object is large (BF_LARGE) and marked as BF_NONMOVING
+// - Object is static (HEAP_ALLOCED_GC(obj) == false)
+//
+static
+bool check_in_nonmoving_heap(StgClosure *p) {
+    if (HEAP_ALLOCED_GC(p)) {
+        // This works for both large and small objects:
+        return Bdescr((P_)p)->flags & BF_NONMOVING;
+    } else {
+        return true; // a static object
+    }
+}
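+
+// check_in_nonmoving_heap is the filter used both by the mutator-facing
+// updateRemembSetPush* barriers below and by markQueuePushClosure: there is no
+// point in recording a reference to an object that the concurrent collector
+// will never trace.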
+
+/* Push the free variables of a (now-evaluated) thunk to the
+ * update remembered set.
+ */
+inline void updateRemembSetPushThunk(Capability *cap, StgThunk *thunk)
+{
+    const StgInfoTable *info;
+    do {
+        info = get_volatile_itbl((StgClosure *) thunk);
+    } while (info->type == WHITEHOLE);
+    updateRemembSetPushThunkEager(cap, (StgThunkInfoTable *) info, thunk);
+}
+
+/* Push the free variables of a thunk to the update remembered set.
+ * This is called by the thunk update code (e.g. updateWithIndirection) before
+ * we update the indirectee to ensure that the thunk's free variables remain
+ * visible to the concurrent collector.
+ *
+ * See Note [Update remembered set].
+ */
+void updateRemembSetPushThunkEager(Capability *cap,
+                                   const StgThunkInfoTable *info,
+                                   StgThunk *thunk)
+{
+    /* N.B. info->i.type mustn't be WHITEHOLE */
+    MarkQueue *queue = &cap->upd_rem_set.queue;
+    switch (info->i.type) {
+    case THUNK:
+    case THUNK_1_0:
+    case THUNK_0_1:
+    case THUNK_2_0:
+    case THUNK_1_1:
+    case THUNK_0_2:
+    {
+        push_thunk_srt(queue, &info->i);
+
+        for (StgWord i = 0; i < info->i.layout.payload.ptrs; i++) {
+            if (check_in_nonmoving_heap(thunk->payload[i])) {
+                // Don't bother to push origin; it makes the barrier needlessly
+                // expensive with little benefit.
+                push_closure(queue, thunk->payload[i], NULL);
+            }
+        }
+        break;
+    }
+    case AP:
+    {
+        StgAP *ap = (StgAP *) thunk;
+        if (check_in_nonmoving_heap(ap->fun)) {
+            push_closure(queue, ap->fun, NULL);
+        }
+        mark_PAP_payload(queue, ap->fun, ap->payload, ap->n_args);
+        break;
+    }
+    case THUNK_SELECTOR:
+    case BLACKHOLE:
+        // TODO: This is right, right?
+        break;
+    // The selector optimization performed by the nonmoving mark may have
+    // overwritten a thunk which we are updating with an indirection.
+    case IND:
+    {
+        StgInd *ind = (StgInd *) thunk;
+        if (check_in_nonmoving_heap(ind->indirectee)) {
+            push_closure(queue, ind->indirectee, NULL);
+        }
+        break;
+    }
+    default:
+        barf("updateRemembSetPushThunk: invalid thunk pushed: p=%p, type=%d",
+             thunk, info->i.type);
+    }
+}
+
+void updateRemembSetPushThunk_(StgRegTable *reg, StgThunk *p)
+{
+    updateRemembSetPushThunk(regTableToCapability(reg), p);
+}
+
+inline void updateRemembSetPushClosure(Capability *cap, StgClosure *p)
+{
+    if (check_in_nonmoving_heap(p)) {
+        MarkQueue *queue = &cap->upd_rem_set.queue;
+        push_closure(queue, p, NULL);
+    }
+}
+
+void updateRemembSetPushClosure_(StgRegTable *reg, struct StgClosure_ *p)
+{
+    updateRemembSetPushClosure(regTableToCapability(reg), p);
+}
+
+STATIC_INLINE bool needs_upd_rem_set_mark(StgClosure *p)
+{
+    // TODO: Deduplicate with mark_closure
+    bdescr *bd = Bdescr((StgPtr) p);
+    if (bd->gen != oldest_gen) {
+        return false;
+    } else if (bd->flags & BF_LARGE) {
+        if (! (bd->flags & BF_NONMOVING_SWEEPING)) {
+            return false;
+        } else {
+            return ! (bd->flags & BF_MARKED);
+        }
+    } else {
+        struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) p);
+        nonmoving_block_idx block_idx = nonmovingGetBlockIdx((StgPtr) p);
+        return nonmovingGetMark(seg, block_idx) != nonmovingMarkEpoch;
+    }
+}
+
+/* Set the mark bit; only to be called *after* we have fully marked the closure */
+STATIC_INLINE void finish_upd_rem_set_mark(StgClosure *p)
+{
+    bdescr *bd = Bdescr((StgPtr) p);
+    if (bd->flags & BF_LARGE) {
+        // Someone else may have already marked it.
+        ACQUIRE_LOCK(&nonmoving_large_objects_mutex);
+        if (! (bd->flags & BF_MARKED)) {
+            bd->flags |= BF_MARKED;
+            dbl_link_remove(bd, &nonmoving_large_objects);
+            dbl_link_onto(bd, &nonmoving_marked_large_objects);
+            n_nonmoving_large_blocks -= bd->blocks;
+            n_nonmoving_marked_large_blocks += bd->blocks;
+        }
+        RELEASE_LOCK(&nonmoving_large_objects_mutex);
+    } else {
+        struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) p);
+        nonmoving_block_idx block_idx = nonmovingGetBlockIdx((StgPtr) p);
+        nonmovingSetMark(seg, block_idx);
+    }
+}
+
+void updateRemembSetPushTSO(Capability *cap, StgTSO *tso)
+{
+    if (needs_upd_rem_set_mark((StgClosure *) tso)) {
+        debugTrace(DEBUG_nonmoving_gc, "upd_rem_set: TSO %p", tso);
+        mark_tso(&cap->upd_rem_set.queue, tso);
+        finish_upd_rem_set_mark((StgClosure *) tso);
+    }
+}
+
+void updateRemembSetPushStack(Capability *cap, StgStack *stack)
+{
+    // N.B. caller responsible for checking nonmoving_write_barrier_enabled
+    if (needs_upd_rem_set_mark((StgClosure *) stack)) {
+        StgWord8 marking = stack->marking;
+        // See Note [StgStack dirtiness flags and concurrent marking]
+        if (cas_word8(&stack->marking, marking, nonmovingMarkEpoch)
+              != nonmovingMarkEpoch) {
+            // We have claimed the right to mark the stack.
+            debugTrace(DEBUG_nonmoving_gc, "upd_rem_set: STACK %p", stack->sp);
+            mark_stack(&cap->upd_rem_set.queue, stack);
+            finish_upd_rem_set_mark((StgClosure *) stack);
+            return;
+        } else {
+            // The concurrent GC has claimed the right to mark the stack.
+            // Wait until it finishes marking before proceeding with
+            // mutation.
+            while (needs_upd_rem_set_mark((StgClosure *) stack)) {
+#if defined(PARALLEL_GC)
+                busy_wait_nop(); // TODO: Spinning here is unfortunate
+#endif
+            }
+            return;
+        }
+    }
+}
+
+/*********************************************************
+ * Pushing to the mark queue
+ *********************************************************/
+
+void markQueuePush (MarkQueue *q, const MarkQueueEnt *ent)
+{
+    push(q, ent);
+}
+
+void markQueuePushClosure (MarkQueue *q,
+                           StgClosure *p,
+                           StgClosure **origin)
+{
+    // TODO: Push this into callers where they already have the Bdescr
+    if (check_in_nonmoving_heap(p)) {
+        push_closure(q, p, origin);
+    }
+}
+
+/* TODO: Do we really never want to specify the origin here? */
+void markQueueAddRoot (MarkQueue* q, StgClosure** root)
+{
+    markQueuePushClosure(q, *root, NULL);
+}
+
+/* Push a closure to the mark queue without origin information */
+void markQueuePushClosure_ (MarkQueue *q, StgClosure *p)
+{
+    markQueuePushClosure(q, p, NULL);
+}
+
+void markQueuePushFunSrt (MarkQueue *q, const StgInfoTable *info)
+{
+    push_fun_srt(q, info);
+}
+
+void markQueuePushThunkSrt (MarkQueue *q, const StgInfoTable *info)
+{
+    push_thunk_srt(q, info);
+}
+
+void markQueuePushArray (MarkQueue *q,
+                            const StgMutArrPtrs *array,
+                            StgWord start_index)
+{
+    push_array(q, array, start_index);
+}
+
+/*********************************************************
+ * Popping from the mark queue
+ *********************************************************/
+
+// Returns an invalid MarkQueueEnt if the queue is empty.
+static MarkQueueEnt markQueuePop_ (MarkQueue *q)
+{
+    MarkQueueBlock *top;
+
+again:
+    top = q->top;
+
+    // Are we at the beginning of the block?
+    if (top->head == 0) {
+        // Is this the first block of the queue?
+        if (q->blocks->link == NULL) {
+            // Yes, therefore queue is empty...
+            MarkQueueEnt none = { .null_entry = { .p = NULL } };
+            return none;
+        } else {
+            // No, unwind to the previous block and try popping again...
+            bdescr *old_block = q->blocks;
+            q->blocks = old_block->link;
+            q->top = (MarkQueueBlock*)q->blocks->start;
+            ACQUIRE_SM_LOCK;
+            freeGroup(old_block); // TODO: hold on to a block to avoid repeated allocation/deallocation?
+            RELEASE_SM_LOCK;
+            goto again;
+        }
+    }
+
+    top->head--;
+    MarkQueueEnt ent = top->entries[top->head];
+    return ent;
+}
+
+static MarkQueueEnt markQueuePop (MarkQueue *q)
+{
+#if MARK_PREFETCH_QUEUE_DEPTH == 0
+    return markQueuePop_(q);
+#else
+    unsigned int i = q->prefetch_head;
+    while (nonmovingMarkQueueEntryType(&q->prefetch_queue[i]) == NULL_ENTRY) {
+        MarkQueueEnt new = markQueuePop_(q);
+        if (nonmovingMarkQueueEntryType(&new) == NULL_ENTRY) {
+            // Mark queue is empty; look for any valid entries in the prefetch
+            // queue
+            for (unsigned int j = (i+1) % MARK_PREFETCH_QUEUE_DEPTH;
+                 j != i;
+                 j = (j+1) % MARK_PREFETCH_QUEUE_DEPTH)
+            {
+                if (nonmovingMarkQueueEntryType(&q->prefetch_queue[j]) != NULL_ENTRY) {
+                    i = j;
+                    goto done;
+                }
+            }
+            return new;
+        }
+
+        // The entry may not be a MARK_CLOSURE but it doesn't matter: our
+        // MarkQueueEnt encoding always places the pointer to the object to be
+        // marked first.
+        prefetchForRead(&new.mark_closure.p->header.info);
+        prefetchForRead(Bdescr((StgPtr) new.mark_closure.p));
+        q->prefetch_queue[i] = new;
+        i = (i + 1) % MARK_PREFETCH_QUEUE_DEPTH;
+    }
+
+  done:
+    ;
+    MarkQueueEnt ret = q->prefetch_queue[i];
+    q->prefetch_queue[i].null_entry.p = NULL;
+    q->prefetch_head = i;
+    return ret;
+#endif
+}
+
+/*********************************************************
+ * Creating and destroying MarkQueues and UpdRemSets
+ *********************************************************/
+
+/* Must hold sm_mutex. */
+static void init_mark_queue_ (MarkQueue *queue)
+{
+    bdescr *bd = allocGroup(MARK_QUEUE_BLOCKS);
+    queue->blocks = bd;
+    queue->top = (MarkQueueBlock *) bd->start;
+    queue->top->head = 0;
+#if MARK_PREFETCH_QUEUE_DEPTH > 0
+    memset(&queue->prefetch_queue, 0, sizeof(queue->prefetch_queue));
+    queue->prefetch_head = 0;
+#endif
+}
+
+/* Must hold sm_mutex. */
+void initMarkQueue (MarkQueue *queue)
+{
+    init_mark_queue_(queue);
+    queue->is_upd_rem_set = false;
+}
+
+/* Must hold sm_mutex. */
+void init_upd_rem_set (UpdRemSet *rset)
+{
+    init_mark_queue_(&rset->queue);
+    rset->queue.is_upd_rem_set = true;
+}
+
+void reset_upd_rem_set (UpdRemSet *rset)
+{
+    // UpdRemSets always have exactly one block for the mark queue. This
+    // assertion is here so that this code gets updated if that ever changes.
+    ASSERT(rset->queue.blocks->link == NULL);
+    rset->queue.top->head = 0;
+}
+
+void freeMarkQueue (MarkQueue *queue)
+{
+    freeChain_lock(queue->blocks);
+}
+
+#if defined(THREADED_RTS) && defined(DEBUG)
+static uint32_t
+markQueueLength (MarkQueue *q)
+{
+    uint32_t n = 0;
+    for (bdescr *block = q->blocks; block; block = block->link) {
+        MarkQueueBlock *queue = (MarkQueueBlock*)block->start;
+        n += queue->head;
+    }
+    return n;
+}
+#endif
+
+
+/*********************************************************
+ * Marking
+ *********************************************************/
+
+/*
+ * N.B. Mutation of TRecHeaders is completely unprotected by any write
+ * barrier. Consequently it's quite important that we deeply mark
+ * any outstanding transactions.
+ */
+static void
+mark_trec_header (MarkQueue *queue, StgTRecHeader *trec)
+{
+    while (trec != NO_TREC) {
+        StgTRecChunk *chunk = trec->current_chunk;
+        markQueuePushClosure_(queue, (StgClosure *) trec);
+        markQueuePushClosure_(queue, (StgClosure *) chunk);
+        while (chunk != END_STM_CHUNK_LIST) {
+            for (StgWord i=0; i < chunk->next_entry_idx; i++) {
+                TRecEntry *ent = &chunk->entries[i];
+                markQueuePushClosure_(queue, (StgClosure *) ent->tvar);
+                markQueuePushClosure_(queue, ent->expected_value);
+                markQueuePushClosure_(queue, ent->new_value);
+            }
+            chunk = chunk->prev_chunk;
+        }
+        trec = trec->enclosing_trec;
+    }
+}
+
+static void
+mark_tso (MarkQueue *queue, StgTSO *tso)
+{
+    // TODO: Clear dirty if contains only old gen objects
+
+    if (tso->bound != NULL) {
+        markQueuePushClosure_(queue, (StgClosure *) tso->bound->tso);
+    }
+
+    markQueuePushClosure_(queue, (StgClosure *) tso->blocked_exceptions);
+    markQueuePushClosure_(queue, (StgClosure *) tso->bq);
+    mark_trec_header(queue, tso->trec);
+    markQueuePushClosure_(queue, (StgClosure *) tso->stackobj);
+    markQueuePushClosure_(queue, (StgClosure *) tso->_link);
+    if (   tso->why_blocked == BlockedOnMVar
+        || tso->why_blocked == BlockedOnMVarRead
+        || tso->why_blocked == BlockedOnBlackHole
+        || tso->why_blocked == BlockedOnMsgThrowTo
+        || tso->why_blocked == NotBlocked
+        ) {
+        markQueuePushClosure_(queue, tso->block_info.closure);
+    }
+}
+
+static void
+do_push_closure (StgClosure **p, void *user)
+{
+    MarkQueue *queue = (MarkQueue *) user;
+    // TODO: Origin? need reference to containing closure
+    markQueuePushClosure_(queue, *p);
+}
+
+static void
+mark_large_bitmap (MarkQueue *queue,
+                   StgClosure **p,
+                   StgLargeBitmap *large_bitmap,
+                   StgWord size)
+{
+    walk_large_bitmap(do_push_closure, p, large_bitmap, size, queue);
+}
+
+static void
+mark_small_bitmap (MarkQueue *queue, StgClosure **p, StgWord size, StgWord bitmap)
+{
+    while (size > 0) {
+        if ((bitmap & 1) == 0) {
+            // TODO: Origin?
+            markQueuePushClosure(queue, *p, NULL);
+        }
+        p++;
+        bitmap = bitmap >> 1;
+        size--;
+    }
+}
+
+static GNUC_ATTR_HOT
+void mark_PAP_payload (MarkQueue *queue,
+                       StgClosure *fun,
+                       StgClosure **payload,
+                       StgWord size)
+{
+    const StgFunInfoTable *fun_info = get_fun_itbl(UNTAG_CONST_CLOSURE(fun));
+    ASSERT(fun_info->i.type != PAP);
+    StgPtr p = (StgPtr) payload;
+
+    StgWord bitmap;
+    switch (fun_info->f.fun_type) {
+    case ARG_GEN:
+        bitmap = BITMAP_BITS(fun_info->f.b.bitmap);
+        goto small_bitmap;
+    case ARG_GEN_BIG:
+        mark_large_bitmap(queue, payload, GET_FUN_LARGE_BITMAP(fun_info), size);
+        break;
+    case ARG_BCO:
+        mark_large_bitmap(queue, payload, BCO_BITMAP(fun), size);
+        break;
+    default:
+        bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]);
+    small_bitmap:
+        mark_small_bitmap(queue, (StgClosure **) p, size, bitmap);
+        break;
+    }
+}
+
+/* Helper for mark_stack; returns next stack frame. */
+static StgPtr
+mark_arg_block (MarkQueue *queue, const StgFunInfoTable *fun_info, StgClosure **args)
+{
+    StgWord bitmap, size;
+
+    StgPtr p = (StgPtr)args;
+    switch (fun_info->f.fun_type) {
+    case ARG_GEN:
+        bitmap = BITMAP_BITS(fun_info->f.b.bitmap);
+        size = BITMAP_SIZE(fun_info->f.b.bitmap);
+        goto small_bitmap;
+    case ARG_GEN_BIG:
+        size = GET_FUN_LARGE_BITMAP(fun_info)->size;
+        mark_large_bitmap(queue, (StgClosure**)p, GET_FUN_LARGE_BITMAP(fun_info), size);
+        p += size;
+        break;
+    default:
+        bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]);
+        size = BITMAP_SIZE(stg_arg_bitmaps[fun_info->f.fun_type]);
+    small_bitmap:
+        mark_small_bitmap(queue, (StgClosure**)p, size, bitmap);
+        p += size;
+        break;
+    }
+    return p;
+}
+
+static GNUC_ATTR_HOT void
+mark_stack_ (MarkQueue *queue, StgPtr sp, StgPtr spBottom)
+{
+    ASSERT(sp <= spBottom);
+
+    while (sp < spBottom) {
+        const StgRetInfoTable *info = get_ret_itbl((StgClosure *)sp);
+        switch (info->i.type) {
+        case UPDATE_FRAME:
+        {
+            // See Note [upd-black-hole] in rts/Scav.c
+            StgUpdateFrame *frame = (StgUpdateFrame *) sp;
+            markQueuePushClosure_(queue, frame->updatee);
+            sp += sizeofW(StgUpdateFrame);
+            continue;
+        }
+
+            // small bitmap (< 32 entries, or 64 on a 64-bit machine)
+        case CATCH_STM_FRAME:
+        case CATCH_RETRY_FRAME:
+        case ATOMICALLY_FRAME:
+        case UNDERFLOW_FRAME:
+        case STOP_FRAME:
+        case CATCH_FRAME:
+        case RET_SMALL:
+        {
+            StgWord bitmap = BITMAP_BITS(info->i.layout.bitmap);
+            StgWord size   = BITMAP_SIZE(info->i.layout.bitmap);
+            // NOTE: the payload starts immediately after the info-ptr; we
+            // don't have an StgHeader in the same sense as a heap closure.
+            sp++;
+            mark_small_bitmap(queue, (StgClosure **) sp, size, bitmap);
+            sp += size;
+        }
+        follow_srt:
+            if (info->i.srt) {
+                markQueuePushClosure_(queue, (StgClosure*)GET_SRT(info));
+            }
+            continue;
+
+        case RET_BCO: {
+            sp++;
+            markQueuePushClosure_(queue, *(StgClosure**)sp);
+            StgBCO *bco = (StgBCO *)*sp;
+            sp++;
+            StgWord size = BCO_BITMAP_SIZE(bco);
+            mark_large_bitmap(queue, (StgClosure **) sp, BCO_BITMAP(bco), size);
+            sp += size;
+            continue;
+        }
+
+          // large bitmap (> 32 entries, or > 64 on a 64-bit machine)
+        case RET_BIG:
+        {
+            StgWord size;
+
+            size = GET_LARGE_BITMAP(&info->i)->size;
+            sp++;
+            mark_large_bitmap(queue, (StgClosure **) sp, GET_LARGE_BITMAP(&info->i), size);
+            sp += size;
+            // and don't forget to follow the SRT
+            goto follow_srt;
+        }
+
+        case RET_FUN:
+        {
+            StgRetFun *ret_fun = (StgRetFun *)sp;
+            const StgFunInfoTable *fun_info;
+
+            markQueuePushClosure_(queue, ret_fun->fun);
+            fun_info = get_fun_itbl(UNTAG_CLOSURE(ret_fun->fun));
+            sp = mark_arg_block(queue, fun_info, ret_fun->payload);
+            goto follow_srt;
+        }
+
+        default:
+            barf("mark_stack: weird activation record found on stack: %d", (int)(info->i.type));
+        }
+    }
+}
+
+static GNUC_ATTR_HOT void
+mark_stack (MarkQueue *queue, StgStack *stack)
+{
+    // TODO: Clear dirty if contains only old gen objects
+
+    mark_stack_(queue, stack->sp, stack->stack + stack->stack_size);
+}
+
+/* See Note [Static objects under the nonmoving collector].
+ *
+ * Returns true if the object needs to be marked.
+ */
+static bool
+bump_static_flag(StgClosure **link_field, StgClosure *q STG_UNUSED)
+{
+    while (1) {
+        StgWord link = (StgWord) *link_field;
+        StgWord new = (link & ~STATIC_BITS) | static_flag;
+        if ((link & STATIC_BITS) == static_flag)
+            return false;
+        else if (cas((StgVolatilePtr) link_field, link, new) == link) {
+            return true;
+        }
+    }
+}
+
+static GNUC_ATTR_HOT void
+mark_closure (MarkQueue *queue, const StgClosure *p0, StgClosure **origin)
+{
+    StgClosure *p = (StgClosure*)p0;
+
+ try_again:
+    ;
+    bdescr *bd = NULL;
+    StgClosure *p_next = NULL;
+    StgWord tag = GET_CLOSURE_TAG(p);
+    p = UNTAG_CLOSURE(p);
+
+#   define PUSH_FIELD(obj, field)                                \
+        markQueuePushClosure(queue,                              \
+                                (StgClosure *) (obj)->field,     \
+                                (StgClosure **) &(obj)->field)
+
+    if (!HEAP_ALLOCED_GC(p)) {
+        const StgInfoTable *info = get_itbl(p);
+        StgHalfWord type = info->type;
+
+        if (type == CONSTR_0_1 || type == CONSTR_0_2 || type == CONSTR_NOCAF) {
+            // no need to put these on the static linked list; they don't need
+            // to be marked.
+            return;
+        }
+
+        switch (type) {
+
+        case THUNK_STATIC:
+            if (info->srt != 0) {
+                if (bump_static_flag(THUNK_STATIC_LINK((StgClosure *)p), p)) {
+                    markQueuePushThunkSrt(queue, info); // TODO this function repeats the check above
+                }
+            }
+            goto done;
+
+        case FUN_STATIC:
+            if (info->srt != 0 || info->layout.payload.ptrs != 0) {
+                if (bump_static_flag(STATIC_LINK(info, (StgClosure *)p), p)) {
+                    markQueuePushFunSrt(queue, info); // TODO this function repeats the check above
+
+                    // a FUN_STATIC can also be an SRT, so it may have pointer
+                    // fields.  See Note [SRTs] in CmmBuildInfoTables, specifically
+                    // the [FUN] optimisation.
+                    // TODO (osa) I don't understand this comment
+                    for (StgHalfWord i = 0; i < info->layout.payload.ptrs; ++i) {
+                        PUSH_FIELD(p, payload[i]);
+                    }
+                }
+            }
+            goto done;
+
+        case IND_STATIC:
+            if (bump_static_flag(IND_STATIC_LINK((StgClosure *)p), p)) {
+                PUSH_FIELD((StgInd *) p, indirectee);
+            }
+            goto done;
+
+        case CONSTR:
+        case CONSTR_1_0:
+        case CONSTR_2_0:
+        case CONSTR_1_1:
+            if (bump_static_flag(STATIC_LINK(info, (StgClosure *)p), p)) {
+                for (StgHalfWord i = 0; i < info->layout.payload.ptrs; ++i) {
+                    PUSH_FIELD(p, payload[i]);
+                }
+            }
+            goto done;
+
+        case WHITEHOLE:
+            while (get_volatile_itbl(p)->type == WHITEHOLE);
+                // busy_wait_nop(); // FIXME
+            goto try_again;
+
+        default:
+            barf("mark_closure(static): strange closure type %d", (int)(info->type));
+        }
+    }
+
+    bd = Bdescr((StgPtr) p);
+
+    if (bd->gen != oldest_gen) {
+        // Here we have an object living outside of the non-moving heap. While
+        // we likely evacuated nearly everything to the nonmoving heap during
+        // preparation there are nevertheless a few ways in which we might trace
+        // a reference into younger generations:
+        //
+        //  * a mutable object might have been updated
+        //  * we might have aged an object
+        goto done;
+    }
+
+    ASSERTM(LOOKS_LIKE_CLOSURE_PTR(p), "invalid closure, info=%p", p->header.info);
+
+    ASSERT(!IS_FORWARDING_PTR(p->header.info));
+
+    // N.B. only the first block of a compact region is guaranteed to carry
+    // BF_NONMOVING; consequently we must separately check for BF_COMPACT.
+    if (bd->flags & (BF_COMPACT | BF_NONMOVING)) {
+
+        if (bd->flags & BF_COMPACT) {
+            StgCompactNFData *str = objectGetCompact((StgClosure*)p);
+            bd = Bdescr((P_)str);
+
+            if (! (bd->flags & BF_NONMOVING_SWEEPING)) {
+                // Not in the snapshot
+                return;
+            }
+            if (bd->flags & BF_MARKED) {
+                goto done;
+            }
+        } else if (bd->flags & BF_LARGE) {
+            if (! (bd->flags & BF_NONMOVING_SWEEPING)) {
+                // Not in the snapshot
+                goto done;
+            }
+            if (bd->flags & BF_MARKED) {
+                goto done;
+            }
+
+            // Mark contents
+            p = (StgClosure*)bd->start;
+        } else {
+            struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) p);
+            nonmoving_block_idx block_idx = nonmovingGetBlockIdx((StgPtr) p);
+
+            /* We don't mark blocks that
+             *  - were not live at the time that the snapshot was taken, or
+             *  - we have already marked this cycle.
+             */
+            uint8_t mark = nonmovingGetMark(seg, block_idx);
+            /* Don't mark things we've already marked (since we may loop) */
+            if (mark == nonmovingMarkEpoch)
+                goto done;
+
+            StgClosure *snapshot_loc =
+              (StgClosure *) nonmovingSegmentGetBlock(seg, nonmovingSegmentInfo(seg)->next_free_snap);
+            if (p >= snapshot_loc && mark == 0) {
+                /*
+                 * In this case we are looking at a block that wasn't allocated
+                 * at the time that the snapshot was taken. We mustn't trace
+                 * things above the allocation pointer that aren't marked since
+                 * they may not be valid objects.
+                 */
+                goto done;
+            }
+        }
+    }
+
+    // A pinned object that is still attached to a capability (because it's not
+    // filled yet). No need to trace it: pinned objects can't contain pointers.
+    else if (bd->flags & BF_PINNED) {
+#if defined(DEBUG)
+        bool found_it = false;
+        for (uint32_t i = 0; i < n_capabilities; ++i) {
+            if (capabilities[i]->pinned_object_block == bd) {
+                found_it = true;
+                break;
+            }
+        }
+        ASSERT(found_it);
+#endif
+        return; // we don't update origin here! TODO(osa): explain this
+    }
+
+    else {
+        barf("Strange closure in nonmoving mark: %p", p);
+    }
+
+    /////////////////////////////////////////////////////
+    // Trace pointers
+    /////////////////////////////////////////////////////
+
+    const StgInfoTable *info = get_itbl(p);
+    switch (info->type) {
+
+    case MVAR_CLEAN:
+    case MVAR_DIRTY: {
+        StgMVar *mvar = (StgMVar *) p;
+        PUSH_FIELD(mvar, head);
+        PUSH_FIELD(mvar, tail);
+        PUSH_FIELD(mvar, value);
+        break;
+    }
+
+    case TVAR: {
+        StgTVar *tvar = ((StgTVar *)p);
+        PUSH_FIELD(tvar, current_value);
+        PUSH_FIELD(tvar, first_watch_queue_entry);
+        break;
+    }
+
+    case FUN_2_0:
+        markQueuePushFunSrt(queue, info);
+        PUSH_FIELD(p, payload[1]);
+        PUSH_FIELD(p, payload[0]);
+        break;
+
+    case THUNK_2_0: {
+        StgThunk *thunk = (StgThunk *) p;
+        markQueuePushThunkSrt(queue, info);
+        PUSH_FIELD(thunk, payload[1]);
+        PUSH_FIELD(thunk, payload[0]);
+        break;
+    }
+
+    case CONSTR_2_0:
+        PUSH_FIELD(p, payload[1]);
+        PUSH_FIELD(p, payload[0]);
+        break;
+
+    case THUNK_1_0:
+        markQueuePushThunkSrt(queue, info);
+        PUSH_FIELD((StgThunk *) p, payload[0]);
+        break;
+
+    case FUN_1_0:
+        markQueuePushFunSrt(queue, info);
+        PUSH_FIELD(p, payload[0]);
+        break;
+
+    case CONSTR_1_0:
+        PUSH_FIELD(p, payload[0]);
+        break;
+
+    case THUNK_0_1:
+        markQueuePushThunkSrt(queue, info);
+        break;
+
+    case FUN_0_1:
+        markQueuePushFunSrt(queue, info);
+        break;
+
+    case CONSTR_0_1:
+    case CONSTR_0_2:
+        break;
+
+    case THUNK_0_2:
+        markQueuePushThunkSrt(queue, info);
+        break;
+
+    case FUN_0_2:
+        markQueuePushFunSrt(queue, info);
+        break;
+
+    case THUNK_1_1:
+        markQueuePushThunkSrt(queue, info);
+        PUSH_FIELD((StgThunk *) p, payload[0]);
+        break;
+
+    case FUN_1_1:
+        markQueuePushFunSrt(queue, info);
+        PUSH_FIELD(p, payload[0]);
+        break;
+
+    case CONSTR_1_1:
+        PUSH_FIELD(p, payload[0]);
+        break;
+
+    case FUN:
+        markQueuePushFunSrt(queue, info);
+        goto gen_obj;
+
+    case THUNK: {
+        markQueuePushThunkSrt(queue, info);
+        for (StgWord i = 0; i < info->layout.payload.ptrs; i++) {
+            StgClosure **field = &((StgThunk *) p)->payload[i];
+            markQueuePushClosure(queue, *field, field);
+        }
+        break;
+    }
+
+    gen_obj:
+    case CONSTR:
+    case CONSTR_NOCAF:
+    case WEAK:
+    case PRIM:
+    {
+        for (StgWord i = 0; i < info->layout.payload.ptrs; i++) {
+            StgClosure **field = &((StgClosure *) p)->payload[i];
+            markQueuePushClosure(queue, *field, field);
+        }
+        break;
+    }
+
+    case BCO: {
+        StgBCO *bco = (StgBCO *)p;
+        PUSH_FIELD(bco, instrs);
+        PUSH_FIELD(bco, literals);
+        PUSH_FIELD(bco, ptrs);
+        break;
+    }
+
+
+    case IND: {
+        PUSH_FIELD((StgInd *) p, indirectee);
+        if (origin != NULL) {
+            p_next = ((StgInd*)p)->indirectee;
+        }
+        break;
+    }
+
+    case BLACKHOLE: {
+        PUSH_FIELD((StgInd *) p, indirectee);
+        StgClosure *indirectee = ((StgInd*)p)->indirectee;
+        if (GET_CLOSURE_TAG(indirectee) == 0 || origin == NULL) {
+            // do nothing
+        } else {
+            p_next = indirectee;
+        }
+        break;
+    }
+
+    case MUT_VAR_CLEAN:
+    case MUT_VAR_DIRTY:
+        PUSH_FIELD((StgMutVar *)p, var);
+        break;
+
+    case BLOCKING_QUEUE: {
+        StgBlockingQueue *bq = (StgBlockingQueue *)p;
+        PUSH_FIELD(bq, bh);
+        PUSH_FIELD(bq, owner);
+        PUSH_FIELD(bq, queue);
+        PUSH_FIELD(bq, link);
+        break;
+    }
+
+    case THUNK_SELECTOR:
+        if (RtsFlags.GcFlags.nonmovingSelectorOpt) {
+            nonmoving_eval_thunk_selector(queue, (StgSelector*)p, origin);
+        } else {
+            PUSH_FIELD((StgSelector *) p, selectee);
+        }
+        break;
+
+    case AP_STACK: {
+        StgAP_STACK *ap = (StgAP_STACK *)p;
+        PUSH_FIELD(ap, fun);
+        mark_stack_(queue, (StgPtr) ap->payload, (StgPtr) ap->payload + ap->size);
+        break;
+    }
+
+    case PAP: {
+        StgPAP *pap = (StgPAP *) p;
+        PUSH_FIELD(pap, fun);
+        mark_PAP_payload(queue, pap->fun, pap->payload, pap->n_args);
+        break;
+    }
+
+    case AP: {
+        StgAP *ap = (StgAP *) p;
+        PUSH_FIELD(ap, fun);
+        mark_PAP_payload(queue, ap->fun, ap->payload, ap->n_args);
+        break;
+    }
+
+    case ARR_WORDS:
+        // nothing to follow
+        break;
+
+    case MUT_ARR_PTRS_CLEAN:
+    case MUT_ARR_PTRS_DIRTY:
+    case MUT_ARR_PTRS_FROZEN_CLEAN:
+    case MUT_ARR_PTRS_FROZEN_DIRTY:
+        // TODO: Check this against Scav.c
+        markQueuePushArray(queue, (StgMutArrPtrs *) p, 0);
+        break;
+
+    case SMALL_MUT_ARR_PTRS_CLEAN:
+    case SMALL_MUT_ARR_PTRS_DIRTY:
+    case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN:
+    case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY: {
+        StgSmallMutArrPtrs *arr = (StgSmallMutArrPtrs *) p;
+        for (StgWord i = 0; i < arr->ptrs; i++) {
+            StgClosure **field = &arr->payload[i];
+            markQueuePushClosure(queue, *field, field);
+        }
+        break;
+    }
+
+    case TSO:
+        mark_tso(queue, (StgTSO *) p);
+        break;
+
+    case STACK: {
+        // See Note [StgStack dirtiness flags and concurrent marking]
+        StgStack *stack = (StgStack *) p;
+        StgWord8 marking = stack->marking;
+
+        // N.B. stack->marking must be != nonmovingMarkEpoch unless
+        // someone has already marked it.
+        if (cas_word8(&stack->marking, marking, nonmovingMarkEpoch)
+              != nonmovingMarkEpoch) {
+            // We have claimed the right to mark the stack.
+            mark_stack(queue, stack);
+        } else {
+            // A mutator has already started marking the stack; we just let it
+            // do its thing and move on. There's no reason to wait; we know that
+            // the stack will be fully marked before we sweep due to the final
+            // post-mark synchronization. Most importantly, we do not set its
+            // mark bit, the mutator is responsible for this.
+            goto done;
+        }
+        break;
+    }
+
+    case MUT_PRIM: {
+        for (StgHalfWord p_idx = 0; p_idx < info->layout.payload.ptrs; ++p_idx) {
+            StgClosure **field = &p->payload[p_idx];
+            markQueuePushClosure(queue, *field, field);
+        }
+        break;
+    }
+
+    case TREC_CHUNK: {
+        // TODO: Should we abort here? This should have already been marked
+        // when we dirtied the TSO
+        StgTRecChunk *tc = ((StgTRecChunk *) p);
+        PUSH_FIELD(tc, prev_chunk);
+        TRecEntry *end = &tc->entries[tc->next_entry_idx];
+        for (TRecEntry *e = &tc->entries[0]; e < end; e++) {
+            markQueuePushClosure_(queue, (StgClosure *) e->tvar);
+            markQueuePushClosure_(queue, (StgClosure *) e->expected_value);
+            markQueuePushClosure_(queue, (StgClosure *) e->new_value);
+        }
+        break;
+    }
+
+    case WHITEHOLE:
+        while (get_volatile_itbl(p)->type == WHITEHOLE);
+        goto try_again;
+
+    case COMPACT_NFDATA:
+        break;
+
+    default:
+        barf("mark_closure: unimplemented/strange closure type %d @ %p",
+             info->type, p);
+    }
+
+#   undef PUSH_FIELD
+
+    /* Set the mark bit: it's important that we do this only after we actually push
+     * the object's pointers since in the case of marking stacks there may be a
+     * mutator waiting for us to finish so it can start execution.
+     */
+    if (bd->flags & BF_COMPACT) {
+        StgCompactNFData *str = objectGetCompact((StgClosure*)p);
+        dbl_link_remove(bd, &nonmoving_compact_objects);
+        dbl_link_onto(bd, &nonmoving_marked_compact_objects);
+        StgWord blocks = str->totalW / BLOCK_SIZE_W;
+        n_nonmoving_compact_blocks -= blocks;
+        n_nonmoving_marked_compact_blocks += blocks;
+        bd->flags |= BF_MARKED;
+    } else if (bd->flags & BF_LARGE) {
+        /* Marking a large object isn't idempotent since we move it to
+         * nonmoving_marked_large_objects; to ensure that we don't repeatedly
+         * mark a large object, we only set BF_MARKED on large objects in the
+         * nonmoving heap while holding nonmoving_large_objects_mutex
+         */
+        ACQUIRE_LOCK(&nonmoving_large_objects_mutex);
+        if (! (bd->flags & BF_MARKED)) {
+            // Remove the object from nonmoving_large_objects and link it to
+            // nonmoving_marked_large_objects
+            dbl_link_remove(bd, &nonmoving_large_objects);
+            dbl_link_onto(bd, &nonmoving_marked_large_objects);
+            n_nonmoving_large_blocks -= bd->blocks;
+            n_nonmoving_marked_large_blocks += bd->blocks;
+            bd->flags |= BF_MARKED;
+        }
+        RELEASE_LOCK(&nonmoving_large_objects_mutex);
+    } else if (bd->flags & BF_NONMOVING) {
+        // TODO: Kill repetition
+        struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) p);
+        nonmoving_block_idx block_idx = nonmovingGetBlockIdx((StgPtr) p);
+        nonmovingSetMark(seg, block_idx);
+        nonmoving_live_words += nonmovingSegmentBlockSize(seg) / sizeof(W_);
+    }
+
+    // If we found an indirection to shortcut, keep going.
+    if (p_next) {
+        p = p_next;
+        goto try_again;
+    }
+
+done:
+    if (origin != NULL && (!HEAP_ALLOCED(p) || bd->flags & BF_NONMOVING)) {
+        if (UNTAG_CLOSURE((StgClosure*)p0) != p && *origin == p0) {
+            if (cas((StgVolatilePtr)origin, (StgWord)p0, (StgWord)TAG_CLOSURE(tag, p)) == (StgWord)p0) {
+                // debugBelch("Thunk optimization successful\n");
+            }
+        }
+    }
+}
+
+/* This is the main mark loop.
+ * Invariants:
+ *
+ *  a. nonmovingPrepareMark has been called.
+ *  b. the nursery has been fully evacuated into the non-moving generation.
+ *  c. the mark queue has been seeded with a set of roots.
+ *
+ */
+GNUC_ATTR_HOT void
+nonmovingMark (MarkQueue *queue)
+{
+    traceConcMarkBegin();
+    debugTrace(DEBUG_nonmoving_gc, "Starting mark pass");
+    unsigned int count = 0;
+    while (true) {
+        count++;
+        MarkQueueEnt ent = markQueuePop(queue);
+
+        switch (nonmovingMarkQueueEntryType(&ent)) {
+        case MARK_CLOSURE:
+            mark_closure(queue, ent.mark_closure.p, ent.mark_closure.origin);
+            break;
+        case MARK_ARRAY: {
+            const StgMutArrPtrs *arr = ent.mark_array.array;
+            StgWord start = ent.mark_array.start_index >> 16;
+            StgWord end = start + MARK_ARRAY_CHUNK_LENGTH;
+            if (end < arr->ptrs) {
+                markQueuePushArray(queue, ent.mark_array.array, end);
+            } else {
+                end = arr->ptrs;
+            }
+            for (StgWord i = start; i < end; i++) {
+                markQueuePushClosure_(queue, arr->payload[i]);
+            }
+            break;
+        }
+        case NULL_ENTRY:
+            // Perhaps the update remembered set has more to mark...
+            if (upd_rem_set_block_list) {
+                ACQUIRE_LOCK(&upd_rem_set_lock);
+                bdescr *old = queue->blocks;
+                queue->blocks = upd_rem_set_block_list;
+                queue->top = (MarkQueueBlock *) queue->blocks->start;
+                upd_rem_set_block_list = NULL;
+                RELEASE_LOCK(&upd_rem_set_lock);
+
+                ACQUIRE_SM_LOCK;
+                freeGroup(old);
+                RELEASE_SM_LOCK;
+            } else {
+                // Nothing more to do
+                debugTrace(DEBUG_nonmoving_gc, "Finished mark pass: %d", count);
+                traceConcMarkEnd(count);
+                return;
+            }
+        }
+    }
+}
+
+// A variant of `isAlive` that works for non-moving heap. Used for:
+//
+// - Collecting weak pointers; checking key of a weak pointer.
+// - Resurrecting threads; checking if a thread is dead.
+// - Sweeping object lists: large_objects, mut_list, stable_name_table.
+//
+// This may only be used after a full mark but before nonmovingSweep as it
+// relies on the correctness of the next_free_snap and mark bitmaps.
+bool nonmovingIsAlive (StgClosure *p)
+{
+    // Ignore static closures. See comments in `isAlive`.
+    if (!HEAP_ALLOCED_GC(p)) {
+        return true;
+    }
+
+    bdescr *bd = Bdescr((P_)p);
+
+    // All non-static objects in the non-moving heap should be marked as
+    // BF_NONMOVING
+    ASSERT(bd->flags & BF_NONMOVING);
+
+    if (bd->flags & (BF_COMPACT | BF_LARGE)) {
+        if (bd->flags & BF_COMPACT) {
+            StgCompactNFData *str = objectGetCompact((StgClosure*)p);
+            bd = Bdescr((P_)str);
+        }
+        return (bd->flags & BF_NONMOVING_SWEEPING) == 0
+                   // the large object wasn't in the snapshot and therefore wasn't marked
+            || (bd->flags & BF_MARKED) != 0;
+                   // The object was marked
+    } else {
+        struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) p);
+        nonmoving_block_idx i = nonmovingGetBlockIdx((StgPtr) p);
+        uint8_t mark =  nonmovingGetMark(seg, i);
+        if (i >= nonmovingSegmentInfo(seg)->next_free_snap) {
+            // If the object is allocated after next_free_snap then one of the
+            // following must be true:
+            //
+            // * if its mark is 0 then the block was not allocated last time
+            //   the segment was swept; however, it may have been allocated since
+            //   then and therefore we must conclude that the block is alive.
+            //
+            // * if its mark is equal to nonmovingMarkEpoch then we found that
+            //   the object was alive in the snapshot of the current GC (recall
+            //   that this function may only be used after a mark).
+            //   Consequently we must conclude that the object is still alive.
+            //
+            // * if its mark is not equal to nonmovingMarkEpoch then we found
+            //   that the object was not reachable in the last snapshot.
+            //   Assuming that the mark is complete we can conclude that the
+            //   object is dead since the snapshot invariant guarantees that
+            //   all objects alive in the snapshot would be marked.
+            //
+            return mark == nonmovingMarkEpoch || mark == 0;
+        } else {
+            // If the object is below next_free_snap then the snapshot
+            // invariant guarantees that it is marked if reachable.
+            return mark == nonmovingMarkEpoch;
+        }
+    }
+}
+
+// Check whether a snapshotted object is alive. That is, for an object that we
+// know to be in the snapshot: is its mark bit set? It is imperative that the
+// object is in the snapshot (e.g. was in the nonmoving heap at the time that
+// the snapshot was taken) since we assume that its mark bit reflects its
+// reachability.
+//
+// This is used when
+//
+// - Collecting weak pointers; checking key of a weak pointer.
+// - Resurrecting threads; checking if a thread is dead.
+// - Sweeping object lists: large_objects, mut_list, stable_name_table.
+//
+static bool nonmovingIsNowAlive (StgClosure *p)
+{
+    // Ignore static closures. See comments in `isAlive`.
+    if (!HEAP_ALLOCED_GC(p)) {
+        return true;
+    }
+
+    bdescr *bd = Bdescr((P_)p);
+
+    // All non-static objects in the non-moving heap should be marked as
+    // BF_NONMOVING
+    ASSERT(bd->flags & BF_NONMOVING);
+
+    if (bd->flags & BF_LARGE) {
+        return (bd->flags & BF_NONMOVING_SWEEPING) == 0
+                   // the large object wasn't in the snapshot and therefore wasn't marked
+            || (bd->flags & BF_MARKED) != 0;
+                   // The object was marked
+    } else {
+        return nonmovingClosureMarkedThisCycle((P_)p);
+    }
+}
+
+// Non-moving heap variant of `tidyWeakList`
+bool nonmovingTidyWeaks (struct MarkQueue_ *queue)
+{
+    bool did_work = false;
+
+    StgWeak **last_w = &nonmoving_old_weak_ptr_list;
+    StgWeak *next_w;
+    for (StgWeak *w = nonmoving_old_weak_ptr_list; w != NULL; w = next_w) {
+        if (w->header.info == &stg_DEAD_WEAK_info) {
+            // finalizeWeak# was called on the weak
+            next_w = w->link;
+            *last_w = next_w;
+            continue;
+        }
+
+        // Otherwise it's a live weak
+        ASSERT(w->header.info == &stg_WEAK_info);
+
+        if (nonmovingIsNowAlive(w->key)) {
+            nonmovingMarkLiveWeak(queue, w);
+            did_work = true;
+
+            // remove this weak ptr from old_weak_ptr list
+            *last_w = w->link;
+            next_w = w->link;
+
+            // and put it on the weak ptr list
+            w->link = nonmoving_weak_ptr_list;
+            nonmoving_weak_ptr_list = w;
+        } else {
+            last_w = &(w->link);
+            next_w = w->link;
+        }
+    }
+
+    return did_work;
+}
+
+void nonmovingMarkDeadWeak (struct MarkQueue_ *queue, StgWeak *w)
+{
+    if (w->cfinalizers != &stg_NO_FINALIZER_closure) {
+        markQueuePushClosure_(queue, w->value);
+    }
+    markQueuePushClosure_(queue, w->finalizer);
+}
+
+void nonmovingMarkLiveWeak (struct MarkQueue_ *queue, StgWeak *w)
+{
+    ASSERT(nonmovingClosureMarkedThisCycle((P_)w));
+    markQueuePushClosure_(queue, w->value);
+    markQueuePushClosure_(queue, w->finalizer);
+    markQueuePushClosure_(queue, w->cfinalizers);
+}
+
+// When we're done with marking, any weak pointers with non-marked keys will be
+// considered "dead". We mark values and finalizers of such weaks, and then
+// schedule them for finalization in `scheduleFinalizers` (which we run during
+// synchronization).
+void nonmovingMarkDeadWeaks (struct MarkQueue_ *queue, StgWeak **dead_weaks)
+{
+    StgWeak *next_w;
+    for (StgWeak *w = nonmoving_old_weak_ptr_list; w; w = next_w) {
+        ASSERT(!nonmovingClosureMarkedThisCycle((P_)(w->key)));
+        nonmovingMarkDeadWeak(queue, w);
+        next_w = w ->link;
+        w->link = *dead_weaks;
+        *dead_weaks = w;
+    }
+}
+
+// Non-moving heap variant of `tidyThreadList`
+void nonmovingTidyThreads ()
+{
+    StgTSO *next;
+    StgTSO **prev = &nonmoving_old_threads;
+    for (StgTSO *t = nonmoving_old_threads; t != END_TSO_QUEUE; t = next) {
+
+        next = t->global_link;
+
+        // N.B. This thread is in old_threads, consequently we *know* it is in
+        // the snapshot and it is therefore safe to rely on the bitmap to
+        // determine its reachability.
+        if (nonmovingIsNowAlive((StgClosure*)t)) {
+            // alive
+            *prev = next;
+
+            // move this thread onto threads list
+            t->global_link = nonmoving_threads;
+            nonmoving_threads = t;
+        } else {
+            // not alive (yet): leave this thread on the old_threads list
+            prev = &(t->global_link);
+        }
+    }
+}
+
+void nonmovingResurrectThreads (struct MarkQueue_ *queue, StgTSO **resurrected_threads)
+{
+    StgTSO *next;
+    for (StgTSO *t = nonmoving_old_threads; t != END_TSO_QUEUE; t = next) {
+        next = t->global_link;
+
+        switch (t->what_next) {
+        case ThreadKilled:
+        case ThreadComplete:
+            continue;
+        default:
+            markQueuePushClosure_(queue, (StgClosure*)t);
+            t->global_link = *resurrected_threads;
+            *resurrected_threads = t;
+        }
+    }
+}
+
+#if defined(DEBUG)
+
+void printMarkQueueEntry (MarkQueueEnt *ent)
+{
+    switch(nonmovingMarkQueueEntryType(ent)) {
+      case MARK_CLOSURE:
+        debugBelch("Closure: ");
+        printClosure(ent->mark_closure.p);
+        break;
+      case MARK_ARRAY:
+        debugBelch("Array\n");
+        break;
+      case NULL_ENTRY:
+        debugBelch("End of mark\n");
+        break;
+    }
+}
+
+void printMarkQueue (MarkQueue *q)
+{
+    debugBelch("======== MARK QUEUE ========\n");
+    for (bdescr *block = q->blocks; block; block = block->link) {
+        MarkQueueBlock *queue = (MarkQueueBlock*)block->start;
+        for (uint32_t i = 0; i < queue->head; ++i) {
+            printMarkQueueEntry(&queue->entries[i]);
+        }
+    }
+    debugBelch("===== END OF MARK QUEUE ====\n");
+}
+
+#endif
diff --git a/rts/sm/NonMovingMark.h b/rts/sm/NonMovingMark.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe150f47cb4971e575c0864b69f05e28d95e0293
--- /dev/null
+++ b/rts/sm/NonMovingMark.h
@@ -0,0 +1,205 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2018
+ *
+ * Non-moving garbage collector and allocator: Mark phase
+ *
+ * ---------------------------------------------------------------------------*/
+
+#pragma once
+
+#include "Hash.h"
+#include "Task.h"
+#include "NonMoving.h"
+
+#include "BeginPrivate.h"
+
+#include "Hash.h"
+
+enum EntryType {
+    NULL_ENTRY = 0,
+    MARK_CLOSURE,
+    MARK_ARRAY
+};
+
+/* Note [Origin references in the nonmoving collector]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * To implement indirection short-cutting and the selector optimisation the
+ * collector needs to know where it found references, so it can update the
+ * reference if it later turns out that points to an indirection. For this
+ * reason, each mark queue entry contains two things:
+ *
+ * - a pointer to the object to be marked (p), and
+ *
+ * - a pointer to the field where we found the reference (origin)
+ *
+ * Note that the origin pointer is an interior pointer: it points not to a
+ * valid closure (with info table pointer) but rather to a field inside a closure.
+ * Since such references can't be safely scavenged we establish the invariant
+ * that the origin pointer may only point to a field of an object living in the
+ * nonmoving heap, where no scavenging is needed.
+ *
+ */
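+
+/* For example (illustrative; see the end of mark_closure in NonMovingMark.c
+ * for the real code): if we pushed field f of closure c, so origin == &c->f,
+ * and the pushed object turns out to be an indirection, the mark loop may
+ * shortcut the reference roughly as
+ *
+ *     if (*origin == p0)
+ *         cas((StgVolatilePtr) origin, (StgWord) p0,
+ *             (StgWord) TAG_CLOSURE(tag, indirectee));
+ *
+ * which relies on the invariant above that origin points into the nonmoving
+ * heap.
+ */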
+
+typedef struct {
+    // Which kind of mark queue entry we have is determined by the low bits of
+    // the second word: they must be zero in the case of a mark_closure entry
+    // (since the second word of a mark_closure entry points to a pointer and
+    // pointers must be word-aligned).  In the case of a mark_array we set them
+    // to 0x3 (the value of start_index is shifted to the left to accommodate
+    // this). A null_entry, where p == NULL, indicates the end of the queue.
+    union {
+        struct {
+            void *p;              // must be NULL
+        } null_entry;
+        struct {
+            StgClosure *p;        // the object to be marked
+            StgClosure **origin;  // field where this reference was found.
+                                  // See Note [Origin references in the nonmoving collector]
+        } mark_closure;
+        struct {
+            const StgMutArrPtrs *array;
+            StgWord start_index;  // start index is shifted to the left by 16 bits
+        } mark_array;
+    };
+} MarkQueueEnt;
+
+INLINE_HEADER enum EntryType nonmovingMarkQueueEntryType(MarkQueueEnt *ent)
+{
+    if (ent->null_entry.p == NULL) {
+        return NULL_ENTRY;
+    } else if (((uintptr_t) ent->mark_closure.origin & TAG_BITS) == 0) {
+        return MARK_CLOSURE;
+    } else {
+        ASSERT((ent->mark_array.start_index & TAG_BITS) == 0x3);
+        return MARK_ARRAY;
+    }
+}
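+
+/* As an illustrative sketch of the encoding described above (the actual
+ * construction lives in NonMovingMark.c), an array entry would be built
+ * roughly as
+ *
+ *     MarkQueueEnt ent = {
+ *         .mark_array = {
+ *             .array = array,
+ *             .start_index = (start_index << 16) | 0x3,
+ *         },
+ *     };
+ *
+ * so that its low tag bits distinguish it from the word-aligned origin pointer
+ * of a mark_closure entry.
+ */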
+
+typedef struct {
+    // index of first *unused* queue entry
+    uint32_t head;
+
+    MarkQueueEnt entries[];
+} MarkQueueBlock;
+
+// How far ahead in the mark queue to prefetch?
+#define MARK_PREFETCH_QUEUE_DEPTH 5
+
+/* The mark queue is not capable of concurrent read or write.
+ *
+ * invariants:
+ *
+ *  a. top == blocks->start;
+ *  b. there is always a valid MarkQueueBlock, although it may be empty
+ *     (i.e. top->head == 0).
+ */
+typedef struct MarkQueue_ {
+    // A singly-linked list of blocks, each containing a MarkQueueBlock.
+    bdescr *blocks;
+
+    // Cached value of blocks->start.
+    MarkQueueBlock *top;
+
+    // Is this a mark queue or a capability-local update remembered set?
+    bool is_upd_rem_set;
+
+#if MARK_PREFETCH_QUEUE_DEPTH > 0
+    // A ring-buffer of entries which we will mark next
+    MarkQueueEnt prefetch_queue[MARK_PREFETCH_QUEUE_DEPTH];
+    // The first free slot in prefetch_queue.
+    uint8_t prefetch_head;
+#endif
+} MarkQueue;
+
+/* While it shares its representation with MarkQueue, UpdRemSet differs in
+ * behavior when pushing: full blocks are immediately pushed to the global
+ * update remembered set rather than accumulated into a chain. We make this
+ * distinction apparent in the types.
+ */
+typedef struct {
+    MarkQueue queue;
+} UpdRemSet;
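+
+/* Mutator-side usage sketch (the field names are illustrative): before a
+ * pointer field of an object reachable from the snapshot is overwritten, the
+ * old referent is pushed so that the concurrent mark can still trace it:
+ *
+ *     updateRemembSetPushClosure(cap, (StgClosure *) obj->field);
+ *     obj->field = new_value;
+ */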
+
+// Number of blocks to allocate for a mark queue
+#define MARK_QUEUE_BLOCKS 16
+
+// The length of MarkQueueBlock.entries
+#define MARK_QUEUE_BLOCK_ENTRIES ((MARK_QUEUE_BLOCKS * BLOCK_SIZE - sizeof(MarkQueueBlock)) / sizeof(MarkQueueEnt))
+
+extern bdescr *nonmoving_large_objects, *nonmoving_marked_large_objects,
+              *nonmoving_compact_objects, *nonmoving_marked_compact_objects;
+extern memcount n_nonmoving_large_blocks, n_nonmoving_marked_large_blocks,
+                n_nonmoving_compact_blocks, n_nonmoving_marked_compact_blocks;
+
+extern StgTSO *nonmoving_old_threads;
+extern StgWeak *nonmoving_old_weak_ptr_list;
+extern StgTSO *nonmoving_threads;
+extern StgWeak *nonmoving_weak_ptr_list;
+
+#if defined(DEBUG)
+extern StgIndStatic *debug_caf_list_snapshot;
+#endif
+
+extern MarkQueue *current_mark_queue;
+extern bdescr *upd_rem_set_block_list;
+
+
+void nonmovingMarkInitUpdRemSet(void);
+
+void init_upd_rem_set(UpdRemSet *rset);
+void reset_upd_rem_set(UpdRemSet *rset);
+void updateRemembSetPushClosure(Capability *cap, StgClosure *p);
+void updateRemembSetPushThunk(Capability *cap, StgThunk *p);
+void updateRemembSetPushTSO(Capability *cap, StgTSO *tso);
+void updateRemembSetPushStack(Capability *cap, StgStack *stack);
+
+#if defined(THREADED_RTS)
+void nonmovingFlushCapUpdRemSetBlocks(Capability *cap);
+void nonmovingBeginFlush(Task *task);
+bool nonmovingWaitForFlush(void);
+void nonmovingFinishFlush(Task *task);
+#endif
+
+void markQueueAddRoot(MarkQueue* q, StgClosure** root);
+
+void initMarkQueue(MarkQueue *queue);
+void freeMarkQueue(MarkQueue *queue);
+void nonmovingMark(struct MarkQueue_ *restrict queue);
+
+bool nonmovingTidyWeaks(struct MarkQueue_ *queue);
+void nonmovingTidyThreads(void);
+void nonmovingMarkDeadWeaks(struct MarkQueue_ *queue, StgWeak **dead_weak_ptr_list);
+void nonmovingResurrectThreads(struct MarkQueue_ *queue, StgTSO **resurrected_threads);
+bool nonmovingIsAlive(StgClosure *p);
+void nonmovingMarkDeadWeak(struct MarkQueue_ *queue, StgWeak *w);
+void nonmovingMarkLiveWeak(struct MarkQueue_ *queue, StgWeak *w);
+void nonmovingAddUpdRemSetBlocks(struct MarkQueue_ *rset);
+
+void markQueuePush(MarkQueue *q, const MarkQueueEnt *ent);
+void markQueuePushClosureGC(MarkQueue *q, StgClosure *p);
+void markQueuePushClosure(MarkQueue *q,
+                             StgClosure *p,
+                             StgClosure **origin);
+void markQueuePushClosure_(MarkQueue *q, StgClosure *p);
+void markQueuePushThunkSrt(MarkQueue *q, const StgInfoTable *info);
+void markQueuePushFunSrt(MarkQueue *q, const StgInfoTable *info);
+void markQueuePushArray(MarkQueue *q, const StgMutArrPtrs *array, StgWord start_index);
+void updateRemembSetPushThunkEager(Capability *cap,
+                                  const StgThunkInfoTable *orig_info,
+                                  StgThunk *thunk);
+
+INLINE_HEADER bool markQueueIsEmpty(MarkQueue *q)
+{
+    return (q->blocks == NULL) || (q->top->head == 0 && q->blocks->link == NULL);
+}
+
+#if defined(DEBUG)
+
+void printMarkQueueEntry(MarkQueueEnt *ent);
+void printMarkQueue(MarkQueue *q);
+
+#endif
+
+#include "EndPrivate.h"
diff --git a/rts/sm/NonMovingScav.c b/rts/sm/NonMovingScav.c
new file mode 100644
index 0000000000000000000000000000000000000000..9583c7baf9ec7140aafd95043aceedf23c936539
--- /dev/null
+++ b/rts/sm/NonMovingScav.c
@@ -0,0 +1,389 @@
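+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2018
+ *
+ * Non-moving garbage collector and allocator:
+ * Scavenging of objects living in the non-moving heap
+ *
+ * ---------------------------------------------------------------------------*/
+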
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "NonMoving.h"
+#include "NonMovingScav.h"
+#include "Capability.h"
+#include "Scav.h"
+#include "Evac.h"
+#include "GCThread.h" // for GCUtils.h
+#include "GCUtils.h"
+#include "Printer.h"
+#include "MarkWeak.h" // scavengeLiveWeak
+
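+/* Scavenge a single object living in the non-moving heap: evacuate everything
+ * it points to and, as in the moving collector, arrange for it to be put back
+ * on the oldest generation's mut_list if it is mutable or was found to refer
+ * to a younger generation (see the failed_to_evac handling below).
+ */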
+void
+nonmovingScavengeOne (StgClosure *q)
+{
+    ASSERT(LOOKS_LIKE_CLOSURE_PTR(q));
+    StgPtr p = (StgPtr)q;
+    const StgInfoTable *info = get_itbl(q);
+    const bool saved_eager_promotion = gct->eager_promotion;
+
+    switch (info->type) {
+
+    case MVAR_CLEAN:
+    case MVAR_DIRTY:
+    {
+        StgMVar *mvar = ((StgMVar *)p);
+        gct->eager_promotion = false;
+        evacuate((StgClosure **)&mvar->head);
+        evacuate((StgClosure **)&mvar->tail);
+        evacuate((StgClosure **)&mvar->value);
+        gct->eager_promotion = saved_eager_promotion;
+        if (gct->failed_to_evac) {
+            mvar->header.info = &stg_MVAR_DIRTY_info;
+        } else {
+            mvar->header.info = &stg_MVAR_CLEAN_info;
+        }
+        break;
+    }
+
+    case TVAR:
+    {
+        StgTVar *tvar = ((StgTVar *)p);
+        gct->eager_promotion = false;
+        evacuate((StgClosure **)&tvar->current_value);
+        evacuate((StgClosure **)&tvar->first_watch_queue_entry);
+        gct->eager_promotion = saved_eager_promotion;
+        if (gct->failed_to_evac) {
+            tvar->header.info = &stg_TVAR_DIRTY_info;
+        } else {
+            tvar->header.info = &stg_TVAR_CLEAN_info;
+        }
+        break;
+    }
+
+    case FUN_2_0:
+        scavenge_fun_srt(info);
+        evacuate(&((StgClosure *)p)->payload[1]);
+        evacuate(&((StgClosure *)p)->payload[0]);
+        break;
+
+    case THUNK_2_0:
+        scavenge_thunk_srt(info);
+        evacuate(&((StgThunk *)p)->payload[1]);
+        evacuate(&((StgThunk *)p)->payload[0]);
+        break;
+
+    case CONSTR_2_0:
+        evacuate(&((StgClosure *)p)->payload[1]);
+        evacuate(&((StgClosure *)p)->payload[0]);
+        break;
+
+    case THUNK_1_0:
+        scavenge_thunk_srt(info);
+        evacuate(&((StgThunk *)p)->payload[0]);
+        break;
+
+    case FUN_1_0:
+        scavenge_fun_srt(info);
+        FALLTHROUGH;
+    case CONSTR_1_0:
+        evacuate(&((StgClosure *)p)->payload[0]);
+        break;
+
+    case THUNK_0_1:
+        scavenge_thunk_srt(info);
+        break;
+
+    case FUN_0_1:
+        scavenge_fun_srt(info);
+        FALLTHROUGH;
+    case CONSTR_0_1:
+        break;
+
+    case THUNK_0_2:
+        scavenge_thunk_srt(info);
+        break;
+
+    case FUN_0_2:
+        scavenge_fun_srt(info);
+        FALLTHROUGH;
+    case CONSTR_0_2:
+        break;
+
+    case THUNK_1_1:
+        scavenge_thunk_srt(info);
+        evacuate(&((StgThunk *)p)->payload[0]);
+        break;
+
+    case FUN_1_1:
+        scavenge_fun_srt(info);
+        FALLTHROUGH;
+    case CONSTR_1_1:
+        evacuate(&q->payload[0]);
+        break;
+
+    case FUN:
+        scavenge_fun_srt(info);
+        goto gen_obj;
+
+    case THUNK:
+    {
+        scavenge_thunk_srt(info);
+        StgPtr end = (P_)((StgThunk *)p)->payload + info->layout.payload.ptrs;
+        for (p = (P_)((StgThunk *)p)->payload; p < end; p++) {
+            evacuate((StgClosure **)p);
+        }
+        break;
+    }
+
+    case WEAK:
+    {
+        // We must evacuate the key since it may refer to an object in the
+        // moving heap which may be long gone by the time we call
+        // nonmovingTidyWeaks.
+        StgWeak *weak = (StgWeak *) p;
+        gct->eager_promotion = true;
+        evacuate(&weak->key);
+        gct->eager_promotion = saved_eager_promotion;
+        goto gen_obj;
+    }
+
+    gen_obj:
+    case CONSTR:
+    case CONSTR_NOCAF:
+    case PRIM:
+    {
+        StgPtr end = (P_)((StgClosure *)p)->payload + info->layout.payload.ptrs;
+        for (p = (P_)((StgClosure *)p)->payload; p < end; p++) {
+            evacuate((StgClosure **)p);
+        }
+        break;
+    }
+
+    case BCO: {
+        StgBCO *bco = (StgBCO *)p;
+        evacuate((StgClosure **)&bco->instrs);
+        evacuate((StgClosure **)&bco->literals);
+        evacuate((StgClosure **)&bco->ptrs);
+        break;
+    }
+
+    case MUT_VAR_CLEAN:
+    case MUT_VAR_DIRTY:
+        gct->eager_promotion = false;
+        evacuate(&((StgMutVar *)p)->var);
+        gct->eager_promotion = saved_eager_promotion;
+        if (gct->failed_to_evac) {
+            ((StgClosure *)q)->header.info = &stg_MUT_VAR_DIRTY_info;
+        } else {
+            ((StgClosure *)q)->header.info = &stg_MUT_VAR_CLEAN_info;
+        }
+        break;
+
+    case BLOCKING_QUEUE:
+    {
+        StgBlockingQueue *bq = (StgBlockingQueue *)p;
+
+        gct->eager_promotion = false;
+        evacuate(&bq->bh);
+        evacuate((StgClosure**)&bq->owner);
+        evacuate((StgClosure**)&bq->queue);
+        evacuate((StgClosure**)&bq->link);
+        gct->eager_promotion = saved_eager_promotion;
+
+        if (gct->failed_to_evac) {
+            bq->header.info = &stg_BLOCKING_QUEUE_DIRTY_info;
+        } else {
+            bq->header.info = &stg_BLOCKING_QUEUE_CLEAN_info;
+        }
+        break;
+    }
+
+    case THUNK_SELECTOR:
+    {
+        StgSelector *s = (StgSelector *)p;
+        evacuate(&s->selectee);
+        break;
+    }
+
+    // A chunk of stack saved in a heap object
+    case AP_STACK:
+    {
+        StgAP_STACK *ap = (StgAP_STACK *)p;
+
+        evacuate(&ap->fun);
+        scavenge_stack((StgPtr)ap->payload, (StgPtr)ap->payload + ap->size);
+        break;
+    }
+
+    case PAP:
+        p = scavenge_PAP((StgPAP *)p);
+        break;
+
+    case AP:
+        scavenge_AP((StgAP *)p);
+        break;
+
+    case ARR_WORDS:
+        // nothing to follow
+        break;
+
+    case MUT_ARR_PTRS_CLEAN:
+    case MUT_ARR_PTRS_DIRTY:
+    {
+        gct->eager_promotion = false;
+        scavenge_mut_arr_ptrs((StgMutArrPtrs*)p);
+        gct->eager_promotion = saved_eager_promotion;
+        if (gct->failed_to_evac) {
+            ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info;
+        } else {
+            ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_CLEAN_info;
+        }
+        gct->failed_to_evac = true; // always put it on the mutable list.
+        break;
+    }
+
+    case MUT_ARR_PTRS_FROZEN_CLEAN:
+    case MUT_ARR_PTRS_FROZEN_DIRTY:
+        // follow everything
+    {
+        scavenge_mut_arr_ptrs((StgMutArrPtrs*)p);
+
+        if (gct->failed_to_evac) {
+            ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_DIRTY_info;
+        } else {
+            ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_CLEAN_info;
+        }
+        break;
+    }
+
+    case SMALL_MUT_ARR_PTRS_CLEAN:
+    case SMALL_MUT_ARR_PTRS_DIRTY:
+        // follow everything
+    {
+        StgPtr next = p + small_mut_arr_ptrs_sizeW((StgSmallMutArrPtrs*)p);
+        gct->eager_promotion = false;
+        for (p = (P_)((StgSmallMutArrPtrs *)p)->payload; p < next; p++) {
+            evacuate((StgClosure **)p);
+        }
+        gct->eager_promotion = saved_eager_promotion;
+
+        if (gct->failed_to_evac) {
+            ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_DIRTY_info;
+        } else {
+            ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_CLEAN_info;
+        }
+        gct->failed_to_evac = true; // always put it on the mutable list.
+        break;
+    }
+
+    case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN:
+    case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY:
+        // follow everything
+    {
+        StgPtr next = p + small_mut_arr_ptrs_sizeW((StgSmallMutArrPtrs*)p);
+        for (p = (P_)((StgSmallMutArrPtrs *)p)->payload; p < next; p++) {
+            evacuate((StgClosure **)p);
+        }
+
+        if (gct->failed_to_evac) {
+            ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_DIRTY_info;
+        } else {
+            ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_CLEAN_info;
+        }
+        break;
+    }
+
+    case TSO:
+    {
+        scavengeTSO((StgTSO *)p);
+        break;
+    }
+
+    case STACK:
+    {
+        StgStack *stack = (StgStack*)p;
+
+        gct->eager_promotion = false;
+        scavenge_stack(stack->sp, stack->stack + stack->stack_size);
+        gct->eager_promotion = saved_eager_promotion;
+        stack->dirty = gct->failed_to_evac;
+        break;
+    }
+
+    case MUT_PRIM:
+    {
+        StgPtr end = (P_)((StgClosure *)p)->payload + info->layout.payload.ptrs;
+        gct->eager_promotion = false;
+        for (p = (P_)((StgClosure *)p)->payload; p < end; p++) {
+            evacuate((StgClosure **)p);
+        }
+        gct->eager_promotion = saved_eager_promotion;
+        gct->failed_to_evac = true; // mutable
+        break;
+    }
+
+    case TREC_CHUNK:
+      {
+        StgWord i;
+        StgTRecChunk *tc = ((StgTRecChunk *) p);
+        TRecEntry *e = &(tc -> entries[0]);
+        gct->eager_promotion = false;
+        evacuate((StgClosure **)&tc->prev_chunk);
+        for (i = 0; i < tc -> next_entry_idx; i ++, e++ ) {
+          evacuate((StgClosure **)&e->tvar);
+          evacuate((StgClosure **)&e->expected_value);
+          evacuate((StgClosure **)&e->new_value);
+        }
+        gct->eager_promotion = saved_eager_promotion;
+        gct->failed_to_evac = true; // mutable
+        break;
+      }
+
+    case IND:
+    case BLACKHOLE:
+    case IND_STATIC:
+        evacuate(&((StgInd *)p)->indirectee);
+        break;
+
+    case COMPACT_NFDATA:
+        scavenge_compact((StgCompactNFData*)p);
+        break;
+
+    default:
+        barf("nonmoving scavenge: unimplemented/strange closure type %d @ %p",
+             info->type, p);
+    }
+
+    if (gct->failed_to_evac) {
+        // Mutable object or points to a younger object, add to the mut_list
+        gct->failed_to_evac = false;
+        if (oldest_gen->no > 0) {
+            recordMutableGen_GC(q, oldest_gen->no);
+        }
+    }
+}
+
+/* Scavenge objects evacuated into a nonmoving segment by a minor GC */
+void
+scavengeNonmovingSegment (struct NonmovingSegment *seg)
+{
+    const StgWord blk_size = nonmovingSegmentBlockSize(seg);
+    gct->evac_gen_no = oldest_gen->no;
+    gct->failed_to_evac = false;
+
+    // scavenge objects between scan and free_ptr whose bitmap bits are 0
+    bdescr *seg_block = Bdescr((P_)seg);
+
+    ASSERT(seg_block->u.scan >= (P_)nonmovingSegmentGetBlock(seg, 0));
+    ASSERT(seg_block->u.scan <= (P_)nonmovingSegmentGetBlock(seg, seg->next_free));
+
+    StgPtr scan_end = (P_)nonmovingSegmentGetBlock(seg, seg->next_free);
+    if (seg_block->u.scan == scan_end)
+        return;
+
+    nonmoving_block_idx p_idx = nonmovingGetBlockIdx(seg_block->u.scan);
+    while (seg_block->u.scan < scan_end) {
+        StgClosure *p = (StgClosure*)seg_block->u.scan;
+
+        // bit set = was allocated in a previous GC, no need to scavenge
+        // bit not set = new allocation, so scavenge
+        if (nonmovingGetMark(seg, p_idx) == 0) {
+            nonmovingScavengeOne(p);
+        }
+
+        p_idx++;
+        seg_block->u.scan = (P_)(((uint8_t*)seg_block->u.scan) + blk_size);
+    }
+}
diff --git a/rts/sm/NonMovingScav.h b/rts/sm/NonMovingScav.h
new file mode 100644
index 0000000000000000000000000000000000000000..021385e1e99e35d2fb0e9c70559719e9d022423e
--- /dev/null
+++ b/rts/sm/NonMovingScav.h
@@ -0,0 +1,10 @@
+#pragma once
+
+#include "NonMoving.h"
+
+#include "BeginPrivate.h"
+
+void nonmovingScavengeOne(StgClosure *p);
+void scavengeNonmovingSegment(struct NonmovingSegment *seg);
+
+#include "EndPrivate.h"
diff --git a/rts/sm/NonMovingShortcut.c b/rts/sm/NonMovingShortcut.c
new file mode 100644
index 0000000000000000000000000000000000000000..83c4857677b4fd0327b130d844ad5a6c94a41f95
--- /dev/null
+++ b/rts/sm/NonMovingShortcut.c
@@ -0,0 +1,326 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2019
+ *
+ * Non-moving garbage collector and allocator:
+ * Indirection short-cutting and the selector optimisation
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+#include "GC.h"
+#include "SMPClosureOps.h"
+#include "NonMovingMark.h"
+#include "NonMovingShortcut.h"
+#include "Printer.h"
+
+#define MAX_THUNK_SELECTOR_DEPTH 16
+
+//#define SELECTOR_OPT_DEBUG
+
+#if defined(SELECTOR_OPT_DEBUG)
+static void
+print_selector_chain(StgClosure *p)
+{
+    debugBelch("Selector chain: %p", (void*)p);
+    StgClosure *next = p->payload[0];
+    while (next != NULL) {
+        debugBelch(", %p", next);
+        next = next->payload[0];
+    }
+    debugBelch("\n");
+}
+#endif
+
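+// Write `val` into every selector on `chain`. Each selector on the chain is
+// still locked and has had its payload[0] repurposed as the chain link by the
+// caller; here we overwrite it with an indirection to `val` and unlock it.
+// Finally, if an `origin` field was supplied, we try to update it with a CAS
+// (which may fail harmlessly if the field was mutated concurrently).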
+static void
+update_selector_chain(
+        StgClosure *chain,
+        StgClosure **origin,
+        StgSelector * const p0,
+        StgClosure * const val
+) {
+    ASSERT(val != NULL);
+
+    // Make sure we don't introduce non-moving-to-moving pointers here.
+    ASSERT(isNonmovingClosure(val));
+
+    // We can't handle this case because we don't know the closure's info
+    // pointer before we lock it.
+    ASSERT(chain != val);
+
+#if defined(SELECTOR_OPT_DEBUG)
+    if (chain != NULL) {
+        print_selector_chain(chain);
+        debugBelch("Value: ");
+        printClosure(val);
+    }
+#endif
+
+    while (chain) {
+        // debugBelch("chain: %p\n", (void*)chain);
+
+        StgClosure *next = chain->payload[0];
+
+        // We only update closures in the non-moving heap
+        ASSERT(isNonmovingClosure(chain));
+
+        ((StgInd*)chain)->indirectee = val;
+        unlockClosure((StgClosure*)chain, &stg_IND_info);
+
+        chain = next;
+    }
+
+    if (origin != NULL && (StgClosure*)p0 != val) {
+        cas((StgVolatilePtr)origin, (StgWord)p0, (StgWord)val);
+    }
+}
+
+// Returns the value of the selector thunk, which is a non-moving closure. If
+// it's not possible to evaluate the selector thunk, the return value is the
+// selector itself.
+static StgClosure*
+nonmoving_eval_thunk_selector_(
+        MarkQueue *queue,
+        StgSelector * const p0,
+        StgClosure ** const origin,
+        int depth
+) {
+    // This function should only be called on non-moving objects.
+    ASSERT(HEAP_ALLOCED_GC((P_)p0) && isNonmovingClosure((StgClosure*)p0));
+
+    markQueuePushClosure(queue, (StgClosure*)p0, NULL);
+
+    // INVARIANT: A non-moving object. Locked (below).
+    StgClosure *p = (StgClosure*)p0;
+
+    // Chain of non-moving selectors to update. These will be INDs to `p` when
+    // we reach a value. INVARIANT: All objects in the chain are locked, and
+    // in the non-moving heap.
+    StgClosure *chain = NULL;
+
+    // Variables to update: p.
+selector_changed:
+    ;
+
+    // debugBelch("Selector changed: %p\n", (void*)p);
+
+    // Lock the selector to avoid concurrent modification by mutators
+    const StgInfoTable *selector_info_ptr = lockClosure((StgClosure*)p);
+    StgInfoTable *selector_info_tbl = INFO_PTR_TO_STRUCT(selector_info_ptr);
+
+    if (selector_info_tbl->type != THUNK_SELECTOR) {
+        // Selector updated in the meantime, or we reached a value. Update
+        // the chain.
+        unlockClosure(p, selector_info_ptr);
+        update_selector_chain(chain, origin, p0, p);
+        return p;
+    }
+
+    // The closure is locked and it's a selector thunk. If the selectee is a
+    // CONSTR we do the selection here and the selected value will be the
+    // value of this selector thunk.
+    //
+    // Two cases:
+    //
+    // - If the selected value is also a selector thunk, then we loop and
+    //   evaluate it. The final value will be the value of both the current
+    //   selector and the selected value (which is also a selector thunk).
+    //
+    // - If the selectee is a selector thunk, we recursively evaluate it (up to
+    //   a certain depth, specified with MAX_THUNK_SELECTOR_DEPTH), then do the
+    //   selection on the value of it.
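+    //
+    // As a concrete (illustrative) example: a thunk for `snd pair` is a
+    // THUNK_SELECTOR whose selector_offset is 1 and whose selectee is `pair`;
+    // once `pair` is a CONSTR we can overwrite the thunk with an indirection
+    // to its second field.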
+
+    //
+    // Do the selection
+    //
+
+    uint32_t field = selector_info_tbl->layout.selector_offset;
+    StgClosure *selectee = UNTAG_CLOSURE(((StgSelector*)p)->selectee);
+
+    // Variables to update: selectee
+selectee_changed:
+    // debugBelch("Selectee changed: %p\n", (void*)selectee);
+
+    if (!isNonmovingClosure(selectee)) {
+        // The selectee is a moving object, and it may be moved by a concurrent
+        // minor GC while we read the info table and fields, so don't try to
+        // read the fields, just update the chain.
+        unlockClosure(p, selector_info_ptr);
+        update_selector_chain(chain, origin, p0, p);
+        return p;
+    }
+
+    // Selectee is a non-moving object, mark it.
+    markQueuePushClosure(queue, selectee, NULL);
+
+    const StgInfoTable *selectee_info_tbl = get_itbl(selectee);
+    switch (selectee_info_tbl->type) {
+        case WHITEHOLE: {
+            // Probably a loop. Abort.
+            unlockClosure(p, selector_info_ptr);
+            update_selector_chain(chain, origin, p0, p);
+            return p;
+        }
+
+        case CONSTR:
+        case CONSTR_1_0:
+        case CONSTR_0_1:
+        case CONSTR_2_0:
+        case CONSTR_1_1:
+        case CONSTR_0_2:
+        case CONSTR_NOCAF: {
+            // Selectee is a constructor in the non-moving heap.
+            // Select the field.
+
+            // Check that the field index is in range.
+            ASSERT(field < (StgWord32)(selectee_info_tbl->layout.payload.ptrs +
+                                       selectee_info_tbl->layout.payload.nptrs));
+
+            StgClosure *val = UNTAG_CLOSURE(selectee->payload[field]);
+
+            // `val` is the value of this selector thunk. We need to check a
+            // few cases:
+            //
+            // - If `val` is in the moving heap, we stop here and update the
+            //   chain. All updated objects should be added to the mut_list.
+            //   (TODO (osa): What happens if the value is evacuated as we do
+            //   this?)
+            //
+            // - If `val` is in the non-moving heap, we check if it's also a
+            //   selector. If it is we add it to the chain and loop.
+
+            // Follow indirections. Variables to update: `val`.
+        val_changed:
+            if (!isNonmovingClosure(val)) {
+                // The selected value is a moving object, so we won't be
+                // updating the chain to this object.
+                unlockClosure(p, selector_info_ptr);
+                update_selector_chain(chain, origin, p0, p);
+                return p;
+            }
+
+            switch (get_itbl(val)->type) {
+            case IND:
+            case IND_STATIC:
+                ;
+                // Follow the indirection
+                StgClosure *indirectee = UNTAG_CLOSURE(((StgInd*)val)->indirectee);
+                if (isNonmovingClosure(indirectee)) {
+                    val = UNTAG_CLOSURE(((StgInd*)val)->indirectee);
+                    goto val_changed;
+                } else {
+                    unlockClosure(p, selector_info_ptr);
+                    update_selector_chain(chain, origin, p0, p);
+                    return p;
+                }
+            case THUNK_SELECTOR:
+                // Value of the selector thunk is again a selector thunk in the
+                // non-moving heap. Add the current selector to the chain and
+                // loop.
+                p->payload[0] = chain;
+                chain = p;
+                p = val;
+                goto selector_changed;
+            default:
+                // Found a value, add the current selector to the chain and
+                // update it.
+                p->payload[0] = chain;
+                chain = p;
+                update_selector_chain(chain, origin, p0, val);
+                return val;
+            }
+        }
+
+        case IND:
+        case IND_STATIC: {
+            StgClosure *indirectee = UNTAG_CLOSURE(((StgInd *)selectee)->indirectee);
+            if (isNonmovingClosure(indirectee)) {
+                selectee = indirectee;
+                goto selectee_changed;
+            } else {
+                unlockClosure(p, selector_info_ptr);
+                update_selector_chain(chain, origin, p0, p);
+                return p;
+            }
+        }
+
+        case BLACKHOLE: {
+            StgClosure *indirectee = ((StgInd*)selectee)->indirectee;
+
+            if (!isNonmovingClosure(UNTAG_CLOSURE(indirectee))) {
+                unlockClosure(p, selector_info_ptr);
+                update_selector_chain(chain, origin, p0, p);
+                return p;
+            }
+
+            // Establish whether this BH has been updated, and is now an
+            // indirection, as in evacuate().
+            if (GET_CLOSURE_TAG(indirectee) == 0) {
+                const StgInfoTable *i = indirectee->header.info;
+                if (i == &stg_TSO_info
+                    || i == &stg_WHITEHOLE_info
+                    || i == &stg_BLOCKING_QUEUE_CLEAN_info
+                    || i == &stg_BLOCKING_QUEUE_DIRTY_info) {
+                    unlockClosure(p, selector_info_ptr);
+                    update_selector_chain(chain, origin, p0, p);
+                    return (StgClosure*)p;
+                }
+                ASSERT(i != &stg_IND_info); // TODO not sure about this part
+            }
+
+            // It's an indirection, follow it.
+            selectee = UNTAG_CLOSURE(indirectee);
+            goto selectee_changed;
+        }
+
+        case AP:
+        case AP_STACK:
+        case THUNK:
+        case THUNK_1_0:
+        case THUNK_0_1:
+        case THUNK_2_0:
+        case THUNK_1_1:
+        case THUNK_0_2:
+        case THUNK_STATIC: {
+            // Not evaluated yet
+            unlockClosure(p, selector_info_ptr);
+            update_selector_chain(chain, origin, p0, p);
+            return (StgClosure*)p;
+        }
+
+        case THUNK_SELECTOR: {
+            // Selectee is a selector thunk. Evaluate it if we haven't reached
+            // the recursion limit yet.
+            if (depth < MAX_THUNK_SELECTOR_DEPTH) {
+                StgClosure *new_selectee =
+                    UNTAG_CLOSURE(nonmoving_eval_thunk_selector_(
+                                queue, (StgSelector*)selectee, NULL, depth+1));
+                ASSERT(isNonmovingClosure(new_selectee));
+                if (selectee == new_selectee) {
+                    unlockClosure(p, selector_info_ptr);
+                    update_selector_chain(chain, origin, p0, p);
+                    return (StgClosure*)p;
+                } else {
+                    selectee = new_selectee;
+                    goto selectee_changed;
+                }
+            } else {
+                // Recursion limit reached
+                unlockClosure(p, selector_info_ptr);
+                update_selector_chain(chain, origin, p0, p);
+                return (StgClosure*)p;
+            }
+        }
+
+        default: {
+            barf("nonmoving_eval_thunk_selector: strange selectee %d",
+                 (int)(selectee_info_tbl->type));
+        }
+    }
+}
+
+void
+nonmoving_eval_thunk_selector(MarkQueue *queue, StgSelector *p, StgClosure **origin)
+{
+    nonmoving_eval_thunk_selector_(queue, p, origin, 0);
+}
diff --git a/rts/sm/NonMovingShortcut.h b/rts/sm/NonMovingShortcut.h
new file mode 100644
index 0000000000000000000000000000000000000000..72297884aacdc58eccb7734e38b0ae38123cc329
--- /dev/null
+++ b/rts/sm/NonMovingShortcut.h
@@ -0,0 +1,17 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2019
+ *
+ * Non-moving garbage collector and allocator:
+ * Indirection short-cutting and the selector optimisation
+ *
+ * ---------------------------------------------------------------------------*/
+
+#pragma once
+
+#include "BeginPrivate.h"
+
+void
+nonmoving_eval_thunk_selector(MarkQueue *queue, StgSelector *p, StgClosure **origin);
+
+#include "EndPrivate.h"
diff --git a/rts/sm/NonMovingSweep.c b/rts/sm/NonMovingSweep.c
new file mode 100644
index 0000000000000000000000000000000000000000..cf5fcd70d7650225dfc3ee86a81d8425c1b0b699
--- /dev/null
+++ b/rts/sm/NonMovingSweep.c
@@ -0,0 +1,402 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2018
+ *
+ * Non-moving garbage collector and allocator: Sweep phase
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+#include "NonMovingSweep.h"
+#include "NonMoving.h"
+#include "NonMovingMark.h" // for nonmovingIsAlive
+#include "Capability.h"
+#include "GCThread.h" // for GCUtils.h
+#include "GCUtils.h"
+#include "Storage.h"
+#include "Trace.h"
+#include "StableName.h"
+#include "CNF.h" // compactFree
+
+// On which list should a particular segment be placed?
+enum SweepResult {
+    SEGMENT_FREE,     // segment is empty: place on free list
+    SEGMENT_PARTIAL,  // segment is partially filled: place on active list
+    SEGMENT_FILLED    // segment is full: place on filled list
+};
+
+// Determine which list a marked segment should be placed on and initialize
+// next_free indices as appropriate.
+GNUC_ATTR_HOT static enum SweepResult
+nonmovingSweepSegment(struct NonmovingSegment *seg)
+{
+    bool found_free = false;
+    bool found_live = false;
+
+    for (nonmoving_block_idx i = 0;
+         i < nonmovingSegmentBlockCount(seg);
+         ++i)
+    {
+        if (seg->bitmap[i] == nonmovingMarkEpoch) {
+            found_live = true;
+        } else if (!found_free) {
+            found_free = true;
+            seg->next_free = i;
+            nonmovingSegmentInfo(seg)->next_free_snap = i;
+            Bdescr((P_)seg)->u.scan = (P_)nonmovingSegmentGetBlock(seg, i);
+            seg->bitmap[i] = 0;
+        } else {
+            seg->bitmap[i] = 0;
+        }
+
+        if (found_free && found_live) {
+            // zero the remaining dead objects' mark bits
+            for (; i < nonmovingSegmentBlockCount(seg); ++i) {
+                if (seg->bitmap[i] != nonmovingMarkEpoch) {
+                    seg->bitmap[i] = 0;
+                }
+            }
+            return SEGMENT_PARTIAL;
+        }
+    }
+
+    if (found_live) {
+        return SEGMENT_FILLED;
+    } else {
+        ASSERT(seg->next_free == 0);
+        ASSERT(nonmovingSegmentInfo(seg)->next_free_snap == 0);
+        return SEGMENT_FREE;
+    }
+}
+
+#if defined(DEBUG)
+
+void nonmovingGcCafs()
+{
+    uint32_t i = 0;
+    StgIndStatic *next;
+
+    for (StgIndStatic *caf = debug_caf_list_snapshot;
+         caf != (StgIndStatic*) END_OF_CAF_LIST;
+         caf = next)
+    {
+        next = (StgIndStatic*)caf->saved_info;
+
+        const StgInfoTable *info = get_itbl((StgClosure*)caf);
+        ASSERT(info->type == IND_STATIC);
+
+        StgWord flag = ((StgWord) caf->static_link) & STATIC_BITS;
+        if (flag != 0 && flag != static_flag) {
+            debugTrace(DEBUG_gccafs, "CAF gc'd at 0x%p", caf);
+            SET_INFO((StgClosure*)caf, &stg_GCD_CAF_info); // stub it
+        } else {
+            // CAF is alive, move it back to the debug_caf_list
+            ++i;
+            debugTrace(DEBUG_gccafs, "CAF alive at 0x%p", caf);
+            ACQUIRE_SM_LOCK; // debug_caf_list is global, locked by sm_mutex
+            caf->saved_info = (const StgInfoTable*)debug_caf_list;
+            debug_caf_list = caf;
+            RELEASE_SM_LOCK;
+        }
+    }
+
+    debugTrace(DEBUG_gccafs, "%d CAFs live", i);
+    debug_caf_list_snapshot = (StgIndStatic*)END_OF_CAF_LIST;
+}
+
+static void
+clear_segment(struct NonmovingSegment* seg)
+{
+    size_t end = ((size_t)seg) + NONMOVING_SEGMENT_SIZE;
+    memset(&seg->bitmap, 0, end - (size_t)&seg->bitmap);
+}
+
+static void
+clear_segment_free_blocks(struct NonmovingSegment* seg)
+{
+    unsigned int block_size = nonmovingSegmentBlockSize(seg);
+    for (unsigned int p_idx = 0; p_idx < nonmovingSegmentBlockCount(seg); ++p_idx) {
+        // after mark, so bit not set == dead
+        if (nonmovingGetMark(seg, p_idx) == 0) {
+            memset(nonmovingSegmentGetBlock(seg, p_idx), 0, block_size);
+        }
+    }
+}
+
+#endif
+
+GNUC_ATTR_HOT void nonmovingSweep(void)
+{
+    while (nonmovingHeap.sweep_list) {
+        struct NonmovingSegment *seg = nonmovingHeap.sweep_list;
+
+        // Pushing the segment onto one of the free/active/filled lists
+        // updates its link field, so advance sweep_list first
+        nonmovingHeap.sweep_list = seg->link;
+
+        enum SweepResult ret = nonmovingSweepSegment(seg);
+
+        switch (ret) {
+        case SEGMENT_FREE:
+            IF_DEBUG(sanity, clear_segment(seg));
+            nonmovingPushFreeSegment(seg);
+            break;
+        case SEGMENT_PARTIAL:
+            IF_DEBUG(sanity, clear_segment_free_blocks(seg));
+            nonmovingPushActiveSegment(seg);
+            break;
+        case SEGMENT_FILLED:
+            nonmovingPushFilledSegment(seg);
+            break;
+        default:
+            barf("nonmovingSweep: weird sweep return: %d\n", ret);
+        }
+    }
+}
+
+/* Must a closure remain on the mutable list?
+ *
+ * A closure must remain if any of the following applies:
+ *
+ *  1. it contains references to a younger generation
+ *  2. it's a mutable closure (e.g. mutable array or MUT_PRIM)
+ *
+ * is_closure_clean returns true when the closure can safely be dropped from
+ * the mutable list.
+ */
+static bool is_closure_clean(StgClosure *p)
+{
+    const StgInfoTable *info = get_itbl(p);
+
+#define CLEAN(ptr) (!HEAP_ALLOCED((StgClosure*) ptr) || Bdescr((StgPtr) ptr)->gen == oldest_gen)
+
+    switch (info->type) {
+    case MVAR_CLEAN:
+    case MVAR_DIRTY:
+    {
+        StgMVar *mvar = ((StgMVar *)p);
+        if (!CLEAN(mvar->head)) goto dirty_MVAR;
+        if (!CLEAN(mvar->tail)) goto dirty_MVAR;
+        if (!CLEAN(mvar->value)) goto dirty_MVAR;
+        mvar->header.info = &stg_MVAR_CLEAN_info;
+        return true;
+
+dirty_MVAR:
+        mvar->header.info = &stg_MVAR_DIRTY_info;
+        return false;
+    }
+
+    case TVAR:
+    {
+        StgTVar *tvar = ((StgTVar *)p);
+        if (!CLEAN(tvar->current_value)) goto dirty_TVAR;
+        if (!CLEAN(tvar->first_watch_queue_entry)) goto dirty_TVAR;
+        tvar->header.info = &stg_TVAR_CLEAN_info;
+        return true;
+
+dirty_TVAR:
+        tvar->header.info = &stg_TVAR_DIRTY_info;
+        return false;
+    }
+
+    case THUNK:
+    case THUNK_1_0:
+    case THUNK_0_1:
+    case THUNK_1_1:
+    case THUNK_0_2:
+    case THUNK_2_0:
+    {
+        StgPtr end = (StgPtr)((StgThunk *)p)->payload + info->layout.payload.ptrs;
+        for (StgPtr q = (StgPtr)((StgThunk *)p)->payload; q < end; q++) {
+            if (!CLEAN(*q)) return false;
+        }
+        return true;
+    }
+
+    case FUN:
+    case FUN_1_0:                       // hardly worth specialising these guys
+    case FUN_0_1:
+    case FUN_1_1:
+    case FUN_0_2:
+    case FUN_2_0:
+    case CONSTR:
+    case CONSTR_NOCAF:
+    case CONSTR_1_0:
+    case CONSTR_0_1:
+    case CONSTR_1_1:
+    case CONSTR_0_2:
+    case CONSTR_2_0:
+    case PRIM:
+    {
+        StgPtr end = (StgPtr)((StgClosure *)p)->payload + info->layout.payload.ptrs;
+        for (StgPtr q = (StgPtr)((StgClosure *)p)->payload; q < end; q++) {
+            if (!CLEAN(*q)) return false;
+        }
+        return true;
+    }
+
+    case WEAK:
+        return false; // TODO
+
+    case MUT_VAR_CLEAN:
+    case MUT_VAR_DIRTY:
+        if (!CLEAN(((StgMutVar *)p)->var)) {
+            p->header.info = &stg_MUT_VAR_DIRTY_info;
+            return false;
+        } else {
+            p->header.info = &stg_MUT_VAR_CLEAN_info;
+            return true;
+        }
+
+    case BLOCKING_QUEUE:
+    {
+        StgBlockingQueue *bq = (StgBlockingQueue *)p;
+
+        if (!CLEAN(bq->bh)) goto dirty_BLOCKING_QUEUE;
+        if (!CLEAN(bq->owner)) goto dirty_BLOCKING_QUEUE;
+        if (!CLEAN(bq->queue)) goto dirty_BLOCKING_QUEUE;
+        if (!CLEAN(bq->link)) goto dirty_BLOCKING_QUEUE;
+        bq->header.info = &stg_BLOCKING_QUEUE_CLEAN_info;
+        return true;
+
+dirty_BLOCKING_QUEUE:
+        bq->header.info = &stg_BLOCKING_QUEUE_DIRTY_info;
+        return false;
+    }
+
+    case THUNK_SELECTOR:
+        return CLEAN(((StgSelector *) p)->selectee);
+
+    case ARR_WORDS:
+        return true;
+
+    default:
+        // TODO: the rest
+        return false;
+    }
+#undef CLEAN
+}
+
+/* N.B. This happens during the pause so we own all capabilities. */
+void nonmovingSweepMutLists()
+{
+    for (uint32_t n = 0; n < n_capabilities; n++) {
+        Capability *cap = capabilities[n];
+        bdescr *old_mut_list = cap->mut_lists[oldest_gen->no];
+        cap->mut_lists[oldest_gen->no] = allocBlockOnNode_sync(cap->node);
+        for (bdescr *bd = old_mut_list; bd; bd = bd->link) {
+            for (StgPtr p = bd->start; p < bd->free; p++) {
+                StgClosure **q = (StgClosure**)p;
+                if (nonmovingIsAlive(*q) && !is_closure_clean(*q)) {
+                    recordMutableCap(*q, cap, oldest_gen->no);
+                }
+            }
+        }
+        freeChain_lock(old_mut_list);
+    }
+}
+
+/* A variant of freeChain_lock that periodically releases the lock (roughly
+ * every max_dur freed blocks) to ensure that we don't starve other lock users
+ * (e.g. the mutator).
+ */
+static void freeChain_lock_max(bdescr *bd, int max_dur)
+{
+  ACQUIRE_SM_LOCK;
+  bdescr *next_bd;
+  int i = 0;
+  while (bd != NULL) {
+    next_bd = bd->link;
+    freeGroup(bd);
+    bd = next_bd;
+    if (i == max_dur) {
+        RELEASE_SM_LOCK;
+        yieldThread();
+        ACQUIRE_SM_LOCK;
+        i = 0;
+    }
+    i++;
+  }
+  RELEASE_SM_LOCK;
+}
+
+void nonmovingSweepLargeObjects()
+{
+    freeChain_lock_max(nonmoving_large_objects, 10000);
+    nonmoving_large_objects = nonmoving_marked_large_objects;
+    n_nonmoving_large_blocks = n_nonmoving_marked_large_blocks;
+    nonmoving_marked_large_objects = NULL;
+    n_nonmoving_marked_large_blocks = 0;
+}
+
+void nonmovingSweepCompactObjects()
+{
+    bdescr *next;
+    ACQUIRE_SM_LOCK;
+    for (bdescr *bd = nonmoving_compact_objects; bd; bd = next) {
+        next = bd->link;
+        compactFree(((StgCompactNFDataBlock*)bd->start)->owner);
+    }
+    RELEASE_SM_LOCK;
+    nonmoving_compact_objects = nonmoving_marked_compact_objects;
+    n_nonmoving_compact_blocks = n_nonmoving_marked_compact_blocks;
+    nonmoving_marked_compact_objects = NULL;
+    n_nonmoving_marked_compact_blocks = 0;
+}
+
+// Helper for nonmovingSweepStableNameTable. Essentially nonmovingIsAlive,
+// but also safe to call on objects that lived (and may have died) in the
+// moving heap; see Note [Sweeping stable names in the concurrent collector].
+static bool is_alive(StgClosure *p)
+{
+    if (!HEAP_ALLOCED_GC(p)) {
+        return true;
+    }
+
+    if (nonmovingClosureBeingSwept(p)) {
+        return nonmovingIsAlive(p);
+    } else {
+        // We don't want to sweep any stable names which weren't in the
+        // set of segments that we swept.
+        // See Note [Sweeping stable names in the concurrent collector]
+        return true;
+    }
+}
+
+void nonmovingSweepStableNameTable()
+{
+    // See comments in gcStableTables
+
+    /* Note [Sweeping stable names in the concurrent collector]
+     * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+     *
+     * When collecting concurrently we need to take care to avoid freeing
+     * stable names that we didn't sweep this collection cycle. For instance,
+     * consider the following situation:
+     *
+     *  1. We take a snapshot and start collection
+     *  2. A mutator allocates a new object, then makes a stable name for it
+     *  3. The mutator performs a minor GC and promotes the new object to the nonmoving heap
+     *  4. The GC thread gets to the sweep phase and, when traversing the stable
+     *     name table, finds the new object unmarked. It then assumes that the
+     *     object is dead and removes the stable name from the stable name table.
+     *
+     * To avoid this, is_alive (above) only consults the nonmoving mark state
+     * for objects in the set of segments being swept; anything else is
+     * conservatively treated as alive.
+     */
+
+    // FIXME: We can't use nonmovingIsAlive here without first using isAlive:
+    // a stable name can die during moving heap collection and we can't use
+    // nonmovingIsAlive on those objects. Inefficient.
+
+    stableNameLock();
+    FOR_EACH_STABLE_NAME(
+        p, {
+            if (p->sn_obj != NULL) {
+                if (!is_alive((StgClosure*)p->sn_obj)) {
+                    p->sn_obj = NULL; // Just to make an assertion happy
+                    freeSnEntry(p);
+                } else if (p->addr != NULL) {
+                    if (!is_alive((StgClosure*)p->addr)) {
+                        p->addr = NULL;
+                    }
+                }
+            }
+        });
+    stableNameUnlock();
+}
diff --git a/rts/sm/NonMovingSweep.h b/rts/sm/NonMovingSweep.h
new file mode 100644
index 0000000000000000000000000000000000000000..24e9eccd5e77b6482995b09841aabea6e5b25b03
--- /dev/null
+++ b/rts/sm/NonMovingSweep.h
@@ -0,0 +1,31 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2018
+ *
+ * Non-moving garbage collector and allocator: Sweep phase
+ *
+ * ---------------------------------------------------------------------------*/
+
+#pragma once
+
+#include "NonMoving.h"
+#include "Hash.h"
+
+GNUC_ATTR_HOT void nonmovingSweep(void);
+
+// Remove unmarked entries in oldest generation mut_lists
+void nonmovingSweepMutLists(void);
+
+// Remove unmarked entries in oldest generation scavenged_large_objects list
+void nonmovingSweepLargeObjects(void);
+
+// Remove unmarked entries in oldest generation compact_objects list
+void nonmovingSweepCompactObjects(void);
+
+// Remove dead entries in the stable name table
+void nonmovingSweepStableNameTable(void);
+
+#if defined(DEBUG)
+// The non-moving equivalent of the moving collector's gcCAFs.
+void nonmovingGcCafs(void);
+#endif
diff --git a/rts/sm/Sanity.c b/rts/sm/Sanity.c
index 3585bd93b4fabb9768900505b95ec146a43528f8..23f0fc57b4ad99b427ce0f68951653d1a0f38422 100644
--- a/rts/sm/Sanity.c
+++ b/rts/sm/Sanity.c
@@ -29,6 +29,8 @@
 #include "Arena.h"
 #include "RetainerProfile.h"
 #include "CNF.h"
+#include "sm/NonMoving.h"
+#include "sm/NonMovingMark.h"
 #include "Profiling.h" // prof_arena
 
 /* -----------------------------------------------------------------------------
@@ -40,6 +42,9 @@ static void  checkLargeBitmap    ( StgPtr payload, StgLargeBitmap*, uint32_t );
 static void  checkClosureShallow ( const StgClosure * );
 static void  checkSTACK          (StgStack *stack);
 
+static W_    countNonMovingSegments ( struct NonmovingSegment *segs );
+static W_    countNonMovingHeap     ( struct NonmovingHeap *heap );
+
 /* -----------------------------------------------------------------------------
    Check stack sanity
    -------------------------------------------------------------------------- */
@@ -478,6 +483,41 @@ void checkHeapChain (bdescr *bd)
     }
 }
 
+/* -----------------------------------------------------------------------------
+ * Check nonmoving heap sanity
+ *
+ * After a concurrent sweep the nonmoving heap can be checked for validity.
+ * -------------------------------------------------------------------------- */
+
+static void checkNonmovingSegments (struct NonmovingSegment *seg)
+{
+    while (seg != NULL) {
+        const nonmoving_block_idx count = nonmovingSegmentBlockCount(seg);
+        for (nonmoving_block_idx i=0; i < count; i++) {
+            if (seg->bitmap[i] == nonmovingMarkEpoch) {
+                StgPtr p = nonmovingSegmentGetBlock(seg, i);
+                checkClosure((StgClosure *) p);
+            } else if (i < nonmovingSegmentInfo(seg)->next_free_snap) {
+                seg->bitmap[i] = 0;
+            }
+        }
+        seg = seg->link;
+    }
+}
+
+void checkNonmovingHeap (const struct NonmovingHeap *heap)
+{
+    for (unsigned int i=0; i < NONMOVING_ALLOCA_CNT; i++) {
+        const struct NonmovingAllocator *alloc = heap->allocators[i];
+        checkNonmovingSegments(alloc->filled);
+        checkNonmovingSegments(alloc->active);
+        for (unsigned int cap=0; cap < n_capabilities; cap++) {
+            checkNonmovingSegments(alloc->current[cap]);
+        }
+    }
+}
+
+
 void
 checkHeapChunk(StgPtr start, StgPtr end)
 {
@@ -632,9 +672,9 @@ checkGlobalTSOList (bool checkTSOs)
 
               stack = tso->stackobj;
               while (1) {
-                  if (stack->dirty & 1) {
-                      ASSERT(Bdescr((P_)stack)->gen_no == 0 || (stack->dirty & TSO_MARKED));
-                      stack->dirty &= ~TSO_MARKED;
+                  if (stack->dirty & STACK_DIRTY) {
+                      ASSERT(Bdescr((P_)stack)->gen_no == 0 || (stack->dirty & STACK_SANE));
+                      stack->dirty &= ~STACK_SANE;
                   }
                   frame = (StgUnderflowFrame*) (stack->stack + stack->stack_size
                                                 - sizeofW(StgUnderflowFrame));
@@ -669,7 +709,7 @@ checkMutableList( bdescr *mut_bd, uint32_t gen )
                 ((StgTSO *)p)->flags |= TSO_MARKED;
                 break;
             case STACK:
-                ((StgStack *)p)->dirty |= TSO_MARKED;
+                ((StgStack *)p)->dirty |= STACK_SANE;
                 break;
             }
         }
@@ -766,16 +806,42 @@ static void checkGeneration (generation *gen,
     uint32_t n;
     gen_workspace *ws;
 
-    ASSERT(countBlocks(gen->blocks) == gen->n_blocks);
+    //ASSERT(countBlocks(gen->blocks) == gen->n_blocks);
     ASSERT(countBlocks(gen->large_objects) == gen->n_large_blocks);
 
 #if defined(THREADED_RTS)
-    // heap sanity checking doesn't work with SMP, because we can't
-    // zero the slop (see Updates.h).  However, we can sanity-check
-    // the heap after a major gc, because there is no slop.
+    // heap sanity checking doesn't work with SMP for two reasons:
+    //   * we can't zero the slop (see Updates.h).  However, we can sanity-check
+    //     the heap after a major gc, because there is no slop.
+    //
+    //   * the nonmoving collector may be mutating its large object lists, unless we
+    //     were in fact called by the nonmoving collector.
     if (!after_major_gc) return;
 #endif
 
+    if (RtsFlags.GcFlags.useNonmoving && gen == oldest_gen) {
+        ASSERT(countNonMovingSegments(nonmovingHeap.free) == (W_) nonmovingHeap.n_free * NONMOVING_SEGMENT_BLOCKS);
+        ASSERT(countBlocks(nonmoving_large_objects) == n_nonmoving_large_blocks);
+        ASSERT(countBlocks(nonmoving_marked_large_objects) == n_nonmoving_marked_large_blocks);
+
+        // Compact regions
+        // Accounting here is tricky due to the fact that the CNF allocation
+        // code modifies generation->n_compact_blocks directly. However, most
+        // objects being swept by the nonmoving GC are tracked in
+        // nonmoving_*_compact_objects. Consequently we can only maintain a very loose
+        // sanity invariant here.
+        uint32_t counted_cnf_blocks = 0;
+        counted_cnf_blocks += countCompactBlocks(nonmoving_marked_compact_objects);
+        counted_cnf_blocks += countCompactBlocks(nonmoving_compact_objects);
+        counted_cnf_blocks += countCompactBlocks(oldest_gen->compact_objects);
+
+        uint32_t total_cnf_blocks = 0;
+        total_cnf_blocks += n_nonmoving_compact_blocks + oldest_gen->n_compact_blocks;
+        total_cnf_blocks += n_nonmoving_marked_compact_blocks;
+
+        ASSERT(counted_cnf_blocks == total_cnf_blocks);
+    }
+
     checkHeapChain(gen->blocks);
 
     for (n = 0; n < n_capabilities; n++) {
@@ -824,6 +890,15 @@ markCompactBlocks(bdescr *bd)
     }
 }
 
+static void
+markNonMovingSegments(struct NonmovingSegment *seg)
+{
+    while (seg) {
+        markBlocks(Bdescr((P_)seg));
+        seg = seg->link;
+    }
+}
+
 // If memInventory() calculates that we have a memory leak, this
 // function will try to find the block(s) that are leaking by marking
 // all the ones that we know about, and search through memory to find
@@ -834,7 +909,7 @@ markCompactBlocks(bdescr *bd)
 static void
 findMemoryLeak (void)
 {
-    uint32_t g, i;
+    uint32_t g, i, j;
     for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
         for (i = 0; i < n_capabilities; i++) {
             markBlocks(capabilities[i]->mut_lists[g]);
@@ -854,6 +929,27 @@ findMemoryLeak (void)
     for (i = 0; i < n_capabilities; i++) {
         markBlocks(gc_threads[i]->free_blocks);
         markBlocks(capabilities[i]->pinned_object_block);
+        markBlocks(capabilities[i]->upd_rem_set.queue.blocks);
+    }
+
+    if (RtsFlags.GcFlags.useNonmoving) {
+        markBlocks(upd_rem_set_block_list);
+        markBlocks(nonmoving_large_objects);
+        markBlocks(nonmoving_marked_large_objects);
+        markBlocks(nonmoving_compact_objects);
+        markBlocks(nonmoving_marked_compact_objects);
+        for (i = 0; i < NONMOVING_ALLOCA_CNT; i++) {
+            struct NonmovingAllocator *alloc = nonmovingHeap.allocators[i];
+            markNonMovingSegments(alloc->filled);
+            markNonMovingSegments(alloc->active);
+            for (j = 0; j < n_capabilities; j++) {
+                markNonMovingSegments(alloc->current[j]);
+            }
+        }
+        markNonMovingSegments(nonmovingHeap.sweep_list);
+        markNonMovingSegments(nonmovingHeap.free);
+        if (current_mark_queue)
+            markBlocks(current_mark_queue->blocks);
     }
 
 #if defined(PROFILING)
@@ -914,14 +1010,65 @@ void findSlop(bdescr *bd)
 static W_
 genBlocks (generation *gen)
 {
-    ASSERT(countBlocks(gen->blocks) == gen->n_blocks);
+    W_ ret = 0;
+    if (RtsFlags.GcFlags.useNonmoving && gen == oldest_gen) {
+        // See Note [Live data accounting in nonmoving collector].
+        ASSERT(countNonMovingHeap(&nonmovingHeap) == gen->n_blocks);
+        ret += countAllocdBlocks(nonmoving_large_objects);
+        ret += countAllocdBlocks(nonmoving_marked_large_objects);
+        ret += countAllocdCompactBlocks(nonmoving_compact_objects);
+        ret += countAllocdCompactBlocks(nonmoving_marked_compact_objects);
+        ret += countNonMovingHeap(&nonmovingHeap);
+        if (current_mark_queue)
+            ret += countBlocks(current_mark_queue->blocks);
+    } else {
+        ASSERT(countBlocks(gen->blocks) == gen->n_blocks);
+        ASSERT(countCompactBlocks(gen->compact_objects) == gen->n_compact_blocks);
+        ASSERT(countCompactBlocks(gen->compact_blocks_in_import) == gen->n_compact_blocks_in_import);
+        ret += gen->n_blocks;
+    }
+
     ASSERT(countBlocks(gen->large_objects) == gen->n_large_blocks);
-    ASSERT(countCompactBlocks(gen->compact_objects) == gen->n_compact_blocks);
-    ASSERT(countCompactBlocks(gen->compact_blocks_in_import) == gen->n_compact_blocks_in_import);
-    return gen->n_blocks + gen->n_old_blocks +
+
+    ret += gen->n_old_blocks +
         countAllocdBlocks(gen->large_objects) +
         countAllocdCompactBlocks(gen->compact_objects) +
         countAllocdCompactBlocks(gen->compact_blocks_in_import);
+    return ret;
+}
+
+static W_
+countNonMovingSegments(struct NonmovingSegment *segs)
+{
+    W_ ret = 0;
+    while (segs) {
+        ret += countBlocks(Bdescr((P_)segs));
+        segs = segs->link;
+    }
+    return ret;
+}
+
+static W_
+countNonMovingAllocator(struct NonmovingAllocator *alloc)
+{
+    W_ ret = countNonMovingSegments(alloc->filled)
+           + countNonMovingSegments(alloc->active);
+    for (uint32_t i = 0; i < n_capabilities; ++i) {
+        ret += countNonMovingSegments(alloc->current[i]);
+    }
+    return ret;
+}
+
+static W_
+countNonMovingHeap(struct NonmovingHeap *heap)
+{
+    W_ ret = 0;
+    for (int alloc_idx = 0; alloc_idx < NONMOVING_ALLOCA_CNT; alloc_idx++) {
+        ret += countNonMovingAllocator(heap->allocators[alloc_idx]);
+    }
+    ret += countNonMovingSegments(heap->sweep_list);
+    ret += countNonMovingSegments(heap->free);
+    return ret;
 }
 
 void
@@ -929,11 +1076,20 @@ memInventory (bool show)
 {
   uint32_t g, i;
   W_ gen_blocks[RtsFlags.GcFlags.generations];
-  W_ nursery_blocks, retainer_blocks,
-      arena_blocks, exec_blocks, gc_free_blocks = 0;
+  W_ nursery_blocks = 0, retainer_blocks = 0,
+      arena_blocks = 0, exec_blocks = 0, gc_free_blocks = 0,
+      upd_rem_set_blocks = 0;
   W_ live_blocks = 0, free_blocks = 0;
   bool leak;
 
+#if defined(THREADED_RTS)
+  // Can't easily do a memory inventory: We might race with the nonmoving
+  // collector. In principle we could try to take nonmoving_collection_mutex
+  // and do an inventory while holding it, but we don't currently implement this.
+  if (RtsFlags.GcFlags.useNonmoving)
+    return;
+#endif
+
   // count the blocks we current have
 
   for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
@@ -947,20 +1103,19 @@ memInventory (bool show)
       gen_blocks[g] += genBlocks(&generations[g]);
   }
 
-  nursery_blocks = 0;
   for (i = 0; i < n_nurseries; i++) {
       ASSERT(countBlocks(nurseries[i].blocks) == nurseries[i].n_blocks);
       nursery_blocks += nurseries[i].n_blocks;
   }
   for (i = 0; i < n_capabilities; i++) {
-      gc_free_blocks += countBlocks(gc_threads[i]->free_blocks);
+      W_ n = countBlocks(gc_threads[i]->free_blocks);
+      gc_free_blocks += n;
       if (capabilities[i]->pinned_object_block != NULL) {
           nursery_blocks += capabilities[i]->pinned_object_block->blocks;
       }
       nursery_blocks += countBlocks(capabilities[i]->pinned_object_blocks);
   }
 
-  retainer_blocks = 0;
 #if defined(PROFILING)
   if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_RETAINER) {
       retainer_blocks = retainerStackBlocks();
@@ -976,12 +1131,19 @@ memInventory (bool show)
   /* count the blocks on the free list */
   free_blocks = countFreeList();
 
+  // count UpdRemSet blocks
+  for (i = 0; i < n_capabilities; ++i) {
+      upd_rem_set_blocks += countBlocks(capabilities[i]->upd_rem_set.queue.blocks);
+  }
+  upd_rem_set_blocks += countBlocks(upd_rem_set_block_list);
+
   live_blocks = 0;
   for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
       live_blocks += gen_blocks[g];
   }
   live_blocks += nursery_blocks +
-               + retainer_blocks + arena_blocks + exec_blocks + gc_free_blocks;
+               + retainer_blocks + arena_blocks + exec_blocks + gc_free_blocks
+               + upd_rem_set_blocks;
 
 #define MB(n) (((double)(n) * BLOCK_SIZE_W) / ((1024*1024)/sizeof(W_)))
 
@@ -1010,6 +1172,8 @@ memInventory (bool show)
                  gc_free_blocks, MB(gc_free_blocks));
       debugBelch("  free         : %5" FMT_Word " blocks (%6.1lf MB)\n",
                  free_blocks, MB(free_blocks));
+      debugBelch("  UpdRemSet    : %5" FMT_Word " blocks (%6.1lf MB)\n",
+                 upd_rem_set_blocks, MB(upd_rem_set_blocks));
       debugBelch("  total        : %5" FMT_Word " blocks (%6.1lf MB)\n",
                  live_blocks + free_blocks, MB(live_blocks+free_blocks));
       if (leak) {
diff --git a/rts/sm/Sanity.h b/rts/sm/Sanity.h
index 9227e6fd181b19c9d913394fc8c09a1034b85287..b6f2054383177985a3f08a0342f980f183e5dc34 100644
--- a/rts/sm/Sanity.h
+++ b/rts/sm/Sanity.h
@@ -31,6 +31,7 @@ void checkStaticObjects ( StgClosure* static_objects );
 void checkStackChunk    ( StgPtr sp, StgPtr stack_end );
 StgOffset checkStackFrame ( StgPtr sp );
 StgOffset checkClosure  ( const StgClosure* p );
+void checkNonmovingHeap ( const struct NonmovingHeap *heap );
 
 void checkRunQueue      (Capability *cap);
 
diff --git a/rts/sm/Scav.c b/rts/sm/Scav.c
index c486cd96c5867e79926dfbb2702732cdd8918b44..501d958aaef6d54c4c850ead0902060e8adffcc8 100644
--- a/rts/sm/Scav.c
+++ b/rts/sm/Scav.c
@@ -62,8 +62,8 @@
 #include "Hash.h"
 
 #include "sm/MarkWeak.h"
-
-static void scavenge_stack (StgPtr p, StgPtr stack_end);
+#include "sm/NonMoving.h" // for nonmoving_set_closure_mark_bit
+#include "sm/NonMovingScav.h"
 
 static void scavenge_large_bitmap (StgPtr p,
                                    StgLargeBitmap *large_bitmap,
@@ -76,6 +76,15 @@ static void scavenge_large_bitmap (StgPtr p,
 # define scavenge_block(a) scavenge_block1(a)
 # define scavenge_mutable_list(bd,g) scavenge_mutable_list1(bd,g)
 # define scavenge_capability_mut_lists(cap) scavenge_capability_mut_Lists1(cap)
+# define scavengeTSO(tso) scavengeTSO1(tso)
+# define scavenge_stack(p, stack_end) scavenge_stack1(p, stack_end)
+# define scavenge_fun_srt(info) scavenge_fun_srt1(info)
+# define scavenge_fun_srt(info) scavenge_fun_srt1(info)
+# define scavenge_mut_arr_ptrs(info) scavenge_mut_arr_ptrs1(info)
+# define scavenge_PAP(pap) scavenge_PAP1(pap)
+# define scavenge_AP(ap) scavenge_AP1(ap)
+# define scavenge_compact(str) scavenge_compact1(str)
 #endif
 
 static void do_evacuate(StgClosure **p, void *user STG_UNUSED)
@@ -87,7 +96,7 @@ static void do_evacuate(StgClosure **p, void *user STG_UNUSED)
    Scavenge a TSO.
    -------------------------------------------------------------------------- */
 
-static void
+void
 scavengeTSO (StgTSO *tso)
 {
     bool saved_eager;
@@ -165,7 +174,10 @@ evacuate_hash_entry(MapHashData *dat, StgWord key, const void *value)
     SET_GCT(old_gct);
 }
 
-static void
+/* Here we scavenge the sharing-preservation hash-table, which may contain keys
+ * living in from-space.
+ */
+void
 scavenge_compact(StgCompactNFData *str)
 {
     bool saved_eager;
@@ -198,7 +210,7 @@ scavenge_compact(StgCompactNFData *str)
    Mutable arrays of pointers
    -------------------------------------------------------------------------- */
 
-static StgPtr scavenge_mut_arr_ptrs (StgMutArrPtrs *a)
+StgPtr scavenge_mut_arr_ptrs (StgMutArrPtrs *a)
 {
     W_ m;
     bool any_failed;
@@ -348,14 +360,14 @@ scavenge_PAP_payload (StgClosure *fun, StgClosure **payload, StgWord size)
     return p;
 }
 
-STATIC_INLINE GNUC_ATTR_HOT StgPtr
+GNUC_ATTR_HOT StgPtr
 scavenge_PAP (StgPAP *pap)
 {
     evacuate(&pap->fun);
     return scavenge_PAP_payload (pap->fun, pap->payload, pap->n_args);
 }
 
-STATIC_INLINE StgPtr
+StgPtr
 scavenge_AP (StgAP *ap)
 {
     evacuate(&ap->fun);
@@ -366,7 +378,7 @@ scavenge_AP (StgAP *ap)
    Scavenge SRTs
    -------------------------------------------------------------------------- */
 
-STATIC_INLINE GNUC_ATTR_HOT void
+GNUC_ATTR_HOT void
 scavenge_thunk_srt(const StgInfoTable *info)
 {
     StgThunkInfoTable *thunk_info;
@@ -380,7 +392,7 @@ scavenge_thunk_srt(const StgInfoTable *info)
     }
 }
 
-STATIC_INLINE GNUC_ATTR_HOT void
+GNUC_ATTR_HOT void
 scavenge_fun_srt(const StgInfoTable *info)
 {
     StgFunInfoTable *fun_info;
@@ -1570,10 +1582,10 @@ static void
 scavenge_mutable_list(bdescr *bd, generation *gen)
 {
     StgPtr p, q;
-    uint32_t gen_no;
 
-    gen_no = gen->no;
+    uint32_t gen_no = gen->no;
     gct->evac_gen_no = gen_no;
+
     for (; bd != NULL; bd = bd->link) {
         for (q = bd->start; q < bd->free; q++) {
             p = (StgPtr)*q;
@@ -1648,7 +1660,10 @@ scavenge_mutable_list(bdescr *bd, generation *gen)
                 ;
             }
 
-            if (scavenge_one(p)) {
+            if (RtsFlags.GcFlags.useNonmoving && major_gc && gen == oldest_gen) {
+                // We can't use scavenge_one here as we need to scavenge SRTs
+                nonmovingScavengeOne((StgClosure *)p);
+            } else if (scavenge_one(p)) {
                 // didn't manage to promote everything, so put the
                 // object back on the list.
                 recordMutableGen_GC((StgClosure *)p,gen_no);
@@ -1660,7 +1675,14 @@ scavenge_mutable_list(bdescr *bd, generation *gen)
 void
 scavenge_capability_mut_lists (Capability *cap)
 {
-    uint32_t g;
+    // In a major GC only the nonmoving heap's mutable list is a root
+    if (RtsFlags.GcFlags.useNonmoving && major_gc) {
+        uint32_t g = oldest_gen->no;
+        scavenge_mutable_list(cap->saved_mut_lists[g], oldest_gen);
+        freeChain_sync(cap->saved_mut_lists[g]);
+        cap->saved_mut_lists[g] = NULL;
+        return;
+    }
 
     /* Mutable lists from each generation > N
      * we want to *scavenge* these roots, not evacuate them: they're not
@@ -1668,7 +1690,7 @@ scavenge_capability_mut_lists (Capability *cap)
      * Also do them in reverse generation order, for the usual reason:
      * namely to reduce the likelihood of spurious old->new pointers.
      */
-    for (g = RtsFlags.GcFlags.generations-1; g > N; g--) {
+    for (uint32_t g = RtsFlags.GcFlags.generations-1; g > N; g--) {
         scavenge_mutable_list(cap->saved_mut_lists[g], &generations[g]);
         freeChain_sync(cap->saved_mut_lists[g]);
         cap->saved_mut_lists[g] = NULL;
@@ -1795,7 +1817,7 @@ scavenge_large_bitmap( StgPtr p, StgLargeBitmap *large_bitmap, StgWord size )
    AP_STACK_UPDs, since these are just sections of copied stack.
    -------------------------------------------------------------------------- */
 
-static void
+void
 scavenge_stack(StgPtr p, StgPtr stack_end)
 {
   const StgRetInfoTable* info;
@@ -2038,6 +2060,16 @@ loop:
     for (g = RtsFlags.GcFlags.generations-1; g >= 0; g--) {
         ws = &gct->gens[g];
 
+        if (ws->todo_seg != END_NONMOVING_TODO_LIST) {
+            struct NonmovingSegment *seg = ws->todo_seg;
+            ASSERT(seg->todo_link);
+            ws->todo_seg = seg->todo_link;
+            seg->todo_link = NULL;
+            scavengeNonmovingSegment(seg);
+            did_something = true;
+            break;
+        }
+
         gct->scan_bd = NULL;
 
         // If we have a scan block with some work to do,
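
The scavenge-loop hunk above pops nonmoving segments off a per-workspace todo list through the intrusive todo_link field. The matching push is not part of this diff; assuming that a NULL todo_link means "not queued" (the pop asserts a non-NULL link, with END_NONMOVING_TODO_LIST terminating the list), it presumably looks roughly like the following sketch. The function name and exact invariants are assumptions, not the real RTS code:

    /* Hedged sketch only: enqueue a segment on a workspace's todo list.
     * 'todo_seg' and 'todo_link' are the names used in the diff above. */
    static void push_todo_segment(gen_workspace *ws, struct NonmovingSegment *seg)
    {
        if (seg->todo_link == NULL) {          // not already on any todo list
            seg->todo_link = ws->todo_seg;     // may be END_NONMOVING_TODO_LIST
            ws->todo_seg = seg;
        }
    }
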
diff --git a/rts/sm/Scav.h b/rts/sm/Scav.h
index 21ca691bff04d9d38d98aff47b3b7e0888dfe3d3..94250bcf7a1bca1f50d0bff699d34a6269cd1a85 100644
--- a/rts/sm/Scav.h
+++ b/rts/sm/Scav.h
@@ -17,10 +17,26 @@
 
 void    scavenge_loop (void);
 void    scavenge_capability_mut_lists (Capability *cap);
+void    scavengeTSO (StgTSO *tso);
+void    scavenge_stack (StgPtr p, StgPtr stack_end);
+void    scavenge_fun_srt (const StgInfoTable *info);
+void    scavenge_thunk_srt (const StgInfoTable *info);
+StgPtr  scavenge_mut_arr_ptrs (StgMutArrPtrs *a);
+StgPtr  scavenge_PAP (StgPAP *pap);
+StgPtr  scavenge_AP (StgAP *ap);
+void    scavenge_compact (StgCompactNFData *str);
 
 #if defined(THREADED_RTS)
 void    scavenge_loop1 (void);
 void    scavenge_capability_mut_Lists1 (Capability *cap);
+void    scavengeTSO1 (StgTSO *tso);
+void    scavenge_stack1 (StgPtr p, StgPtr stack_end);
+void    scavenge_fun_srt1 (const StgInfoTable *info);
+void    scavenge_thunk_srt1 (const StgInfoTable *info);
+StgPtr  scavenge_mut_arr_ptrs1 (StgMutArrPtrs *a);
+StgPtr  scavenge_PAP1 (StgPAP *pap);
+StgPtr  scavenge_AP1 (StgAP *ap);
+void    scavenge_compact1 (StgCompactNFData *str);
 #endif
 
 #include "EndPrivate.h"
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index 0130a08f7c4219596f1ed050bd5ffa8a7c07e705..f04b3c5929195a11e352d6d5efb58d351d31b696 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -29,6 +29,7 @@
 #include "Trace.h"
 #include "GC.h"
 #include "Evac.h"
+#include "NonMoving.h"
 #if defined(ios_HOST_OS)
 #include "Hash.h"
 #endif
@@ -82,7 +83,7 @@ Mutex sm_mutex;
 static void allocNurseries (uint32_t from, uint32_t to);
 static void assignNurseriesToCapabilities (uint32_t from, uint32_t to);
 
-static void
+void
 initGeneration (generation *gen, int g)
 {
     gen->no = g;
@@ -170,6 +171,18 @@ initStorage (void)
   }
   oldest_gen->to = oldest_gen;
 
+  // The nonmoving heap uses oldest_gen, so initialize it only after oldest_gen is set up
+  nonmovingInit();
+
+#if defined(THREADED_RTS)
+  // nonmovingAddCapabilities allocates segments, which requires taking the gc
+  // sync lock, so initialize it before nonmovingAddCapabilities
+  initSpinLock(&gc_alloc_block_sync);
+#endif
+
+  if (RtsFlags.GcFlags.useNonmoving)
+      nonmovingAddCapabilities(n_capabilities);
+
   /* The oldest generation has one step. */
   if (RtsFlags.GcFlags.compact || RtsFlags.GcFlags.sweep) {
       if (RtsFlags.GcFlags.generations == 1) {
@@ -195,9 +208,6 @@ initStorage (void)
 
   exec_block = NULL;
 
-#if defined(THREADED_RTS)
-  initSpinLock(&gc_alloc_block_sync);
-#endif
   N = 0;
 
   for (n = 0; n < n_numa_nodes; n++) {
@@ -271,6 +281,14 @@ void storageAddCapabilities (uint32_t from, uint32_t to)
         }
     }
 
+    // Initialize NonmovingAllocators and UpdRemSets
+    if (RtsFlags.GcFlags.useNonmoving) {
+        nonmovingAddCapabilities(to);
+        for (i = 0; i < to; ++i) {
+            init_upd_rem_set(&capabilities[i]->upd_rem_set);
+        }
+    }
+
 #if defined(THREADED_RTS) && defined(CC_LLVM_BACKEND) && (CC_SUPPORTS_TLS == 0)
     newThreadLocalKey(&gctKey);
 #endif
@@ -282,6 +300,7 @@ void storageAddCapabilities (uint32_t from, uint32_t to)
 void
 exitStorage (void)
 {
+    nonmovingExit();
     updateNurseriesStats();
     stat_exit();
 }
@@ -302,7 +321,8 @@ freeStorage (bool free_heap)
 }
 
 /* -----------------------------------------------------------------------------
-   Note [CAF management].
+   Note [CAF management]
+   ~~~~~~~~~~~~~~~~~~~~~
 
    The entry code for every CAF does the following:
 
@@ -337,6 +357,7 @@ freeStorage (bool free_heap)
 
    ------------------
    Note [atomic CAF entry]
+   ~~~~~~~~~~~~~~~~~~~~~~~
 
    With THREADED_RTS, newCAF() is required to be atomic (see
    #5558). This is because if two threads happened to enter the same
@@ -350,6 +371,7 @@ freeStorage (bool free_heap)
 
    ------------------
    Note [GHCi CAFs]
+   ~~~~~~~~~~~~~~~~
 
    For GHCI, we have additional requirements when dealing with CAFs:
 
@@ -369,6 +391,51 @@ freeStorage (bool free_heap)
 
       -- SDM 29/1/01
 
+   ------------------
+   Note [Static objects under the nonmoving collector]
+   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+   Static object management is a bit tricky under the nonmoving collector as we
+   need to maintain a bit more state than in the moving collector. In
+   particular, the moving collector uses the low bits of the STATIC_LINK field
+   to determine whether the object has been moved to the scavenger's work list
+   (see Note [STATIC_LINK fields] in Storage.h).
+
+   However, the nonmoving collector also needs a place to keep its mark bit.
+   This is problematic as we therefore need at least three bits of state
+   but can assume only two bits are available in STATIC_LINK (due to 32-bit
+   systems).
+
+   To accommodate this we move handling of static objects entirely to the
+   oldest generation when the nonmoving collector is in use. To do this safely
+   and efficiently we allocate the blackhole created by lockCAF() directly in
+   the non-moving heap. This means that the moving collector can completely
+   ignore static objects in minor collections since they are guaranteed not to
+   have any references into the moving heap. Of course, the blackhole itself
+   likely will contain a reference into the moving heap but this is
+   significantly easier to handle, being a heap-allocated object (see Note
+   [Aging under the non-moving collector] in NonMoving.c for details).
+
+   During the moving phase of a major collection we treat static objects
+   as we do any other reference into the non-moving heap by pushing them
+   to the non-moving mark queue (see Note [Aging under the non-moving
+   collector]).
+
+   This allows the non-moving collector to have full control over the flags
+   in STATIC_LINK, which it uses as described in Note [STATIC_LINK fields].
+   This is implemented by NonMovingMark.c:bump_static_flag.
+
+   In short, the plan is:
+
+     - lockCAF allocates its blackhole in the nonmoving heap. This is important
+       to ensure that we do not need to place the static object on the mut_list
+       since otherwise we would need some way to ensure that it is evacuated
+       only once during a moving collection.
+
+     - evacuate_static_object merely pushes objects onto the mark queue
+
+     - the nonmoving collector uses the flags in STATIC_LINK as its mark bit.
+
    -------------------------------------------------------------------------- */
 
 STATIC_INLINE StgInd *
@@ -402,11 +469,36 @@ lockCAF (StgRegTable *reg, StgIndStatic *caf)
     // successfully claimed by us; overwrite with IND_STATIC
 #endif
 
+    // Push anything that will become unreachable after this update to the
+    // update remembered set, to maintain the snapshot invariant.
+    const StgInfoTable *orig_info_tbl = INFO_PTR_TO_STRUCT(orig_info);
+    // OSA: Assertions to make sure my understanding of static thunks is correct
+    ASSERT(orig_info_tbl->type == THUNK_STATIC);
+    // Secondly, I believe static thunks can't have a payload: anything that
+    // they reference should be in their SRTs.
+    ASSERT(orig_info_tbl->layout.payload.ptrs == 0);
+    // Because the payload is empty we only need to push the SRT
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {
+        StgThunkInfoTable *thunk_info = itbl_to_thunk_itbl(orig_info_tbl);
+        if (thunk_info->i.srt) {
+            updateRemembSetPushClosure(cap, GET_SRT(thunk_info));
+        }
+    }
+
     // For the benefit of revertCAFs(), save the original info pointer
     caf->saved_info = orig_info;
 
     // Allocate the blackhole indirection closure
-    bh = (StgInd *)allocate(cap, sizeofW(*bh));
+    if (RtsFlags.GcFlags.useNonmoving) {
+        // See Note [Static objects under the nonmoving collector].
+        ACQUIRE_SM_LOCK;
+        bh = (StgInd *)nonmovingAllocate(cap, sizeofW(*bh));
+        RELEASE_SM_LOCK;
+        recordMutableCap((StgClosure*)bh,
+                         regTableToCapability(reg), oldest_gen->no);
+    } else {
+        bh = (StgInd *)allocate(cap, sizeofW(*bh));
+    }
     bh->indirectee = (StgClosure *)cap->r.rCurrentTSO;
     SET_HDR(bh, &stg_CAF_BLACKHOLE_info, caf->header.prof.ccs);
     // Ensure that above writes are visible before we introduce reference as CAF indirectee.
@@ -448,7 +540,9 @@ newCAF(StgRegTable *reg, StgIndStatic *caf)
     else
     {
         // Put this CAF on the mutable list for the old generation.
-        if (oldest_gen->no != 0) {
+        // N.B. the nonmoving collector works a bit differently: see
+        // Note [Static objects under the nonmoving collector].
+        if (oldest_gen->no != 0 && !RtsFlags.GcFlags.useNonmoving) {
             recordMutableCap((StgClosure*)caf,
                              regTableToCapability(reg), oldest_gen->no);
         }
@@ -525,7 +619,9 @@ StgInd* newGCdCAF (StgRegTable *reg, StgIndStatic *caf)
     if (!bh) return NULL;
 
     // Put this CAF on the mutable list for the old generation.
-    if (oldest_gen->no != 0) {
+    // N.B. the nonmoving collector works a bit differently:
+    // see Note [Static objects under the nonmoving collector].
+    if (oldest_gen->no != 0 && !RtsFlags.GcFlags.useNonmoving) {
         recordMutableCap((StgClosure*)caf,
                          regTableToCapability(reg), oldest_gen->no);
     }
@@ -1073,6 +1169,27 @@ allocatePinned (Capability *cap, W_ n)
    Write Barriers
    -------------------------------------------------------------------------- */
 
+/* These write barriers on heavily mutated objects serve two purposes:
+ *
+ * - Efficient maintenance of the generational invariant: Record whether or not
+ *   we have added a particular mutable object to mut_list, as it may contain
+ *   references to younger generations.
+ *
+ * - Maintenance of the nonmoving collector's snapshot invariant: Record objects
+ *   which are about to no longer be reachable due to mutation.
+ *
+ * In each case we record whether the object has been added to the mutable list
+ * by way of either the info pointer or a dedicated "dirty" flag. The GC will
+ * clear this flag and remove the object from mut_list (or rather, not re-add it)
+ * if it finds that the object contains no references into any younger generation.
+ *
+ * Note that all dirty objects will be marked as clean during preparation for a
+ * concurrent collection. Consequently, we can use the dirtiness flag to determine
+ * whether or not we need to add overwritten pointers to the update remembered
+ * set (since we need only write the value prior to the first update to maintain
+ * the snapshot invariant).
+ */
+
 /*
    This is the write barrier for MUT_VARs, a.k.a. IORefs.  A
    MUT_VAR_CLEAN object is not on the mutable list; a MUT_VAR_DIRTY
@@ -1080,25 +1197,39 @@ allocatePinned (Capability *cap, W_ n)
    and is put on the mutable list.
 */
 void
-dirty_MUT_VAR(StgRegTable *reg, StgClosure *p)
+dirty_MUT_VAR(StgRegTable *reg, StgMutVar *mvar, StgClosure *old)
 {
     Capability *cap = regTableToCapability(reg);
     // No barrier required here as no other heap object fields are read. See
     // note [Heap memory barriers] in SMP.h.
-    if (p->header.info == &stg_MUT_VAR_CLEAN_info) {
-        p->header.info = &stg_MUT_VAR_DIRTY_info;
-        recordClosureMutated(cap,p);
+    if (mvar->header.info == &stg_MUT_VAR_CLEAN_info) {
+        mvar->header.info = &stg_MUT_VAR_DIRTY_info;
+        recordClosureMutated(cap, (StgClosure *) mvar);
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
+            updateRemembSetPushClosure_(reg, old);
+        }
     }
 }
 
+/*
+ * This is the write barrier for TVARs.
+ * old is the pointer that we overwrote, which is required by the concurrent
+ * garbage collector. Note that, while StgTVars contain multiple pointers, we
+ * only overwrite one per dirty_TVAR call, so we only need to take one old
+ * pointer argument.
+ */
 void
-dirty_TVAR(Capability *cap, StgTVar *p)
+dirty_TVAR(Capability *cap, StgTVar *p,
+           StgClosure *old)
 {
     // No barrier required here as no other heap object fields are read. See
     // note [Heap memory barriers] in SMP.h.
     if (p->header.info == &stg_TVAR_CLEAN_info) {
         p->header.info = &stg_TVAR_DIRTY_info;
         recordClosureMutated(cap,(StgClosure*)p);
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
+            updateRemembSetPushClosure(cap, old);
+        }
     }
 }
 
@@ -1113,6 +1244,9 @@ setTSOLink (Capability *cap, StgTSO *tso, StgTSO *target)
     if (tso->dirty == 0) {
         tso->dirty = 1;
         recordClosureMutated(cap,(StgClosure*)tso);
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
+            updateRemembSetPushClosure(cap, (StgClosure *) tso->_link);
+        }
     }
     tso->_link = target;
 }
@@ -1123,6 +1257,9 @@ setTSOPrev (Capability *cap, StgTSO *tso, StgTSO *target)
     if (tso->dirty == 0) {
         tso->dirty = 1;
         recordClosureMutated(cap,(StgClosure*)tso);
+        IF_NONMOVING_WRITE_BARRIER_ENABLED {
+            updateRemembSetPushClosure(cap, (StgClosure *) tso->block_info.prev);
+        }
     }
     tso->block_info.prev = target;
 }
@@ -1134,15 +1271,49 @@ dirty_TSO (Capability *cap, StgTSO *tso)
         tso->dirty = 1;
         recordClosureMutated(cap,(StgClosure*)tso);
     }
+
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {
+        updateRemembSetPushTSO(cap, tso);
+    }
 }
 
 void
 dirty_STACK (Capability *cap, StgStack *stack)
 {
-    if (stack->dirty == 0) {
-        stack->dirty = 1;
+    // Push to the update remembered set before we set stack->dirty, since
+    // the nonmoving collector may already be marking the stack.
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {
+        updateRemembSetPushStack(cap, stack);
+    }
+
+    if (! (stack->dirty & STACK_DIRTY)) {
+        stack->dirty = STACK_DIRTY;
         recordClosureMutated(cap,(StgClosure*)stack);
     }
+
+}
+
+/*
+ * This is the concurrent collector's write barrier for MVARs. In the other
+ * write barriers above this is folded into the dirty_* functions.  However, in
+ * the case of MVars we need to separate the acts of adding the MVar to the
+ * mutable list and adding its fields to the update remembered set.
+ *
+ * Specifically, the wakeup loop in stg_putMVarzh wants to freely mutate the
+ * pointers of the MVar but needs to keep its lock, meaning we can't yet add it
+ * to the mutable list, lest the assertion checking for clean MVars on the
+ * mutable list fail.
+ */
+void
+update_MVAR(StgRegTable *reg, StgClosure *p, StgClosure *old_val)
+{
+    Capability *cap = regTableToCapability(reg);
+    IF_NONMOVING_WRITE_BARRIER_ENABLED {
+        StgMVar *mvar = (StgMVar *) p;
+        updateRemembSetPushClosure(cap, old_val);
+        updateRemembSetPushClosure(cap, (StgClosure *) mvar->head);
+        updateRemembSetPushClosure(cap, (StgClosure *) mvar->tail);
+    }
 }
 
 /*
@@ -1154,9 +1325,11 @@ dirty_STACK (Capability *cap, StgStack *stack)
    such as Chaneneos and cheap-concurrency.
 */
 void
-dirty_MVAR(StgRegTable *reg, StgClosure *p)
+dirty_MVAR(StgRegTable *reg, StgClosure *p, StgClosure *old_val)
 {
-    recordClosureMutated(regTableToCapability(reg),p);
+    Capability *cap = regTableToCapability(reg);
+    update_MVAR(reg, p, old_val);
+    recordClosureMutated(cap, p);
 }
 
 /* -----------------------------------------------------------------------------
@@ -1232,8 +1405,8 @@ W_ countOccupied (bdescr *bd)
 
 W_ genLiveWords (generation *gen)
 {
-    return gen->n_words + gen->n_large_words +
-        gen->n_compact_blocks * BLOCK_SIZE_W;
+    return (gen->live_estimate ? gen->live_estimate : gen->n_words) +
+        gen->n_large_words + gen->n_compact_blocks * BLOCK_SIZE_W;
 }
 
 W_ genLiveBlocks (generation *gen)
@@ -1289,9 +1462,9 @@ calcNeeded (bool force_major, memcount *blocks_needed)
     for (uint32_t g = 0; g < RtsFlags.GcFlags.generations; g++) {
         generation *gen = &generations[g];
 
-        W_ blocks = gen->n_blocks // or: gen->n_words / BLOCK_SIZE_W (?)
-                  + gen->n_large_blocks
-                  + gen->n_compact_blocks;
+        W_ blocks = gen->live_estimate ? (gen->live_estimate / BLOCK_SIZE_W) : gen->n_blocks;
+        blocks += gen->n_large_blocks
+                + gen->n_compact_blocks;
 
         // we need at least this much space
         needed += blocks;
@@ -1309,7 +1482,7 @@ calcNeeded (bool force_major, memcount *blocks_needed)
                 //  mark stack:
                 needed += gen->n_blocks / 100;
             }
-            if (gen->compact) {
+            if (gen->compact || (RtsFlags.GcFlags.useNonmoving && gen == oldest_gen)) {
                 continue; // no additional space needed for compaction
             } else {
                 needed += gen->n_blocks;
@@ -1408,6 +1581,7 @@ void flushExec (W_ len, AdjustorExecutable exec_addr)
   __clear_cache((void*)begin, (void*)end);
 # endif
 #elif defined(__GNUC__)
+  /* For all other platforms, fall back to a libgcc builtin. */
   unsigned char* begin = (unsigned char*)exec_addr;
   unsigned char* end   = begin + len;
 # if GCC_HAS_BUILTIN_CLEAR_CACHE
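
Taken together, the dirty_* changes in this file follow one pattern: read the pointer that is about to be overwritten and, if the nonmoving write barrier is enabled, push it to the update remembered set so the concurrent mark still sees the heap as it was at the start of the collection. A minimal, hedged sketch of that pattern (the helper name is invented; IF_NONMOVING_WRITE_BARRIER_ENABLED and updateRemembSetPushClosure are the names used in the diff above):

    /* Hedged sketch of the snapshot write barrier pattern used by the
     * dirty_MUT_VAR/dirty_TVAR changes above; not the real RTS code. */
    static void write_ptr_field(Capability *cap, StgClosure **field,
                                StgClosure *new_val)
    {
        StgClosure *old_val = *field;                  // read the old value first
        IF_NONMOVING_WRITE_BARRIER_ENABLED {
            updateRemembSetPushClosure(cap, old_val);  // preserve the snapshot
        }
        *field = new_val;
    }
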
diff --git a/rts/sm/Storage.h b/rts/sm/Storage.h
index aaa44428b3dfa66f408615f1e2350b00650b6c00..cdb9720650f79ba62ff0387430719076a87f217c 100644
--- a/rts/sm/Storage.h
+++ b/rts/sm/Storage.h
@@ -17,6 +17,7 @@
    -------------------------------------------------------------------------- */
 
 void initStorage(void);
+void initGeneration(generation *gen, int g);
 void exitStorage(void);
 void freeStorage(bool free_heap);
 
@@ -46,8 +47,9 @@ extern Mutex sm_mutex;
    The write barrier for MVARs and TVARs
    -------------------------------------------------------------------------- */
 
-void dirty_MVAR(StgRegTable *reg, StgClosure *p);
-void dirty_TVAR(Capability *cap, StgTVar *p);
+void update_MVAR(StgRegTable *reg, StgClosure *p, StgClosure *old_val);
+void dirty_MVAR(StgRegTable *reg, StgClosure *p, StgClosure *old);
+void dirty_TVAR(Capability *cap, StgTVar *p, StgClosure *old);
 
 /* -----------------------------------------------------------------------------
    Nursery manipulation
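
For callers, the new three-argument prototypes above mean reading the field being overwritten before storing the new value. A hedged usage sketch for dirty_MVAR follows; the ordering that matters is that old_val is read before the overwrite, and every name other than dirty_MVAR and the StgMVar field is illustrative only:

    /* Hedged sketch only: how a caller of the new dirty_MVAR signature
     * might look. Not taken from PrimOps.cmm. */
    static void overwrite_mvar_value(StgRegTable *reg, StgMVar *mvar,
                                     StgClosure *new_val)
    {
        StgClosure *old_val = mvar->value;   // read before overwriting
        mvar->value = new_val;
        dirty_MVAR(reg, (StgClosure *) mvar, old_val);
    }
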
diff --git a/testsuite/config/ghc b/testsuite/config/ghc
index bc888d166103571ec3d874283213a2f23a2fc7b6..9a3459ea96fa7459f1f42056c36ac15f12b0647b 100644
--- a/testsuite/config/ghc
+++ b/testsuite/config/ghc
@@ -26,7 +26,10 @@ config.other_ways         = ['prof', 'normal_h',
                              'profllvm', 'profoptllvm', 'profthreadedllvm',
                              'debug',
                              'ghci-ext', 'ghci-ext-prof',
-                             'ext-interp']
+                             'ext-interp',
+                             'nonmoving',
+                             'nonmoving_thr',
+                             'nonmoving_thr_ghc']
 
 if ghc_with_native_codegen:
     config.compile_ways.append('optasm')
@@ -96,7 +99,10 @@ config.way_flags = {
     'profthreadedllvm' : ['-O', '-prof', '-static', '-fprof-auto', '-threaded', '-fllvm'],
     'ghci-ext'         : ['--interactive', '-v0', '-ignore-dot-ghci', '-fno-ghci-history', '-fexternal-interpreter', '+RTS', '-I0.1', '-RTS'],
     'ghci-ext-prof'    : ['--interactive', '-v0', '-ignore-dot-ghci', '-fno-ghci-history', '-fexternal-interpreter', '-prof', '+RTS', '-I0.1', '-RTS'],
-    'ext-interp' : ['-fexternal-interpreter'],
+    'ext-interp'   : ['-fexternal-interpreter'],
+    'nonmoving'    : [],
+    'nonmoving_thr': ['-threaded'],
+    'nonmoving_thr_ghc': ['+RTS', '-xn', '-N2', '-RTS', '-threaded'],
    }
 
 config.way_rts_flags = {
@@ -135,6 +141,9 @@ config.way_rts_flags = {
     'ghci-ext'         : [],
     'ghci-ext-prof'    : [],
     'ext-interp'       : [],
+    'nonmoving'        : ['-xn'],
+    'nonmoving_thr'    : ['-xn', '-N2'],
+    'nonmoving_thr_ghc': ['-xn', '-N2'],
    }
 
 # Useful classes of ways that can be used with only_ways(), omit_ways() and
diff --git a/testsuite/tests/codeGen/should_run/all.T b/testsuite/tests/codeGen/should_run/all.T
index 0882f2b6050cf5395c5f50d10a3a096b8303df64..f96820de81fa4a4c8d87c1c713c67b67d89599ed 100644
--- a/testsuite/tests/codeGen/should_run/all.T
+++ b/testsuite/tests/codeGen/should_run/all.T
@@ -1,5 +1,6 @@
 # Test +RTS -G1 here (it isn't tested anywhere else)
-setTestOpts(unless(fast(), extra_ways(['g1'])))
+# N.B. Nonmoving collector doesn't support -G1
+setTestOpts(unless(fast(), [ extra_ways(['g1']), omit_ways(['nonmoving', 'nonmoving_thr', 'nonmoving_thr_ghc'])]))
 
 test('cgrun001', normal, compile_and_run, [''])
 test('cgrun002', normal, compile_and_run, [''])
@@ -194,9 +195,11 @@ test('T15696_3', normal, compile_and_run, ['-O'])
 
 test('T15892',
      [ ignore_stdout,
-        # we want to do lots of major GC to make the bug more likely to
-        # happen, so -G1 -A32k:
-        extra_run_opts('+RTS -G1 -A32k -RTS') ],
+       # -G1 is unsupported by the nonmoving GC
+       omit_ways(['nonmoving', 'nonmoving_thr', 'nonmoving_thr_ghc']),
+       # we want to do lots of major GC to make the bug more likely to
+       # happen, so -G1 -A32k:
+       extra_run_opts('+RTS -G1 -A32k -RTS') ],
      compile_and_run, ['-O'])
 test('T16617', normal, compile_and_run, [''])
 test('T16449_2', exit_code(0), compile_and_run, [''])
diff --git a/testsuite/tests/concurrent/should_run/all.T b/testsuite/tests/concurrent/should_run/all.T
index 467040223f5611cffd9d7d6abd4f817d0ca7d13d..9297c5890e365607cce9a8e8456f8e322bcbef99 100644
--- a/testsuite/tests/concurrent/should_run/all.T
+++ b/testsuite/tests/concurrent/should_run/all.T
@@ -7,7 +7,7 @@ test('conc027', normal, compile_and_run, [''])
 test('conc051', normal, compile_and_run, [''])
 
 if ('threaded1' in config.run_ways):
-   only_threaded_ways = only_ways(['ghci','threaded1','threaded2'])
+   only_threaded_ways = only_ways(['ghci','threaded1','threaded2', 'nonmoving_thr'])
 else:
    only_threaded_ways = skip
 
@@ -203,8 +203,8 @@ test('foreignInterruptible', [when(fast(), skip),
                               ],
       compile_and_run, [''])
 
-test('conc037', only_ways(['threaded1','threaded2']), compile_and_run, [''])
-test('conc038', only_ways(['threaded1','threaded2']), compile_and_run, [''])
+test('conc037', only_ways(['threaded1', 'threaded2', 'nonmoving_thr']), compile_and_run, [''])
+test('conc038', only_ways(['threaded1', 'threaded2', 'nonmoving_thr']), compile_and_run, [''])
 
 # Omit for GHCi, uses foreign export
 # Omit for the threaded ways, because in this case the main thread is allowed to
@@ -224,7 +224,7 @@ test('conc045', normal, compile_and_run, [''])
 test('conc058', normal, compile_and_run, [''])
 
 test('conc059',
-     [only_ways(['threaded1', 'threaded2']),
+     [only_ways(['threaded1', 'threaded2', 'nonmoving_thr']),
       pre_cmd('$MAKE -s --no-print-directory conc059_setup')],
      compile_and_run, ['conc059_c.c -no-hs-main'])
 
@@ -243,7 +243,7 @@ test('conc067', ignore_stdout, compile_and_run, [''])
 test('conc068', [ omit_ways(concurrent_ways), exit_code(1) ], compile_and_run, [''])
 
 test('setnumcapabilities001',
-     [ only_ways(['threaded1','threaded2']),
+     [ only_ways(['threaded1','threaded2', 'nonmoving_thr']),
        extra_run_opts('8 12 2000'),
        req_smp ],
      compile_and_run, [''])
@@ -254,7 +254,7 @@ test('compareAndSwap', [omit_ways(['ghci','hpc']), reqlib('primitive')], compile
 test('hs_try_putmvar001',
      [
      when(opsys('mingw32'),skip), # uses pthread APIs in the C code
-     only_ways(['threaded1','threaded2']),
+     only_ways(['threaded1', 'threaded2', 'nonmoving_thr']),
       extra_clean(['hs_try_putmvar001_c.o'])],
      compile_and_run,
      ['hs_try_putmvar001_c.c'])
@@ -272,7 +272,7 @@ test('hs_try_putmvar003',
      [
      when(opsys('mingw32'),skip), # uses pthread APIs in the C code
      pre_cmd('$MAKE -s --no-print-directory hs_try_putmvar003_setup'),
-     only_ways(['threaded1','threaded2']),
+     only_ways(['threaded1', 'threaded2', 'nonmoving_thr']),
      extra_clean(['hs_try_putmvar003_c.o']),
      extra_run_opts('1 16 32 100'),
      fragile_for(16361, ['threaded1'])
diff --git a/testsuite/tests/perf/compiler/all.T b/testsuite/tests/perf/compiler/all.T
index afac3752fa2deb765b41bb47956c83540c6e8935..6b80e193d1f2a72181b2dc5a64f29525a793e4fb 100644
--- a/testsuite/tests/perf/compiler/all.T
+++ b/testsuite/tests/perf/compiler/all.T
@@ -385,7 +385,9 @@ test ('T9630',
       extra_clean(['T9630a.hi', 'T9630a.o']),
 
       # Use `+RTS -G1` for more stable residency measurements. Note [residency].
-      extra_hc_opts('+RTS -G1 -RTS')
+      extra_hc_opts('+RTS -G1 -RTS'),
+      # The nonmoving collector does not support -G1
+      omit_ways(['nonmoving', 'nonmoving_thr', 'nonmoving_thr_ghc'])
       ],
       multimod_compile,
       ['T9630', '-v0 -O'])
diff --git a/testsuite/tests/rts/all.T b/testsuite/tests/rts/all.T
index 9e20ba0b81e216e032c6d680e6b629cd0c9dc5d1..36f63c571ead24c7092b5837a5bb8f87f0003e3d 100644
--- a/testsuite/tests/rts/all.T
+++ b/testsuite/tests/rts/all.T
@@ -12,7 +12,10 @@ test('testmblockalloc',
 
 # See bug #101, test requires +RTS -c (or equivalently +RTS -M<something>)
 # only GHCi triggers the bug, but we run the test all ways for completeness.
-test('bug1010', normal, compile_and_run, ['+RTS -c -RTS'])
+test('bug1010',
+     # Non-moving GC doesn't support -c
+     omit_ways(['nonmoving', 'nonmoving_thr', 'nonmoving_thr_ghc']),
+     compile_and_run, ['+RTS -c -RTS'])
 
 def normalise_address(str):
     return re.sub('Access violation in generated code when reading [0]+',
@@ -67,8 +70,12 @@ test('outofmem', when(opsys('darwin'), skip),
                  makefile_test, ['outofmem'])
 test('outofmem2', normal, makefile_test, ['outofmem2'])
 
-test('T2047', [ignore_stdout, extra_run_opts('+RTS -c -RTS')],
-              compile_and_run, ['-package containers'])
+test('T2047',
+     [ignore_stdout,
+      extra_run_opts('+RTS -c -RTS'),
+      # Non-moving collector doesn't support -c
+      omit_ways(['nonmoving', 'nonmoving_thr', 'nonmoving_thr_ghc'])],
+     compile_and_run, ['-package containers'])
 
 # Blackhole-detection test.
 # Skip GHCi due to #2786
@@ -183,7 +190,7 @@ test('T6006', [ omit_ways(prof_ways + ['ghci']),
 test('T7037', [], makefile_test, ['T7037'])
 
 test('T7087', exit_code(1), compile_and_run, [''])
-test('T7160', normal, compile_and_run, [''])
+test('T7160', omit_ways(['nonmoving_thr', 'nonmoving_thr_ghc']), compile_and_run, [''])
 
 test('T7040', [omit_ways(['ghci'])], compile_and_run, ['T7040_c.c'])
 
diff --git a/testsuite/tests/rts/testblockalloc.c b/testsuite/tests/rts/testblockalloc.c
index 577245f45e2739698bebb2f0c6a32bee3c26335d..53eed240156922fd8c32a2ec22ef3eb7d3b0d341 100644
--- a/testsuite/tests/rts/testblockalloc.c
+++ b/testsuite/tests/rts/testblockalloc.c
@@ -3,6 +3,7 @@
 #include <stdio.h>
 
 extern bdescr *allocGroup_lock_lock(uint32_t n);
+extern bdescr *allocAlignedGroupOnNode (uint32_t node, W_ n);
 extern void freeGroup_lock(bdescr *p);
 
 const int ARRSIZE  = 256;
@@ -13,64 +14,110 @@ const int SEED     = 0xf00f00;
 
 extern StgWord mblocks_allocated;
 
-int main (int argc, char *argv[])
+static void test_random_alloc(void)
 {
-    int i, j, b;
-
     bdescr *a[ARRSIZE];
 
-    srand(SEED);
+    // repeatedly sweep through the array, allocating new random-sized
+    // objects and deallocating the old ones.
+    for (int i=0; i < LOOPS; i++)
+    {
+        for (int j=0; j < ARRSIZE; j++)
+        {
+            if (i > 0)
+            {
+                IF_DEBUG(block_alloc, debugBelch("A%d: freeing %p, %d blocks @ %p\n", j, a[j], a[j]->blocks, a[j]->start));
+                freeGroup_lock(a[j]);
+                DEBUG_ONLY(checkFreeListSanity());
+            }
+
+            int b = (rand() % MAXALLOC) + 1;
+            a[j] = allocGroup_lock(b);
+            IF_DEBUG(block_alloc, debugBelch("A%d: allocated %p, %d blocks @ %p\n", j, a[j], b, a[j]->start));
+            // allocating zero blocks isn't allowed
+            DEBUG_ONLY(checkFreeListSanity());
+        }
+    }
 
+    for (int j=0; j < ARRSIZE; j++)
     {
-        RtsConfig conf = defaultRtsConfig;
-        conf.rts_opts_enabled = RtsOptsAll;
-        hs_init_ghc(&argc, &argv, conf);
+        freeGroup_lock(a[j]);
     }
+}
+
+static void test_sequential_alloc(void)
+{
+    bdescr *a[ARRSIZE];
 
-   // repeatedly sweep though the array, allocating new random-sized
-   // objects and deallocating the old ones.
-   for (i=0; i < LOOPS; i++)
-   {
-       for (j=0; j < ARRSIZE; j++)
-       {
-           if (i > 0)
-           {
-               IF_DEBUG(block_alloc, debugBelch("A%d: freeing %p, %d blocks @ %p\n", j, a[j], a[j]->blocks, a[j]->start));
-               freeGroup_lock(a[j]);
-               DEBUG_ONLY(checkFreeListSanity());
-           }
-           b = (rand() % MAXALLOC) + 1;
-           a[j] = allocGroup_lock(b);
-           IF_DEBUG(block_alloc, debugBelch("A%d: allocated %p, %d blocks @ %p\n", j, a[j], b, a[j]->start));
-           // allocating zero blocks isn't allowed
-           DEBUG_ONLY(checkFreeListSanity());
-       }
-   }
-
-   for (j=0; j < ARRSIZE; j++)
-   {
-       freeGroup_lock(a[j]);
-   }
-    
     // this time, sweep forwards allocating new blocks, and then
     // backwards deallocating them.
-    for (i=0; i < LOOPS; i++)
+    for (int i=0; i < LOOPS; i++)
     {
-        for (j=0; j < ARRSIZE; j++)
+        for (int j=0; j < ARRSIZE; j++)
         {
-            b = (rand() % MAXALLOC) + 1;
+            int b = (rand() % MAXALLOC) + 1;
             a[j] = allocGroup_lock(b);
             IF_DEBUG(block_alloc, debugBelch("B%d,%d: allocated %p, %d blocks @ %p\n", i, j, a[j], b, a[j]->start));
             DEBUG_ONLY(checkFreeListSanity());
         }
-        for (j=ARRSIZE-1; j >= 0; j--)
+        for (int j=ARRSIZE-1; j >= 0; j--)
         {
             IF_DEBUG(block_alloc, debugBelch("B%d,%d: freeing %p, %d blocks @ %p\n", i, j, a[j], a[j]->blocks, a[j]->start));
             freeGroup_lock(a[j]);
             DEBUG_ONLY(checkFreeListSanity());
         }
     }
-    
+}
+
+static void test_aligned_alloc(void)
+{
+    bdescr *a[ARRSIZE];
+
+    // sweep forwards allocating new aligned groups, and then backwards
+    // deallocating them.
+    for (int i=0; i < LOOPS; i++)
+    {
+        for (int j=0; j < ARRSIZE; j++)
+        {
+            // allocAlignedGroupOnNode does not support allocating more than
+            // BLOCKS_PER_MBLOCK/2 blocks.
+            int b = rand() % (BLOCKS_PER_MBLOCK / 2);
+            if (b == 0) { b = 1; }
+            a[j] = allocAlignedGroupOnNode(0, b);
+            if ((((W_)(a[j]->start)) % (b*BLOCK_SIZE)) != 0)
+            {
+                barf("%p is not aligned to allocation size %d", a[j], b);
+            }
+            IF_DEBUG(block_alloc, debugBelch("B%d,%d: allocated %p, %d blocks @ %p\n", i, j, a[j], b, a[j]->start));
+            DEBUG_ONLY(checkFreeListSanity());
+        }
+        for (int j=ARRSIZE-1; j >= 0; j--)
+        {
+            IF_DEBUG(block_alloc, debugBelch("B%d,%d: freeing %p, %d blocks @ %p\n", i, j, a[j], a[j]->blocks, a[j]->start));
+            freeGroup_lock(a[j]);
+            DEBUG_ONLY(checkFreeListSanity());
+        }
+    }
+}
+
+int main (int argc, char *argv[])
+{
+    srand(SEED);
+
+    {
+        RtsConfig conf = defaultRtsConfig;
+        conf.rts_opts_enabled = RtsOptsAll;
+        hs_init_ghc(&argc, &argv, conf);
+    }
+
+    test_random_alloc();
+    test_sequential_alloc();
+    test_aligned_alloc();
+
     DEBUG_ONLY(checkFreeListSanity());
 
     hs_exit(); // will do a memory leak test
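
The new test_aligned_alloc exercises the key property of allocAlignedGroupOnNode: a group of n blocks starts at an address that is a multiple of n * BLOCK_SIZE. A hedged restatement of the check performed in the test (the helper is invented; the expression mirrors the one used above):

    /* Hedged sketch: the alignment invariant exercised by test_aligned_alloc. */
    static bool group_is_aligned(bdescr *bd, W_ n_blocks)
    {
        return ((W_)bd->start % (n_blocks * BLOCK_SIZE)) == 0;
    }
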
diff --git a/utils/deriveConstants/Main.hs b/utils/deriveConstants/Main.hs
index 54533254dd1adfe502bd64b07619696c91320ec0..f6f590715ba73c46313744aa597250d6e67e0fe6 100644
--- a/utils/deriveConstants/Main.hs
+++ b/utils/deriveConstants/Main.hs
@@ -307,6 +307,9 @@ wanteds os = concat
                              "sizeofW(StgHeader) - sizeofW(StgProfHeader)"
           ,constantWord Both "PROF_HDR_SIZE" "sizeofW(StgProfHeader)"
 
+           -- Stack flags for C--
+          ,constantWord C "STACK_DIRTY" "STACK_DIRTY"
+
            -- Size of a storage manager block (in bytes).
           ,constantWord Both "BLOCK_SIZE"  "BLOCK_SIZE"
           ,constantWord C    "MBLOCK_SIZE" "MBLOCK_SIZE"