diff --git a/compiler/cmm/CmmMachOp.hs b/compiler/cmm/CmmMachOp.hs
index 7cd5c1bc20887799c780f05c6e2ae72e0809f24a..bee69f6cc9409fb0f607f564ff52bf26d55e0f27 100644
--- a/compiler/cmm/CmmMachOp.hs
+++ b/compiler/cmm/CmmMachOp.hs
@@ -589,6 +589,7 @@ data CallishMachOp
   | MO_SubIntC   Width
   | MO_U_Mul2    Width
 
+  | MO_ReadBarrier
   | MO_WriteBarrier
   | MO_Touch         -- Keep variables live (when using interior pointers)
 
diff --git a/compiler/cmm/CmmParse.y b/compiler/cmm/CmmParse.y
index bb389d17ae68ecf2afbd426280673acb0945006a..bdc1e5dbc0d6b1ee3df041b9e308fa946d6ec432 100644
--- a/compiler/cmm/CmmParse.y
+++ b/compiler/cmm/CmmParse.y
@@ -999,6 +999,7 @@ machOps = listToUFM $
 callishMachOps :: UniqFM ([CmmExpr] -> (CallishMachOp, [CmmExpr]))
 callishMachOps = listToUFM $
         map (\(x, y) -> (mkFastString x, y)) [
+        ( "read_barrier", (,) MO_ReadBarrier ),
         ( "write_barrier", (,) MO_WriteBarrier ),
         ( "memcpy", memcpyLikeTweakArgs MO_Memcpy ),
         ( "memset", memcpyLikeTweakArgs MO_Memset ),
diff --git a/compiler/cmm/PprC.hs b/compiler/cmm/PprC.hs
index 822de431a46848e8379229a7f1a6a87145fa799a..dd2ea6cd753d553baefe1f59cf83bbc193c0dfb5 100644
--- a/compiler/cmm/PprC.hs
+++ b/compiler/cmm/PprC.hs
@@ -808,6 +808,7 @@ pprCallishMachOp_for_C mop
         MO_F32_Exp      -> text "expf"
         MO_F32_Sqrt     -> text "sqrtf"
         MO_F32_Fabs     -> text "fabsf"
+        MO_ReadBarrier  -> text "read_barrier"
         MO_WriteBarrier -> text "write_barrier"
         MO_Memcpy _     -> text "memcpy"
         MO_Memset _     -> text "memset"
diff --git a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
index 236b26dbdfee8237e4ac5720ab489dbd55bae6a0..3d5adaee3638056e1d27da91ec596c66296f1c51 100644
--- a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
+++ b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs
@@ -169,17 +169,25 @@ barrier = do
     let s = Fence False SyncSeqCst
     return (unitOL s, [])
 
+-- | Insert a 'barrier', unless the target platform is in the provided list of
+--   exceptions (where no code will be emitted instead).
+barrierUnless :: [Arch] -> LlvmM StmtData
+barrierUnless exs = do
+    platform <- getLlvmPlatform
+    if platformArch platform `elem` exs
+        then return (nilOL, [])
+        else barrier
+
 -- | Foreign Calls
 genCall :: ForeignTarget -> [CmmFormal] -> [CmmActual]
               -> LlvmM StmtData
 
--- Write barrier needs to be handled specially as it is implemented as an LLVM
--- intrinsic function.
+-- Barriers need to be handled specially as they are implemented as LLVM
+-- intrinsic functions.
+genCall (PrimTarget MO_ReadBarrier) _ _ =
+    barrierUnless [ArchX86, ArchX86_64, ArchSPARC]
 genCall (PrimTarget MO_WriteBarrier) _ _ = do
-    platform <- getLlvmPlatform
-    if platformArch platform `elem` [ArchX86, ArchX86_64, ArchSPARC]
-       then return (nilOL, [])
-       else barrier
+    barrierUnless [ArchX86, ArchX86_64, ArchSPARC]
 
 genCall (PrimTarget MO_Touch) _ _
  = return (nilOL, [])
@@ -827,6 +835,7 @@ cmmPrimOpFunctions mop = do
     -- We support MO_U_Mul2 through ordinary LLVM mul instruction, see the
     -- appropriate case of genCall.
     MO_U_Mul2 {}     -> unsupported
+    MO_ReadBarrier   -> unsupported
     MO_WriteBarrier  -> unsupported
     MO_Touch         -> unsupported
     MO_UF_Conv _     -> unsupported
diff --git a/compiler/nativeGen/PPC/CodeGen.hs b/compiler/nativeGen/PPC/CodeGen.hs
index 86525f4736debd72fc729e38fde9d102705e7268..7b0eda1f1b760dc48731b649f609d3cd7792bea4 100644
--- a/compiler/nativeGen/PPC/CodeGen.hs
+++ b/compiler/nativeGen/PPC/CodeGen.hs
@@ -1122,6 +1122,8 @@ genCCall :: ForeignTarget      -- function to call
          -> [CmmFormal]        -- where to put the result
          -> [CmmActual]        -- arguments (of mixed type)
          -> NatM InstrBlock
+genCCall (PrimTarget MO_ReadBarrier) _ _
+ = return $ unitOL LWSYNC
 genCCall (PrimTarget MO_WriteBarrier) _ _
  = return $ unitOL LWSYNC
 
@@ -2027,6 +2029,7 @@ genCCall' dflags gcp target dest_regs args
                     MO_AddIntC {}    -> unsupported
                     MO_SubIntC {}    -> unsupported
                     MO_U_Mul2 {}     -> unsupported
+                    MO_ReadBarrier   -> unsupported
                     MO_WriteBarrier  -> unsupported
                     MO_Touch         -> unsupported
                     MO_Prefetch_Data _ -> unsupported
diff --git a/compiler/nativeGen/SPARC/CodeGen.hs b/compiler/nativeGen/SPARC/CodeGen.hs
index 851a6f2f0a2670de73f412fdd5aac48993a7821d..52f03dde7e52e849420d4729c31cc4e5671f1f01 100644
--- a/compiler/nativeGen/SPARC/CodeGen.hs
+++ b/compiler/nativeGen/SPARC/CodeGen.hs
@@ -401,6 +401,8 @@ genCCall
 --
 -- In the SPARC case we don't need a barrier.
 --
+genCCall (PrimTarget MO_ReadBarrier) _ _
+ = return $ nilOL
 genCCall (PrimTarget MO_WriteBarrier) _ _
  = return $ nilOL
 
@@ -687,6 +689,7 @@ outOfLineMachOp_table mop
         MO_AddIntC {}    -> unsupported
         MO_SubIntC {}    -> unsupported
         MO_U_Mul2 {}     -> unsupported
+        MO_ReadBarrier   -> unsupported
         MO_WriteBarrier  -> unsupported
         MO_Touch         -> unsupported
         (MO_Prefetch_Data _) -> unsupported
diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs
index 61686186f17f76ddca58c5228c53ecc9ed6cc1a1..027b1ac6dd2498fff13a4cf8bd6689440b539307 100644
--- a/compiler/nativeGen/X86/CodeGen.hs
+++ b/compiler/nativeGen/X86/CodeGen.hs
@@ -1916,8 +1916,9 @@ genCCall dflags _ (PrimTarget (MO_Memset align)) _
         possibleWidth = minimum [left, sizeBytes]
         dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left))
 
+genCCall _ _ (PrimTarget MO_ReadBarrier) _ _ _  = return nilOL
 genCCall _ _ (PrimTarget MO_WriteBarrier) _ _ _ = return nilOL
-        -- write barrier compiles to no code on x86/x86-64;
+        -- barriers compile to no code on x86/x86-64;
         -- we keep it this long in order to prevent earlier optimisations.
 
 genCCall _ _ (PrimTarget MO_Touch) _ _ _ = return nilOL
@@ -2990,6 +2991,7 @@ outOfLineCmmOp bid mop res args
               MO_AddWordC {}   -> unsupported
               MO_SubWordC {}   -> unsupported
               MO_U_Mul2 {}     -> unsupported
+              MO_ReadBarrier   -> unsupported
               MO_WriteBarrier  -> unsupported
               MO_Touch         -> unsupported
               (MO_Prefetch_Data _ ) -> unsupported
diff --git a/includes/Cmm.h b/includes/Cmm.h
index ba84328a73eca4e4d474673eef2d318e78d905a5..d3570309858641d9e52c013dad2474fc224cc8f9 100644
--- a/includes/Cmm.h
+++ b/includes/Cmm.h
@@ -303,7 +303,7 @@
 #define ENTER_(ret,x)                                   \
  again:                                                 \
   W_ info;                                              \
-  LOAD_INFO(ret,x)                                       \
+  LOAD_INFO(ret,x)                                      \
   switch [INVALID_OBJECT .. N_CLOSURE_TYPES]            \
          (TO_W_( %INFO_TYPE(%STD_INFO(info)) )) {       \
   case                                                  \
@@ -626,6 +626,11 @@
 #define OVERWRITING_CLOSURE_OFS(c,n) /* nothing */
 #endif
 
+#if defined(THREADED_RTS)
+#define prim_read_barrier prim %read_barrier()
+#else
+#define prim_read_barrier /* nothing */
+#endif
 #if defined(THREADED_RTS)
 #define prim_write_barrier prim %write_barrier()
 #else
diff --git a/includes/stg/SMP.h b/includes/stg/SMP.h
index 16761516ac7436bd38c0c60b2522e855cc98f645..eef372f6fc7b5a5224830dc0d6c180651def46e8 100644
--- a/includes/stg/SMP.h
+++ b/includes/stg/SMP.h
@@ -96,6 +96,57 @@ EXTERN_INLINE void write_barrier(void);
 EXTERN_INLINE void store_load_barrier(void);
 EXTERN_INLINE void load_load_barrier(void);
 
+/*
+ * Note [Heap memory barriers]
+ *
+ * Machines with weak memory ordering semantics have consequences for how
+ * closures are observed and mutated. For example, consider a closure that needs
+ * to be updated to an indirection. In order for the indirection to be safe for
+ * concurrent observers to enter, said observers must read the indirection's
+ * info table before they read the indirectee. Furthermore, the entering
+ * observer makes assumptions about the closure based on its info table
+ * contents, e.g. an INFO_TYPE of IND implies the closure has an indirectee
+ * pointer that is safe to follow.
+ *
+ * When a closure is updated with an indirection, both its info table and its
+ * indirectee must be written. With weak memory ordering, these two writes can
+ * be arbitrarily reordered, and perhaps even interleaved with other threads'
+ * reads and writes (in the absence of memory barrier instructions). Consider
+ * this example of a bad reordering:
+ *
+ * - An updater writes to a closure's info table (INFO_TYPE is now IND).
+ * - A concurrent observer branches upon reading the closure's INFO_TYPE as IND.
+ * - A concurrent observer reads the closure's indirectee and enters it.
+ * - An updater writes the closure's indirectee.
+ *
+ * Here the update to the indirectee comes too late and the concurrent observer
+ * has jumped off into the abyss. Speculative execution can also cause us
+ * issues, consider:
+ *
+ * - An observer is about to case on a value in closure's info table.
+ * - The observer speculatively reads one or more of closure's fields.
+ * - An updater writes to closure's info table.
+ * - The observer takes a branch based on the new info table value, but with the
+ *   old closure fields!
+ * - The updater writes to the closure's other fields, but it's too late.
+ *
+ * Because of these effects, reads and writes to a closure's info table must be
+ * ordered carefully with respect to reads and writes to the closure's other
+ * fields, and memory barriers must be placed to ensure that reads and writes
+ * occur in program order. Specifically, updates to a closure must follow the
+ * following pattern:
+ *
+ * - Update the closure's (non-info table) fields.
+ * - Write barrier.
+ * - Update the closure's info table.
+ *
+ * Observing a closure's fields must follow the following pattern:
+ *
+ * - Read the closure's info pointer.
+ * - Read barrier.
+ * - Read the closure's (non-info table) fields.
+ */
+
 /* ----------------------------------------------------------------------------
    Implementations
    ------------------------------------------------------------------------- */
diff --git a/rts/Apply.cmm b/rts/Apply.cmm
index b08a8bf5385bff9ddffe9c516eaa0589c0c93dd5..c1ab2dc71f9fd5a33608a266a4233ac6c5aacfa2 100644
--- a/rts/Apply.cmm
+++ b/rts/Apply.cmm
@@ -62,8 +62,11 @@ again:
     W_  info;
     P_ untaggedfun;
     W_ arity;
+    // We must obey the correct heap object observation pattern in
+    // Note [Heap memory barriers] in SMP.h.
     untaggedfun = UNTAG(fun);
     info = %INFO_PTR(untaggedfun);
+    prim_read_barrier;
     switch [INVALID_OBJECT .. N_CLOSURE_TYPES]
         (TO_W_( %INFO_TYPE(%STD_INFO(info)) )) {
         case
@@ -104,7 +107,6 @@ again:
                 CCS_ALLOC(BYTES_TO_WDS(SIZEOF_StgPAP), CCS_OVERHEAD);
                 P_ pap;
                 pap = Hp - SIZEOF_StgPAP + WDS(1);
-                SET_HDR(pap, stg_PAP_info, CCCS);
                 StgPAP_arity(pap) = arity;
                 if (arity <= TAG_MASK) {
                   // TODO: Shouldn't this already be tagged? If not why did we
@@ -113,6 +115,8 @@ again:
                 }
                 StgPAP_fun(pap)   = fun;
                 StgPAP_n_args(pap) = 0;
+                prim_write_barrier;
+                SET_HDR(pap, stg_PAP_info, CCCS);
                 return (pap);
             }
         }
@@ -132,7 +136,6 @@ again:
                 pap = Hp - size + WDS(1);
                 // We'll lose the original PAP, so we should enter its CCS
                 ccall enterFunCCS(BaseReg "ptr", StgHeader_ccs(untaggedfun) "ptr");
-                SET_HDR(pap, stg_PAP_info, CCCS);
                 StgPAP_arity(pap) = StgPAP_arity(untaggedfun);
                 StgPAP_n_args(pap) = StgPAP_n_args(untaggedfun);
                 StgPAP_fun(pap)   = StgPAP_fun(fun);
@@ -140,6 +143,8 @@ again:
                 i = TO_W_(StgPAP_n_args(untaggedfun));
             loop:
                 if (i == 0) {
+                    prim_write_barrier;
+                    SET_HDR(pap, stg_PAP_info, CCCS);
                     return (pap);
                 }
                 i = i - 1;
@@ -282,6 +287,7 @@ for:
       info = %GET_FUN_INFO(UNTAG(R1));
       W_ type;
       type = TO_W_(StgFunInfoExtra_fun_type(info));
+      prim_read_barrier;
       if (type == ARG_GEN) {
           jump StgFunInfoExtra_slow_apply(info) [R1];
       }
@@ -360,6 +366,7 @@ for:
       info = %GET_FUN_INFO(UNTAG(R1));
       W_ type;
       type = TO_W_(StgFunInfoExtra_fun_type(info));
+      prim_read_barrier;
       if (type == ARG_GEN) {
           jump StgFunInfoExtra_slow_apply(info) [R1];
       }
@@ -424,12 +431,14 @@ for:
   TICK_ENT_VIA_NODE();
 
 #if defined(NO_ARG_REGS)
+  prim_read_barrier;
   jump %GET_ENTRY(UNTAG(R1)) [R1];
 #else
       W_ info;
       info = %GET_FUN_INFO(UNTAG(R1));
       W_ type;
       type = TO_W_(StgFunInfoExtra_fun_type(info));
+      prim_read_barrier;
       if (type == ARG_GEN) {
           jump StgFunInfoExtra_slow_apply(info) [R1];
       }
diff --git a/rts/Compact.cmm b/rts/Compact.cmm
index 061646846d612c93bb5797bfcea48535bcb731f7..7982fb2f818d31b56dd2eff2fb482d7575258f7d 100644
--- a/rts/Compact.cmm
+++ b/rts/Compact.cmm
@@ -69,6 +69,7 @@ eval:
     tag = GETTAG(p);
     p = UNTAG(p);
     info  = %INFO_PTR(p);
+    prim_read_barrier;
     type = TO_W_(%INFO_TYPE(%STD_INFO(info)));
 
     switch [0 .. N_CLOSURE_TYPES] type {
@@ -168,7 +169,6 @@ eval:
         cards = SIZEOF_StgMutArrPtrs + WDS(ptrs);
         ALLOCATE(compact, BYTES_TO_WDS(size), p, to, tag);
         P_[pp] = tag | to;
-        SET_HDR(to, StgHeader_info(p), StgHeader_ccs(p));
         StgMutArrPtrs_ptrs(to) = ptrs;
         StgMutArrPtrs_size(to) = StgMutArrPtrs_size(p);
         prim %memcpy(to + cards, p + cards , size - cards, 1);
@@ -182,6 +182,7 @@ eval:
             i = i + 1;
             goto loop0;
         }
+        SET_HDR(to, StgHeader_info(p), StgHeader_ccs(p));
         return();
     }
 
@@ -198,7 +199,6 @@ eval:
         ptrs = StgSmallMutArrPtrs_ptrs(p);
         ALLOCATE(compact, BYTES_TO_WDS(SIZEOF_StgSmallMutArrPtrs) + ptrs, p, to, tag);
         P_[pp] = tag | to;
-        SET_HDR(to, StgHeader_info(p), StgHeader_ccs(p));
         StgSmallMutArrPtrs_ptrs(to) = ptrs;
         i = 0;
       loop1:
@@ -210,6 +210,7 @@ eval:
             i = i + 1;
             goto loop1;
         }
+        SET_HDR(to, StgHeader_info(p), StgHeader_ccs(p));
         return();
     }
 
@@ -235,7 +236,6 @@ eval:
 
         ALLOCATE(compact, size, p, to, tag);
         P_[pp] = tag | to;
-        SET_HDR(to, StgHeader_info(p), StgHeader_ccs(p));
 
         // First, copy the non-pointers
         if (nptrs > 0) {
@@ -245,6 +245,7 @@ eval:
             i = i + 1;
             if (i < ptrs + nptrs) ( likely: True ) goto loop2;
         }
+        SET_HDR(to, StgHeader_info(p), StgHeader_ccs(p));
 
         // Next, recursively compact and copy the pointers
         if (ptrs == 0) { return(); }
diff --git a/rts/Interpreter.c b/rts/Interpreter.c
index e4b9d5696e46701075008c1c3453e6fa74a8cac9..b1a25d743a8cf2c0e2f42d47a39ca7102736f690 100644
--- a/rts/Interpreter.c
+++ b/rts/Interpreter.c
@@ -249,10 +249,11 @@ StgClosure * newEmptyPAP (Capability *cap,
                           uint32_t arity)
 {
     StgPAP *pap = (StgPAP *)allocate(cap, sizeofW(StgPAP));
-    SET_HDR(pap, &stg_PAP_info, cap->r.rCCCS);
     pap->arity = arity;
     pap->n_args = 0;
     pap->fun = tagged_obj;
+    write_barrier();
+    SET_HDR(pap, &stg_PAP_info, cap->r.rCCCS);
     return (StgClosure *)pap;
 }
 
@@ -266,7 +267,6 @@ StgClosure * copyPAP  (Capability *cap, StgPAP *oldpap)
     uint32_t size = PAP_sizeW(oldpap->n_args);
     StgPAP *pap = (StgPAP *)allocate(cap, size);
     enterFunCCS(&cap->r, oldpap->header.prof.ccs);
-    SET_HDR(pap, &stg_PAP_info, cap->r.rCCCS);
     pap->arity = oldpap->arity;
     pap->n_args = oldpap->n_args;
     pap->fun = oldpap->fun;
@@ -274,6 +274,8 @@ StgClosure * copyPAP  (Capability *cap, StgPAP *oldpap)
     for (i = 0; i < ((StgPAP *)pap)->n_args; i++) {
         pap->payload[i] = oldpap->payload[i];
     }
+    write_barrier();
+    SET_HDR(pap, &stg_PAP_info, cap->r.rCCCS);
     return (StgClosure *)pap;
 }
 
@@ -481,8 +483,9 @@ eval_obj:
         {
             StgUpdateFrame *__frame;
             __frame = (StgUpdateFrame *)Sp;
-            SET_INFO((StgClosure *)__frame, (StgInfoTable *)&stg_upd_frame_info);
             __frame->updatee = (StgClosure *)(ap);
+            write_barrier();
+            SET_INFO((StgClosure *)__frame, (StgInfoTable *)&stg_upd_frame_info);
         }
 
         ENTER_CCS_THUNK(cap,ap);
@@ -799,7 +802,6 @@ do_apply:
                 // build a new PAP and return it.
                 StgPAP *new_pap;
                 new_pap = (StgPAP *)allocate(cap, PAP_sizeW(pap->n_args + m));
-                SET_HDR(new_pap,&stg_PAP_info,cap->r.rCCCS);
                 new_pap->arity = pap->arity - n;
                 new_pap->n_args = pap->n_args + m;
                 new_pap->fun = pap->fun;
@@ -809,6 +811,8 @@ do_apply:
                 for (i = 0; i < m; i++) {
                     new_pap->payload[pap->n_args + i] = (StgClosure *)SpW(i);
                 }
+                write_barrier();
+                SET_HDR(new_pap,&stg_PAP_info,cap->r.rCCCS);
                 tagged_obj = (StgClosure *)new_pap;
                 Sp_addW(m);
                 goto do_return;
@@ -844,13 +848,14 @@ do_apply:
                 StgPAP *pap;
                 uint32_t i;
                 pap = (StgPAP *)allocate(cap, PAP_sizeW(m));
-                SET_HDR(pap, &stg_PAP_info,cap->r.rCCCS);
                 pap->arity = arity - n;
                 pap->fun = obj;
                 pap->n_args = m;
                 for (i = 0; i < m; i++) {
                     pap->payload[i] = (StgClosure *)SpW(i);
                 }
+                write_barrier();
+                SET_HDR(pap, &stg_PAP_info,cap->r.rCCCS);
                 tagged_obj = (StgClosure *)pap;
                 Sp_addW(m);
                 goto do_return;
@@ -1081,7 +1086,6 @@ run_BCO:
                   // the BCO
                   size_words = BCO_BITMAP_SIZE(obj) + 2;
                   new_aps = (StgAP_STACK *) allocate(cap, AP_STACK_sizeW(size_words));
-                  SET_HDR(new_aps,&stg_AP_STACK_info,cap->r.rCCCS);
                   new_aps->size = size_words;
                   new_aps->fun = &stg_dummy_ret_closure;
 
@@ -1095,6 +1099,9 @@ run_BCO:
                      new_aps->payload[i] = (StgClosure *)SpW(i-2);
                   }
 
+                  write_barrier();
+                  SET_HDR(new_aps,&stg_AP_STACK_info,cap->r.rCCCS);
+
                   // Arrange the stack to call the breakpoint IO action, and
                   // continue execution of this BCO when the IO action returns.
                   //
@@ -1421,9 +1428,10 @@ run_BCO:
             StgAP* ap;
             int n_payload = BCO_NEXT;
             ap = (StgAP*)allocate(cap, AP_sizeW(n_payload));
-            SpW(-1) = (W_)ap;
             ap->n_args = n_payload;
+            write_barrier();
             SET_HDR(ap, &stg_AP_info, cap->r.rCCCS)
+            SpW(-1) = (W_)ap;
             Sp_subW(1);
             goto nextInsn;
         }
@@ -1432,9 +1440,10 @@ run_BCO:
             StgAP* ap;
             int n_payload = BCO_NEXT;
             ap = (StgAP*)allocate(cap, AP_sizeW(n_payload));
-            SpW(-1) = (W_)ap;
             ap->n_args = n_payload;
+            write_barrier();
             SET_HDR(ap, &stg_AP_NOUPD_info, cap->r.rCCCS)
+            SpW(-1) = (W_)ap;
             Sp_subW(1);
             goto nextInsn;
         }
@@ -1444,10 +1453,11 @@ run_BCO:
             int arity = BCO_NEXT;
             int n_payload = BCO_NEXT;
             pap = (StgPAP*)allocate(cap, PAP_sizeW(n_payload));
-            SpW(-1) = (W_)pap;
             pap->n_args = n_payload;
             pap->arity = arity;
+            write_barrier();
             SET_HDR(pap, &stg_PAP_info, cap->r.rCCCS)
+            SpW(-1) = (W_)pap;
             Sp_subW(1);
             goto nextInsn;
         }
@@ -1518,16 +1528,18 @@ run_BCO:
             int o_itbl         = BCO_GET_LARGE_ARG;
             int n_words        = BCO_NEXT;
             StgInfoTable* itbl = INFO_PTR_TO_STRUCT((StgInfoTable *)BCO_LIT(o_itbl));
+            load_load_barrier();
             int request        = CONSTR_sizeW( itbl->layout.payload.ptrs,
                                                itbl->layout.payload.nptrs );
             StgClosure* con = (StgClosure*)allocate_NONUPD(cap,request);
             ASSERT( itbl->layout.payload.ptrs + itbl->layout.payload.nptrs > 0);
-            SET_HDR(con, (StgInfoTable*)BCO_LIT(o_itbl), cap->r.rCCCS);
             for (i = 0; i < n_words; i++) {
                 con->payload[i] = (StgClosure*)SpW(i);
             }
             Sp_addW(n_words);
             Sp_subW(1);
+            write_barrier();
+            SET_HDR(con, (StgInfoTable*)BCO_LIT(o_itbl), cap->r.rCCCS);
             SpW(0) = (W_)con;
             IF_DEBUG(interpreter,
                      debugBelch("\tBuilt ");
diff --git a/rts/Messages.c b/rts/Messages.c
index 2b13b6306cd48436aecc46278264cfeebf859e5d..b75f26575d53a12d18158d6146b6bb137db745c5 100644
--- a/rts/Messages.c
+++ b/rts/Messages.c
@@ -28,6 +28,7 @@ void sendMessage(Capability *from_cap, Capability *to_cap, Message *msg)
 #if defined(DEBUG)
     {
         const StgInfoTable *i = msg->header.info;
+        load_load_barrier();
         if (i != &stg_MSG_THROWTO_info &&
             i != &stg_MSG_BLACKHOLE_info &&
             i != &stg_MSG_TRY_WAKEUP_info &&
@@ -70,6 +71,7 @@ executeMessage (Capability *cap, Message *m)
 loop:
     write_barrier(); // allow m->header to be modified by another thread
     i = m->header.info;
+    load_load_barrier();
     if (i == &stg_MSG_TRY_WAKEUP_info)
     {
         StgTSO *tso = ((MessageWakeup *)m)->tso;
@@ -173,6 +175,7 @@ uint32_t messageBlackHole(Capability *cap, MessageBlackHole *msg)
                   "blackhole %p", (W_)msg->tso->id, msg->bh);
 
     info = bh->header.info;
+    load_load_barrier();
 
     // If we got this message in our inbox, it might be that the
     // BLACKHOLE has already been updated, and GC has shorted out the
@@ -196,6 +199,7 @@ loop:
     // and turns this into an infinite loop.
     p = UNTAG_CLOSURE((StgClosure*)VOLATILE_LOAD(&((StgInd*)bh)->indirectee));
     info = p->header.info;
+    load_load_barrier();
 
     if (info == &stg_IND_info)
     {
@@ -226,7 +230,6 @@ loop:
         bq = (StgBlockingQueue*)allocate(cap, sizeofW(StgBlockingQueue));
 
         // initialise the BLOCKING_QUEUE object
-        SET_HDR(bq, &stg_BLOCKING_QUEUE_DIRTY_info, CCS_SYSTEM);
         bq->bh = bh;
         bq->queue = msg;
         bq->owner = owner;
@@ -238,6 +241,8 @@ loop:
         // a collision to update a BLACKHOLE and a BLOCKING_QUEUE
         // becomes orphaned (see updateThunk()).
         bq->link = owner->bq;
+        write_barrier();
+        SET_HDR(bq, &stg_BLOCKING_QUEUE_DIRTY_info, CCS_SYSTEM);
         owner->bq = bq;
         dirty_TSO(cap, owner); // we modified owner->bq
 
@@ -289,6 +294,7 @@ loop:
         recordClosureMutated(cap,(StgClosure*)msg);
 
         if (info == &stg_BLOCKING_QUEUE_CLEAN_info) {
+            write_barrier();
             bq->header.info = &stg_BLOCKING_QUEUE_DIRTY_info;
             recordClosureMutated(cap,(StgClosure*)bq);
         }
@@ -319,6 +325,7 @@ StgTSO * blackHoleOwner (StgClosure *bh)
     StgClosure *p;
 
     info = bh->header.info;
+    load_load_barrier();
 
     if (info != &stg_BLACKHOLE_info &&
         info != &stg_CAF_BLACKHOLE_info &&
@@ -334,6 +341,7 @@ loop:
     // and turns this into an infinite loop.
     p = UNTAG_CLOSURE((StgClosure*)VOLATILE_LOAD(&((StgInd*)bh)->indirectee));
     info = p->header.info;
+    load_load_barrier();
 
     if (info == &stg_IND_info) goto loop;
 
diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm
index 47b9bca04b8d142c7b101fc48c98fc975d1eab56..ed5d5a23710913c47e94ef875993941fee7d4ac0 100644
--- a/rts/PrimOps.cmm
+++ b/rts/PrimOps.cmm
@@ -68,8 +68,9 @@ stg_newByteArrayzh ( W_ n )
         jump stg_raisezh(base_GHCziIOziException_heapOverflow_closure);
     }
     TICK_ALLOC_PRIM(SIZEOF_StgArrBytes,WDS(payload_words),0);
-    SET_HDR(p, stg_ARR_WORDS_info, CCCS);
     StgArrBytes_bytes(p) = n;
+    prim_write_barrier;
+    SET_HDR(p, stg_ARR_WORDS_info, CCCS);
     return (p);
 }
 
@@ -105,8 +106,9 @@ stg_newPinnedByteArrayzh ( W_ n )
        to BA_ALIGN bytes: */
     p = p + ((-p - SIZEOF_StgArrBytes) & BA_MASK);
 
-    SET_HDR(p, stg_ARR_WORDS_info, CCCS);
     StgArrBytes_bytes(p) = n;
+    prim_write_barrier;
+    SET_HDR(p, stg_ARR_WORDS_info, CCCS);
     return (p);
 }
 
@@ -147,8 +149,9 @@ stg_newAlignedPinnedByteArrayzh ( W_ n, W_ alignment )
        <alignment> is a power of 2, which is technically not guaranteed */
     p = p + ((-p - SIZEOF_StgArrBytes) & (alignment - 1));
 
-    SET_HDR(p, stg_ARR_WORDS_info, CCCS);
     StgArrBytes_bytes(p) = n;
+    prim_write_barrier;
+    SET_HDR(p, stg_ARR_WORDS_info, CCCS);
     return (p);
 }
 
@@ -257,7 +260,6 @@ stg_newArrayzh ( W_ n /* words */, gcptr init )
     }
     TICK_ALLOC_PRIM(SIZEOF_StgMutArrPtrs, WDS(size), 0);
 
-    SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, CCCS);
     StgMutArrPtrs_ptrs(arr) = n;
     StgMutArrPtrs_size(arr) = size;
 
@@ -270,6 +272,9 @@ stg_newArrayzh ( W_ n /* words */, gcptr init )
         goto for;
     }
 
+    prim_write_barrier;
+    SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, CCCS);
+
     return (arr);
 }
 
@@ -281,11 +286,13 @@ stg_unsafeThawArrayzh ( gcptr arr )
     // mut_list so no need to add it again. MUT_ARR_PTRS_FROZEN_CLEAN means it's
     // not and we should add it to a mut_list.
     if (StgHeader_info(arr) != stg_MUT_ARR_PTRS_FROZEN_DIRTY_info) {
+        prim_write_barrier; // see below:
         SET_INFO(arr,stg_MUT_ARR_PTRS_DIRTY_info);
         // must be done after SET_INFO, because it ASSERTs closure_MUTABLE():
         recordMutable(arr);
         return (arr);
     } else {
+        prim_write_barrier;
         SET_INFO(arr,stg_MUT_ARR_PTRS_DIRTY_info);
         return (arr);
     }
@@ -373,7 +380,6 @@ stg_newArrayArrayzh ( W_ n /* words */ )
     }
     TICK_ALLOC_PRIM(SIZEOF_StgMutArrPtrs, WDS(size), 0);
 
-    SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, W_[CCCS]);
     StgMutArrPtrs_ptrs(arr) = n;
     StgMutArrPtrs_size(arr) = size;
 
@@ -386,6 +392,9 @@ stg_newArrayArrayzh ( W_ n /* words */ )
         goto for;
     }
 
+    prim_write_barrier;
+    SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, W_[CCCS]);
+
     return (arr);
 }
 
@@ -408,7 +417,6 @@ stg_newSmallArrayzh ( W_ n /* words */, gcptr init )
     }
     TICK_ALLOC_PRIM(SIZEOF_StgSmallMutArrPtrs, WDS(n), 0);
 
-    SET_HDR(arr, stg_SMALL_MUT_ARR_PTRS_DIRTY_info, CCCS);
     StgSmallMutArrPtrs_ptrs(arr) = n;
 
     // Initialise all elements of the array with the value in R2
@@ -423,6 +431,9 @@ stg_newSmallArrayzh ( W_ n /* words */, gcptr init )
         goto for;
     }
 
+    prim_write_barrier;
+    SET_HDR(arr, stg_SMALL_MUT_ARR_PTRS_DIRTY_info, CCCS);
+
     return (arr);
 }
 
@@ -431,11 +442,13 @@ stg_unsafeThawSmallArrayzh ( gcptr arr )
     // See stg_unsafeThawArrayzh
     if (StgHeader_info(arr) != stg_SMALL_MUT_ARR_PTRS_FROZEN_DIRTY_info) {
         SET_INFO(arr, stg_SMALL_MUT_ARR_PTRS_DIRTY_info);
+        prim_write_barrier;
         recordMutable(arr);
         // must be done after SET_INFO, because it ASSERTs closure_MUTABLE()
         return (arr);
     } else {
         SET_INFO(arr, stg_SMALL_MUT_ARR_PTRS_DIRTY_info);
+        prim_write_barrier;
         return (arr);
     }
 }
@@ -465,13 +478,14 @@ stg_copySmallArrayzh ( gcptr src, W_ src_off, gcptr dst, W_ dst_off, W_ n)
 {
     W_ dst_p, src_p, bytes;
 
-    SET_INFO(dst, stg_SMALL_MUT_ARR_PTRS_DIRTY_info);
-
     dst_p = dst + SIZEOF_StgSmallMutArrPtrs + WDS(dst_off);
     src_p = src + SIZEOF_StgSmallMutArrPtrs + WDS(src_off);
     bytes = WDS(n);
     prim %memcpy(dst_p, src_p, bytes, SIZEOF_W);
 
+    prim_write_barrier;
+    SET_INFO(dst, stg_SMALL_MUT_ARR_PTRS_DIRTY_info);
+
     return ();
 }
 
@@ -479,8 +493,6 @@ stg_copySmallMutableArrayzh ( gcptr src, W_ src_off, gcptr dst, W_ dst_off, W_ n
 {
     W_ dst_p, src_p, bytes;
 
-    SET_INFO(dst, stg_SMALL_MUT_ARR_PTRS_DIRTY_info);
-
     dst_p = dst + SIZEOF_StgSmallMutArrPtrs + WDS(dst_off);
     src_p = src + SIZEOF_StgSmallMutArrPtrs + WDS(src_off);
     bytes = WDS(n);
@@ -490,6 +502,9 @@ stg_copySmallMutableArrayzh ( gcptr src, W_ src_off, gcptr dst, W_ dst_off, W_ n
         prim %memcpy(dst_p, src_p, bytes, SIZEOF_W);
     }
 
+    prim_write_barrier;
+    SET_INFO(dst, stg_SMALL_MUT_ARR_PTRS_DIRTY_info);
+
     return ();
 }
 
@@ -525,8 +540,9 @@ stg_newMutVarzh ( gcptr init )
     ALLOC_PRIM_P (SIZEOF_StgMutVar, stg_newMutVarzh, init);
 
     mv = Hp - SIZEOF_StgMutVar + WDS(1);
-    SET_HDR(mv,stg_MUT_VAR_DIRTY_info,CCCS);
     StgMutVar_var(mv) = init;
+    prim_write_barrier;
+    SET_HDR(mv,stg_MUT_VAR_DIRTY_info,CCCS);
 
     return (mv);
 }
@@ -609,16 +625,18 @@ stg_atomicModifyMutVar2zh ( gcptr mv, gcptr f )
     TICK_ALLOC_THUNK_2();
     CCCS_ALLOC(THUNK_2_SIZE);
     z = Hp - THUNK_2_SIZE + WDS(1);
-    SET_HDR(z, stg_ap_2_upd_info, CCCS);
     LDV_RECORD_CREATE(z);
     StgThunk_payload(z,0) = f;
+    prim_write_barrier;
+    SET_HDR(z, stg_ap_2_upd_info, CCCS);
 
     TICK_ALLOC_THUNK_1();
     CCCS_ALLOC(THUNK_1_SIZE);
     y = z - THUNK_1_SIZE;
-    SET_HDR(y, stg_sel_0_upd_info, CCCS);
     LDV_RECORD_CREATE(y);
     StgThunk_payload(y,0) = z;
+    prim_write_barrier;
+    SET_HDR(y, stg_sel_0_upd_info, CCCS);
 
   retry:
     x = StgMutVar_var(mv);
@@ -668,9 +686,10 @@ stg_atomicModifyMutVarzuzh ( gcptr mv, gcptr f )
     TICK_ALLOC_THUNK();
     CCCS_ALLOC(THUNK_SIZE);
     z = Hp - THUNK_SIZE + WDS(1);
-    SET_HDR(z, stg_ap_2_upd_info, CCCS);
     LDV_RECORD_CREATE(z);
     StgThunk_payload(z,0) = f;
+    prim_write_barrier;
+    SET_HDR(z, stg_ap_2_upd_info, CCCS);
 
   retry:
     x = StgMutVar_var(mv);
@@ -703,7 +722,6 @@ stg_mkWeakzh ( gcptr key,
     ALLOC_PRIM (SIZEOF_StgWeak)
 
     w = Hp - SIZEOF_StgWeak + WDS(1);
-    SET_HDR(w, stg_WEAK_info, CCCS);
 
     StgWeak_key(w)         = key;
     StgWeak_value(w)       = value;
@@ -711,6 +729,10 @@ stg_mkWeakzh ( gcptr key,
     StgWeak_cfinalizers(w) = stg_NO_FINALIZER_closure;
 
     StgWeak_link(w) = Capability_weak_ptr_list_hd(MyCapability());
+
+    prim_write_barrier;
+    SET_HDR(w, stg_WEAK_info, CCCS);
+
     Capability_weak_ptr_list_hd(MyCapability()) = w;
     if (Capability_weak_ptr_list_tl(MyCapability()) == NULL) {
         Capability_weak_ptr_list_tl(MyCapability()) = w;
@@ -737,13 +759,15 @@ stg_addCFinalizzerToWeakzh ( W_ fptr,   // finalizer
     ALLOC_PRIM (SIZEOF_StgCFinalizerList)
 
     c = Hp - SIZEOF_StgCFinalizerList + WDS(1);
-    SET_HDR(c, stg_C_FINALIZER_LIST_info, CCCS);
 
     StgCFinalizerList_fptr(c) = fptr;
     StgCFinalizerList_ptr(c) = ptr;
     StgCFinalizerList_eptr(c) = eptr;
     StgCFinalizerList_flag(c) = flag;
 
+    prim_write_barrier;
+    SET_HDR(c, stg_C_FINALIZER_LIST_info, CCCS);
+
     LOCK_CLOSURE(w, info);
 
     if (info == stg_DEAD_WEAK_info) {
@@ -818,6 +842,7 @@ stg_deRefWeakzh ( gcptr w )
     gcptr val;
 
     info = GET_INFO(w);
+    prim_read_barrier;
 
     if (info == stg_WHITEHOLE_info) {
         // w is locked by another thread. Now it's not immediately clear if w is
@@ -1389,11 +1414,13 @@ stg_readTVarzh (P_ tvar)
 
 stg_readTVarIOzh ( P_ tvar /* :: TVar a */ )
 {
-    W_ result;
+    W_ result, resultinfo;
 
 again:
     result = StgTVar_current_value(tvar);
-    if (%INFO_PTR(result) == stg_TREC_HEADER_info) {
+    resultinfo = %INFO_PTR(result);
+    prim_read_barrier;
+    if (resultinfo == stg_TREC_HEADER_info) {
         goto again;
     }
     return (result);
@@ -1462,11 +1489,12 @@ stg_newMVarzh ()
     ALLOC_PRIM_ (SIZEOF_StgMVar, stg_newMVarzh);
 
     mvar = Hp - SIZEOF_StgMVar + WDS(1);
-    SET_HDR(mvar,stg_MVAR_DIRTY_info,CCCS);
-        // MVARs start dirty: generation 0 has no mutable list
     StgMVar_head(mvar)  = stg_END_TSO_QUEUE_closure;
     StgMVar_tail(mvar)  = stg_END_TSO_QUEUE_closure;
     StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
+    prim_write_barrier;
+    SET_HDR(mvar,stg_MVAR_DIRTY_info,CCCS);
+        // MVARs start dirty: generation 0 has no mutable list
     return (mvar);
 }
 
@@ -1486,7 +1514,7 @@ stg_newMVarzh ()
 
 stg_takeMVarzh ( P_ mvar /* :: MVar a */ )
 {
-    W_ val, info, tso, q;
+    W_ val, info, tso, q, qinfo;
 
     LOCK_CLOSURE(mvar, info);
 
@@ -1508,10 +1536,12 @@ stg_takeMVarzh ( P_ mvar /* :: MVar a */ )
 
         q = Hp - SIZEOF_StgMVarTSOQueue + WDS(1);
 
-        SET_HDR(q, stg_MVAR_TSO_QUEUE_info, CCS_SYSTEM);
         StgMVarTSOQueue_link(q) = END_TSO_QUEUE;
         StgMVarTSOQueue_tso(q)  = CurrentTSO;
 
+        prim_write_barrier;
+        SET_HDR(q, stg_MVAR_TSO_QUEUE_info, CCS_SYSTEM);
+
         if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
             StgMVar_head(mvar) = q;
         } else {
@@ -1540,8 +1570,10 @@ loop:
         unlockClosure(mvar, info);
         return (val);
     }
-    if (StgHeader_info(q) == stg_IND_info ||
-        StgHeader_info(q) == stg_MSG_NULL_info) {
+    qinfo = StgHeader_info(q);
+    prim_read_barrier;
+    if (qinfo == stg_IND_info ||
+        qinfo == stg_MSG_NULL_info) {
         q = StgInd_indirectee(q);
         goto loop;
     }
@@ -1579,7 +1611,7 @@ loop:
 
 stg_tryTakeMVarzh ( P_ mvar /* :: MVar a */ )
 {
-    W_ val, info, tso, q;
+    W_ val, info, tso, q, qinfo;
 
     LOCK_CLOSURE(mvar, info);
 
@@ -1606,8 +1638,11 @@ loop:
         return (1, val);
     }
 
-    if (StgHeader_info(q) == stg_IND_info ||
-        StgHeader_info(q) == stg_MSG_NULL_info) {
+    qinfo = StgHeader_info(q);
+    prim_read_barrier;
+
+    if (qinfo == stg_IND_info ||
+        qinfo == stg_MSG_NULL_info) {
         q = StgInd_indirectee(q);
         goto loop;
     }
@@ -1646,7 +1681,7 @@ loop:
 stg_putMVarzh ( P_ mvar, /* :: MVar a */
                 P_ val,  /* :: a */ )
 {
-    W_ info, tso, q;
+    W_ info, tso, q, qinfo;
 
     LOCK_CLOSURE(mvar, info);
 
@@ -1666,10 +1701,12 @@ stg_putMVarzh ( P_ mvar, /* :: MVar a */
 
         q = Hp - SIZEOF_StgMVarTSOQueue + WDS(1);
 
-        SET_HDR(q, stg_MVAR_TSO_QUEUE_info, CCS_SYSTEM);
         StgMVarTSOQueue_link(q) = END_TSO_QUEUE;
         StgMVarTSOQueue_tso(q)  = CurrentTSO;
 
+        prim_write_barrier;
+        SET_HDR(q, stg_MVAR_TSO_QUEUE_info, CCS_SYSTEM);
+
         if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
             StgMVar_head(mvar) = q;
         } else {
@@ -1696,8 +1733,12 @@ loop:
         unlockClosure(mvar, stg_MVAR_DIRTY_info);
         return ();
     }
-    if (StgHeader_info(q) == stg_IND_info ||
-        StgHeader_info(q) == stg_MSG_NULL_info) {
+
+    qinfo = StgHeader_info(q);
+    prim_read_barrier;
+
+    if (qinfo == stg_IND_info ||
+        qinfo == stg_MSG_NULL_info) {
         q = StgInd_indirectee(q);
         goto loop;
     }
@@ -1754,7 +1795,7 @@ loop:
 stg_tryPutMVarzh ( P_ mvar, /* :: MVar a */
                    P_ val,  /* :: a */ )
 {
-    W_ info, tso, q;
+    W_ info, tso, q, qinfo;
 
     LOCK_CLOSURE(mvar, info);
 
@@ -1777,8 +1818,12 @@ loop:
         unlockClosure(mvar, stg_MVAR_DIRTY_info);
         return (1);
     }
-    if (StgHeader_info(q) == stg_IND_info ||
-        StgHeader_info(q) == stg_MSG_NULL_info) {
+
+    qinfo = StgHeader_info(q);
+    prim_read_barrier;
+
+    if (qinfo == stg_IND_info ||
+        qinfo == stg_MSG_NULL_info) {
         q = StgInd_indirectee(q);
         goto loop;
     }
@@ -1849,10 +1894,12 @@ stg_readMVarzh ( P_ mvar, /* :: MVar a */ )
 
         // readMVars are pushed to the front of the queue, so
         // they get handled immediately
-        SET_HDR(q, stg_MVAR_TSO_QUEUE_info, CCS_SYSTEM);
         StgMVarTSOQueue_link(q) = StgMVar_head(mvar);
         StgMVarTSOQueue_tso(q)  = CurrentTSO;
 
+        prim_write_barrier;
+        SET_HDR(q, stg_MVAR_TSO_QUEUE_info, CCS_SYSTEM);
+
         StgTSO__link(CurrentTSO)       = q;
         StgTSO_block_info(CurrentTSO)  = mvar;
         StgTSO_why_blocked(CurrentTSO) = BlockedOnMVarRead::I16;
@@ -1915,9 +1962,10 @@ stg_makeStableNamezh ( P_ obj )
         // too complicated and doesn't buy us much. See D5342?id=18700.)
         ("ptr" sn_obj) = ccall allocate(MyCapability() "ptr",
                                         BYTES_TO_WDS(SIZEOF_StgStableName));
-        SET_HDR(sn_obj, stg_STABLE_NAME_info, CCCS);
         StgStableName_sn(sn_obj) = index;
         snEntry_sn_obj(W_[stable_name_table] + index*SIZEOF_snEntry) = sn_obj;
+        prim_write_barrier;
+        SET_HDR(sn_obj, stg_STABLE_NAME_info, CCCS);
     } else {
         sn_obj = snEntry_sn_obj(W_[stable_name_table] + index*SIZEOF_snEntry);
     }
@@ -1958,7 +2006,6 @@ stg_newBCOzh ( P_ instrs,
     ALLOC_PRIM (bytes);
 
     bco = Hp - bytes + WDS(1);
-    SET_HDR(bco, stg_BCO_info, CCS_MAIN);
 
     StgBCO_instrs(bco)     = instrs;
     StgBCO_literals(bco)   = literals;
@@ -1976,6 +2023,9 @@ for:
         goto for;
     }
 
+    prim_write_barrier;
+    SET_HDR(bco, stg_BCO_info, CCS_MAIN);
+
     return (bco);
 }
 
@@ -1994,11 +2044,13 @@ stg_mkApUpd0zh ( P_ bco )
     CCCS_ALLOC(SIZEOF_StgAP);
 
     ap = Hp - SIZEOF_StgAP + WDS(1);
-    SET_HDR(ap, stg_AP_info, CCS_MAIN);
 
     StgAP_n_args(ap) = HALF_W_(0);
     StgAP_fun(ap) = bco;
 
+    prim_write_barrier;
+    SET_HDR(ap, stg_AP_info, CCS_MAIN);
+
     return (ap);
 }
 
@@ -2006,6 +2058,7 @@ stg_unpackClosurezh ( P_ closure )
 {
     W_ info, ptrs, nptrs, p, ptrs_arr, dat_arr;
     info  = %GET_STD_INFO(UNTAG(closure));
+    prim_read_barrier;
 
     ptrs  = TO_W_(%INFO_PTRS(info));
     nptrs = TO_W_(%INFO_NPTRS(info));
@@ -2026,7 +2079,6 @@ stg_unpackClosurezh ( P_ closure )
     dat_arr = Hp - dat_arr_sz + WDS(1);
 
 
-    SET_HDR(dat_arr, stg_ARR_WORDS_info, CCCS);
     StgArrBytes_bytes(dat_arr) = WDS(len);
     p = 0;
 for:
@@ -2041,6 +2093,9 @@ for:
     // Follow the pointers
     ("ptr" ptrArray) = foreign "C" heap_view_closurePtrs(MyCapability() "ptr", clos "ptr");
 
+    prim_write_barrier;
+    SET_HDR(dat_arr, stg_ARR_WORDS_info, CCCS);
+
     return (info, dat_arr, ptrArray);
 }
 
@@ -2334,7 +2389,10 @@ stg_noDuplicatezh /* no arg list: explicit stack layout */
 
 stg_getApStackValzh ( P_ ap_stack, W_ offset )
 {
-   if (%INFO_PTR(UNTAG(ap_stack)) == stg_AP_STACK_info) {
+   W_ ap_stackinfo;
+   ap_stackinfo = %INFO_PTR(UNTAG(ap_stack));
+   prim_read_barrier;
+   if (ap_stackinfo == stg_AP_STACK_info) {
        return (1,StgAP_STACK_payload(ap_stack,offset));
    } else {
        return (0,ap_stack);
diff --git a/rts/RaiseAsync.c b/rts/RaiseAsync.c
index f58f9177c8c31fb14b984e9ee35827b7622b90bb..359718f9df8b9e6e991f8b561d2944b2ca456193 100644
--- a/rts/RaiseAsync.c
+++ b/rts/RaiseAsync.c
@@ -870,6 +870,7 @@ raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception,
                 ap->payload[i] = (StgClosure *)*sp++;
             }
 
+            write_barrier();
             SET_HDR(ap,&stg_AP_STACK_info,
                     ((StgClosure *)frame)->header.prof.ccs /* ToDo */);
             TICK_ALLOC_UP_THK(WDS(words+1),0);
@@ -921,6 +922,7 @@ raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception,
                 ap->payload[i] = (StgClosure *)*sp++;
             }
 
+            write_barrier();
             SET_HDR(ap,&stg_AP_STACK_NOUPD_info,stack->header.prof.ccs);
             TICK_ALLOC_SE_THK(WDS(words+1),0);
 
@@ -959,6 +961,7 @@ raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception,
             //
             raise = (StgThunk *)allocate(cap,sizeofW(StgThunk)+1);
             TICK_ALLOC_SE_THK(WDS(1),0);
+            write_barrier();
             SET_HDR(raise,&stg_raise_info,cf->header.prof.ccs);
             raise->payload[0] = exception;
 
@@ -1039,8 +1042,9 @@ raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception,
 
                 atomically = (StgThunk*)allocate(cap,sizeofW(StgThunk)+1);
                 TICK_ALLOC_SE_THK(1,0);
-                SET_HDR(atomically,&stg_atomically_info,af->header.prof.ccs);
                 atomically->payload[0] = af->code;
+                write_barrier();
+                SET_HDR(atomically,&stg_atomically_info,af->header.prof.ccs);
 
                 // discard stack up to and including the ATOMICALLY_FRAME
                 frame += sizeofW(StgAtomicallyFrame);
diff --git a/rts/RtsAPI.c b/rts/RtsAPI.c
index 26433ac209ba19aa13c1e6fd9d91899f254069db..1a2b7f0065b7bc1aef13baae43ee0e5c41016848 100644
--- a/rts/RtsAPI.c
+++ b/rts/RtsAPI.c
@@ -30,8 +30,9 @@ HaskellObj
 rts_mkChar (Capability *cap, HsChar c)
 {
   StgClosure *p = (StgClosure *)allocate(cap, CONSTR_sizeW(0,1));
-  SET_HDR(p, Czh_con_info, CCS_SYSTEM);
   p->payload[0]  = (StgClosure *)(StgWord)(StgChar)c;
+  write_barrier();
+  SET_HDR(p, Czh_con_info, CCS_SYSTEM);
   return p;
 }
 
@@ -39,8 +40,9 @@ HaskellObj
 rts_mkInt (Capability *cap, HsInt i)
 {
   StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
-  SET_HDR(p, Izh_con_info, CCS_SYSTEM);
   p->payload[0]  = (StgClosure *)(StgInt)i;
+  write_barrier();
+  SET_HDR(p, Izh_con_info, CCS_SYSTEM);
   return p;
 }
 
@@ -48,9 +50,10 @@ HaskellObj
 rts_mkInt8 (Capability *cap, HsInt8 i)
 {
   StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
-  SET_HDR(p, I8zh_con_info, CCS_SYSTEM);
   /* Make sure we mask out the bits above the lowest 8 */
   p->payload[0]  = (StgClosure *)(StgInt)i;
+  write_barrier();
+  SET_HDR(p, I8zh_con_info, CCS_SYSTEM);
   return p;
 }
 
@@ -58,9 +61,10 @@ HaskellObj
 rts_mkInt16 (Capability *cap, HsInt16 i)
 {
   StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
-  SET_HDR(p, I16zh_con_info, CCS_SYSTEM);
   /* Make sure we mask out the relevant bits */
   p->payload[0]  = (StgClosure *)(StgInt)i;
+  write_barrier();
+  SET_HDR(p, I16zh_con_info, CCS_SYSTEM);
   return p;
 }
 
@@ -68,8 +72,9 @@ HaskellObj
 rts_mkInt32 (Capability *cap, HsInt32 i)
 {
   StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
-  SET_HDR(p, I32zh_con_info, CCS_SYSTEM);
   p->payload[0]  = (StgClosure *)(StgInt)i;
+  write_barrier();
+  SET_HDR(p, I32zh_con_info, CCS_SYSTEM);
   return p;
 }
 
@@ -77,8 +82,9 @@ HaskellObj
 rts_mkInt64 (Capability *cap, HsInt64 i)
 {
   StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,2));
-  SET_HDR(p, I64zh_con_info, CCS_SYSTEM);
   ASSIGN_Int64((P_)&(p->payload[0]), i);
+  write_barrier();
+  SET_HDR(p, I64zh_con_info, CCS_SYSTEM);
   return p;
 }
 
@@ -86,8 +92,9 @@ HaskellObj
 rts_mkWord (Capability *cap, HsWord i)
 {
   StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
-  SET_HDR(p, Wzh_con_info, CCS_SYSTEM);
   p->payload[0]  = (StgClosure *)(StgWord)i;
+  write_barrier();
+  SET_HDR(p, Wzh_con_info, CCS_SYSTEM);
   return p;
 }
 
@@ -96,8 +103,9 @@ rts_mkWord8 (Capability *cap, HsWord8 w)
 {
   /* see rts_mkInt* comments */
   StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
-  SET_HDR(p, W8zh_con_info, CCS_SYSTEM);
   p->payload[0]  = (StgClosure *)(StgWord)(w & 0xff);
+  write_barrier();
+  SET_HDR(p, W8zh_con_info, CCS_SYSTEM);
   return p;
 }
 
@@ -106,8 +114,9 @@ rts_mkWord16 (Capability *cap, HsWord16 w)
 {
   /* see rts_mkInt* comments */
   StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
-  SET_HDR(p, W16zh_con_info, CCS_SYSTEM);
   p->payload[0]  = (StgClosure *)(StgWord)(w & 0xffff);
+  write_barrier();
+  SET_HDR(p, W16zh_con_info, CCS_SYSTEM);
   return p;
 }
 
@@ -116,8 +125,9 @@ rts_mkWord32 (Capability *cap, HsWord32 w)
 {
   /* see rts_mkInt* comments */
   StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
-  SET_HDR(p, W32zh_con_info, CCS_SYSTEM);
   p->payload[0]  = (StgClosure *)(StgWord)(w & 0xffffffff);
+  write_barrier();
+  SET_HDR(p, W32zh_con_info, CCS_SYSTEM);
   return p;
 }
 
@@ -126,8 +136,9 @@ rts_mkWord64 (Capability *cap, HsWord64 w)
 {
   StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,2));
   /* see mk_Int8 comment */
-  SET_HDR(p, W64zh_con_info, CCS_SYSTEM);
   ASSIGN_Word64((P_)&(p->payload[0]), w);
+  write_barrier();
+  SET_HDR(p, W64zh_con_info, CCS_SYSTEM);
   return p;
 }
 
@@ -136,8 +147,9 @@ HaskellObj
 rts_mkFloat (Capability *cap, HsFloat f)
 {
   StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,1));
-  SET_HDR(p, Fzh_con_info, CCS_SYSTEM);
   ASSIGN_FLT((P_)p->payload, (StgFloat)f);
+  write_barrier();
+  SET_HDR(p, Fzh_con_info, CCS_SYSTEM);
   return p;
 }
 
@@ -145,8 +157,9 @@ HaskellObj
 rts_mkDouble (Capability *cap, HsDouble d)
 {
   StgClosure *p = (StgClosure *)allocate(cap,CONSTR_sizeW(0,sizeofW(StgDouble)));
-  SET_HDR(p, Dzh_con_info, CCS_SYSTEM);
   ASSIGN_DBL((P_)p->payload, (StgDouble)d);
+  write_barrier();
+  SET_HDR(p, Dzh_con_info, CCS_SYSTEM);
   return p;
 }
 
@@ -154,8 +167,9 @@ HaskellObj
 rts_mkStablePtr (Capability *cap, HsStablePtr s)
 {
   StgClosure *p = (StgClosure *)allocate(cap,sizeofW(StgHeader)+1);
-  SET_HDR(p, StablePtr_con_info, CCS_SYSTEM);
   p->payload[0]  = (StgClosure *)s;
+  write_barrier();
+  SET_HDR(p, StablePtr_con_info, CCS_SYSTEM);
   return p;
 }
 
@@ -163,8 +177,9 @@ HaskellObj
 rts_mkPtr (Capability *cap, HsPtr a)
 {
   StgClosure *p = (StgClosure *)allocate(cap,sizeofW(StgHeader)+1);
-  SET_HDR(p, Ptr_con_info, CCS_SYSTEM);
   p->payload[0]  = (StgClosure *)a;
+  write_barrier();
+  SET_HDR(p, Ptr_con_info, CCS_SYSTEM);
   return p;
 }
 
@@ -172,8 +187,9 @@ HaskellObj
 rts_mkFunPtr (Capability *cap, HsFunPtr a)
 {
   StgClosure *p = (StgClosure *)allocate(cap,sizeofW(StgHeader)+1);
-  SET_HDR(p, FunPtr_con_info, CCS_SYSTEM);
   p->payload[0]  = (StgClosure *)a;
+  write_barrier();
+  SET_HDR(p, FunPtr_con_info, CCS_SYSTEM);
   return p;
 }
 
@@ -202,9 +218,10 @@ rts_apply (Capability *cap, HaskellObj f, HaskellObj arg)
     // Here we don't want to use CCS_SYSTEM, because it's a hidden cost centre,
     // and evaluating Haskell code under a hidden cost centre leads to
     // confusing profiling output. (#7753)
-    SET_HDR(ap, (StgInfoTable *)&stg_ap_2_upd_info, CCS_MAIN);
     ap->payload[0] = f;
     ap->payload[1] = arg;
+    write_barrier();
+    SET_HDR(ap, (StgInfoTable *)&stg_ap_2_upd_info, CCS_MAIN);
     return (StgClosure *)ap;
 }
 
diff --git a/rts/Sparks.c b/rts/Sparks.c
index bd5e120863054a41178c0af23696fb092d3269ff..4022691da2a10a8ff25434e3386c97fcb8183eb3 100644
--- a/rts/Sparks.c
+++ b/rts/Sparks.c
@@ -182,6 +182,7 @@ pruneSparkQueue (Capability *cap)
           traceEventSparkFizzle(cap);
       } else {
           info = spark->header.info;
+          load_load_barrier();
           if (IS_FORWARDING_PTR(info)) {
               tmp = (StgClosure*)UN_FORWARDING_PTR(info);
               /* if valuable work: shift inside the pool */
diff --git a/rts/StgMiscClosures.cmm b/rts/StgMiscClosures.cmm
index fdd9f1565eb592ee6ffc5633f9801830585d1656..f2aae9e2a5ccd600b4a6502e4128bc04239d181b 100644
--- a/rts/StgMiscClosures.cmm
+++ b/rts/StgMiscClosures.cmm
@@ -292,12 +292,14 @@ INFO_TABLE(stg_BLACKHOLE,1,0,BLACKHOLE,"BLACKHOLE","BLACKHOLE")
     TICK_ENT_DYN_IND(); /* tick */
 
 retry:
+    prim_read_barrier;
     p = StgInd_indirectee(node);
     if (GETTAG(p) != 0) {
         return (p);
     }
 
     info = StgHeader_info(p);
+    prim_read_barrier;
     if (info == stg_IND_info) {
         // This could happen, if e.g. we got a BLOCKING_QUEUE that has
         // just been replaced with an IND by another thread in
@@ -313,9 +315,10 @@ retry:
         ("ptr" msg) = ccall allocate(MyCapability() "ptr",
                                      BYTES_TO_WDS(SIZEOF_MessageBlackHole));
 
-        SET_HDR(msg, stg_MSG_BLACKHOLE_info, CCS_SYSTEM);
         MessageBlackHole_tso(msg) = CurrentTSO;
         MessageBlackHole_bh(msg) = node;
+        prim_write_barrier;
+        SET_HDR(msg, stg_MSG_BLACKHOLE_info, CCS_SYSTEM);
 
         (r) = ccall messageBlackHole(MyCapability() "ptr", msg "ptr");
 
@@ -375,6 +378,7 @@ INFO_TABLE(stg_WHITEHOLE, 0,0, WHITEHOLE, "WHITEHOLE", "WHITEHOLE")
 loop:
     // spin until the WHITEHOLE is updated
     info = StgHeader_info(node);
+    prim_read_barrier;
     if (info == stg_WHITEHOLE_info) {
 #if defined(PROF_SPIN)
         W_[whitehole_lockClosure_spin] =
diff --git a/rts/ThreadPaused.c b/rts/ThreadPaused.c
index a916891aa8cc37b29c2157de3e17e924adc2f7a6..ea8f298a89ce0361608a6fb4c772e8887f974985 100644
--- a/rts/ThreadPaused.c
+++ b/rts/ThreadPaused.c
@@ -195,6 +195,7 @@ threadPaused(Capability *cap, StgTSO *tso)
     const StgRetInfoTable *info;
     const StgInfoTable *bh_info;
     const StgInfoTable *cur_bh_info USED_IF_THREADS;
+    const StgInfoTable *frame_info;
     StgClosure *bh;
     StgPtr stack_end;
     uint32_t words_to_squeeze = 0;
@@ -220,13 +221,16 @@ threadPaused(Capability *cap, StgTSO *tso)
 
     while ((P_)frame < stack_end) {
         info = get_ret_itbl(frame);
+        load_load_barrier();
 
         switch (info->i.type) {
 
         case UPDATE_FRAME:
 
             // If we've already marked this frame, then stop here.
-            if (frame->header.info == (StgInfoTable *)&stg_marked_upd_frame_info) {
+            frame_info = frame->header.info;
+            load_load_barrier();
+            if (frame_info == (StgInfoTable *)&stg_marked_upd_frame_info) {
                 if (prev_was_update_frame) {
                     words_to_squeeze += sizeofW(StgUpdateFrame);
                     weight += weight_pending;
@@ -235,10 +239,12 @@ threadPaused(Capability *cap, StgTSO *tso)
                 goto end;
             }
 
+            write_barrier();
             SET_INFO(frame, (StgInfoTable *)&stg_marked_upd_frame_info);
 
             bh = ((StgUpdateFrame *)frame)->updatee;
             bh_info = bh->header.info;
+            load_load_barrier();
 
 #if defined(THREADED_RTS)
         retry:
diff --git a/rts/Threads.c b/rts/Threads.c
index 977635322d82defac33875606c1a1b5df974cbd9..2305f7b2273039a3db4b738061cb07435f08f8ea 100644
--- a/rts/Threads.c
+++ b/rts/Threads.c
@@ -82,14 +82,14 @@ createThread(Capability *cap, W_ size)
     stack_size = round_to_mblocks(size - sizeofW(StgTSO));
     stack = (StgStack *)allocate(cap, stack_size);
     TICK_ALLOC_STACK(stack_size);
-    SET_HDR(stack, &stg_STACK_info, cap->r.rCCCS);
     stack->stack_size   = stack_size - sizeofW(StgStack);
     stack->sp           = stack->stack + stack->stack_size;
     stack->dirty        = 1;
+    write_barrier();
+    SET_HDR(stack, &stg_STACK_info, cap->r.rCCCS);
 
     tso = (StgTSO *)allocate(cap, sizeofW(StgTSO));
     TICK_ALLOC_TSO();
-    SET_HDR(tso, &stg_TSO_info, CCS_SYSTEM);
 
     // Always start with the compiled code evaluator
     tso->what_next = ThreadRunGHC;
@@ -116,6 +116,9 @@ createThread(Capability *cap, W_ size)
     tso->prof.cccs = CCS_MAIN;
 #endif
 
+    write_barrier();
+    SET_HDR(tso, &stg_TSO_info, CCS_SYSTEM);
+
     // put a stop frame on the stack
     stack->sp -= sizeofW(StgStopFrame);
     SET_HDR((StgClosure*)stack->sp,
@@ -257,8 +260,9 @@ tryWakeupThread (Capability *cap, StgTSO *tso)
     {
         MessageWakeup *msg;
         msg = (MessageWakeup *)allocate(cap,sizeofW(MessageWakeup));
-        SET_HDR(msg, &stg_MSG_TRY_WAKEUP_info, CCS_SYSTEM);
         msg->tso = tso;
+        write_barrier();
+        SET_HDR(msg, &stg_MSG_TRY_WAKEUP_info, CCS_SYSTEM);
         sendMessage(cap, tso->cap, (Message*)msg);
         debugTraceCap(DEBUG_sched, cap, "message: try wakeup thread %ld on cap %d",
                       (W_)tso->id, tso->cap->no);
@@ -363,6 +367,7 @@ wakeBlockingQueue(Capability *cap, StgBlockingQueue *bq)
     for (msg = bq->queue; msg != (MessageBlackHole*)END_TSO_QUEUE;
          msg = msg->link) {
         i = msg->header.info;
+        load_load_barrier();
         if (i != &stg_IND_info) {
             ASSERT(i == &stg_MSG_BLACKHOLE_info);
             tryWakeupThread(cap,msg->tso);
@@ -384,6 +389,8 @@ checkBlockingQueues (Capability *cap, StgTSO *tso)
 {
     StgBlockingQueue *bq, *next;
     StgClosure *p;
+    const StgInfoTable *bqinfo;
+    const StgInfoTable *pinfo;
 
     debugTraceCap(DEBUG_sched, cap,
                   "collision occurred; checking blocking queues for thread %ld",
@@ -392,15 +399,18 @@ checkBlockingQueues (Capability *cap, StgTSO *tso)
     for (bq = tso->bq; bq != (StgBlockingQueue*)END_TSO_QUEUE; bq = next) {
         next = bq->link;
 
-        if (bq->header.info == &stg_IND_info) {
+        bqinfo = bq->header.info;
+        load_load_barrier();
+        if (bqinfo == &stg_IND_info) {
             // ToDo: could short it out right here, to avoid
             // traversing this IND multiple times.
             continue;
         }
 
         p = bq->bh;
-
-        if (p->header.info != &stg_BLACKHOLE_info ||
+        pinfo = p->header.info;
+        load_load_barrier();
+        if (pinfo != &stg_BLACKHOLE_info ||
             ((StgInd *)p)->indirectee != (StgClosure*)bq)
         {
             wakeBlockingQueue(cap,bq);
@@ -424,6 +434,7 @@ updateThunk (Capability *cap, StgTSO *tso, StgClosure *thunk, StgClosure *val)
     const StgInfoTable *i;
 
     i = thunk->header.info;
+    load_load_barrier();
     if (i != &stg_BLACKHOLE_info &&
         i != &stg_CAF_BLACKHOLE_info &&
         i != &__stg_EAGER_BLACKHOLE_info &&
@@ -444,6 +455,7 @@ updateThunk (Capability *cap, StgTSO *tso, StgClosure *thunk, StgClosure *val)
     }
 
     i = v->header.info;
+    load_load_barrier();
     if (i == &stg_TSO_info) {
         checkBlockingQueues(cap, tso);
         return;
@@ -597,12 +609,13 @@ threadStackOverflow (Capability *cap, StgTSO *tso)
     new_stack = (StgStack*) allocate(cap, chunk_size);
     cap->r.rCurrentTSO = NULL;
 
-    SET_HDR(new_stack, &stg_STACK_info, old_stack->header.prof.ccs);
     TICK_ALLOC_STACK(chunk_size);
 
     new_stack->dirty = 0; // begin clean, we'll mark it dirty below
     new_stack->stack_size = chunk_size - sizeofW(StgStack);
     new_stack->sp = new_stack->stack + new_stack->stack_size;
+    write_barrier();
+    SET_HDR(new_stack, &stg_STACK_info, old_stack->header.prof.ccs);
 
     tso->tot_stack_size += new_stack->stack_size;
 
@@ -651,8 +664,9 @@ threadStackOverflow (Capability *cap, StgTSO *tso)
         } else {
             new_stack->sp -= sizeofW(StgUnderflowFrame);
             frame = (StgUnderflowFrame*)new_stack->sp;
-            frame->info = &stg_stack_underflow_frame_info;
             frame->next_chunk  = old_stack;
+            write_barrier();
+            frame->info = &stg_stack_underflow_frame_info;
         }
 
         // copy the stack chunk between tso->sp and sp to
@@ -738,6 +752,7 @@ threadStackUnderflow (Capability *cap, StgTSO *tso)
 bool performTryPutMVar(Capability *cap, StgMVar *mvar, StgClosure *value)
 {
     const StgInfoTable *info;
+    const StgInfoTable *qinfo;
     StgMVarTSOQueue *q;
     StgTSO *tso;
 
@@ -752,6 +767,8 @@ bool performTryPutMVar(Capability *cap, StgMVar *mvar, StgClosure *value)
 
     q = mvar->head;
 loop:
+    qinfo = q->header.info;
+    load_load_barrier();
     if (q == (StgMVarTSOQueue*)&stg_END_TSO_QUEUE_closure) {
         /* No further takes, the MVar is now full. */
         if (info == &stg_MVAR_CLEAN_info) {
@@ -762,8 +779,8 @@ loop:
         unlockClosure((StgClosure*)mvar, &stg_MVAR_DIRTY_info);
         return true;
     }
-    if (q->header.info == &stg_IND_info ||
-        q->header.info == &stg_MSG_NULL_info) {
+    if (qinfo == &stg_IND_info ||
+        qinfo == &stg_MSG_NULL_info) {
         q = (StgMVarTSOQueue*)((StgInd*)q)->indirectee;
         goto loop;
     }
diff --git a/rts/TopHandler.c b/rts/TopHandler.c
index c0ac936b85ea5db318115711b2c9c1122e9ea740..d5175015e776b2ba92c7b930ac64857267efa427 100644
--- a/rts/TopHandler.c
+++ b/rts/TopHandler.c
@@ -29,6 +29,7 @@ StgTSO *getTopHandlerThread(void) {
     StgWeak *weak = (StgWeak*)deRefStablePtr(topHandlerPtr);
     RELEASE_LOCK(&m);
     const StgInfoTable *info = weak->header.info;
+    load_load_barrier();
     if (info == &stg_WEAK_info) {
         StgClosure *key = ((StgWeak*)weak)->key;
 
diff --git a/rts/Updates.h b/rts/Updates.h
index 1ba398bd354d35540faf8526e1dd8a38d2ed231d..c4cc05d63f61ac4dbf34ffe47da50b9804a4e68a 100644
--- a/rts/Updates.h
+++ b/rts/Updates.h
@@ -39,10 +39,10 @@
                  PROF_HDR_FIELDS(w_,ccs,p2)              \
                  p_ updatee
 
-
 #define updateWithIndirection(p1, p2, and_then) \
     W_ bd;                                                      \
                                                                 \
+    prim_write_barrier;                                         \
     OVERWRITING_CLOSURE(p1);                                    \
     StgInd_indirectee(p1) = p2;                                 \
     prim_write_barrier;                                         \
@@ -69,6 +69,7 @@ INLINE_HEADER void updateWithIndirection (Capability *cap,
     ASSERT( (P_)p1 != (P_)p2 );
     /* not necessarily true: ASSERT( !closure_IND(p1) ); */
     /* occurs in RaiseAsync.c:raiseAsync() */
+    write_barrier();
     OVERWRITING_CLOSURE(p1);
     ((StgInd *)p1)->indirectee = p2;
     write_barrier();
diff --git a/rts/Weak.c b/rts/Weak.c
index a322d822afccfe575cd073a8a535450e4c6c1032..d084f0daed677f27b587886ea2a2e8e4963b5a29 100644
--- a/rts/Weak.c
+++ b/rts/Weak.c
@@ -42,6 +42,7 @@ void
 runAllCFinalizers(StgWeak *list)
 {
     StgWeak *w;
+    const StgInfoTable *winfo;
     Task *task;
 
     task = myTask();
@@ -57,7 +58,9 @@ runAllCFinalizers(StgWeak *list)
         // If there's no major GC between the time that the finalizer for the
         // object from the oldest generation is manually called and shutdown
         // we end up running the same finalizer twice. See #7170.
-        if (w->header.info != &stg_DEAD_WEAK_info) {
+        winfo = w->header.info;
+        load_load_barrier();
+        if (winfo != &stg_DEAD_WEAK_info) {
             runCFinalizers((StgCFinalizerList *)w->cfinalizers);
         }
     }
@@ -126,6 +129,7 @@ scheduleFinalizers(Capability *cap, StgWeak *list)
         // there's a later call to finalizeWeak# on this weak pointer,
         // we don't run the finalizer again.
         SET_HDR(w, &stg_DEAD_WEAK_info, w->header.prof.ccs);
+        write_barrier();
     }
 
     n_finalizers = i;
@@ -138,7 +142,6 @@ scheduleFinalizers(Capability *cap, StgWeak *list)
     size = n + mutArrPtrsCardTableSize(n);
     arr = (StgMutArrPtrs *)allocate(cap, sizeofW(StgMutArrPtrs) + size);
     TICK_ALLOC_PRIM(sizeofW(StgMutArrPtrs), n, 0);
-    SET_HDR(arr, &stg_MUT_ARR_PTRS_FROZEN_CLEAN_info, CCS_SYSTEM);
     arr->ptrs = n;
     arr->size = size;
 
@@ -154,6 +157,9 @@ scheduleFinalizers(Capability *cap, StgWeak *list)
         arr->payload[i] = (StgClosure *)(W_)(-1);
     }
 
+    write_barrier();
+    SET_HDR(arr, &stg_MUT_ARR_PTRS_FROZEN_CLEAN_info, CCS_SYSTEM);
+
     t = createIOThread(cap,
                        RtsFlags.GcFlags.initialStkSize,
                        rts_apply(cap,
diff --git a/rts/sm/CNF.c b/rts/sm/CNF.c
index 8d0ebccaf3fd5d0a2cff77759e107afcc11e438b..1e4f8166829e9d883c67dfcc4c346f9809021b9a 100644
--- a/rts/sm/CNF.c
+++ b/rts/sm/CNF.c
@@ -373,7 +373,6 @@ compactNew (Capability *cap, StgWord size)
                                          ALLOCATE_NEW);
 
     self = firstBlockGetCompact(block);
-    SET_HDR((StgClosure*)self, &stg_COMPACT_NFDATA_CLEAN_info, CCS_SYSTEM);
     self->autoBlockW = aligned_size / sizeof(StgWord);
     self->nursery = block;
     self->last = block;
@@ -390,6 +389,9 @@ compactNew (Capability *cap, StgWord size)
 
     debugTrace(DEBUG_compact, "compactNew: size %" FMT_Word, size);
 
+    write_barrier();
+    SET_HDR((StgClosure*)self, &stg_COMPACT_NFDATA_CLEAN_info, CCS_SYSTEM);
+
     return self;
 }
 
@@ -542,8 +544,10 @@ insertCompactHash (Capability *cap,
                    StgClosure *p, StgClosure *to)
 {
     insertHashTable(str->hash, (StgWord)p, (const void*)to);
-    if (str->header.info == &stg_COMPACT_NFDATA_CLEAN_info) {
-        str->header.info = &stg_COMPACT_NFDATA_DIRTY_info;
+    const StgInfoTable *strinfo = str->header.info;
+    load_load_barrier();
+    if (strinfo == &stg_COMPACT_NFDATA_CLEAN_info) {
+        str->header.info = &stg_COMPACT_NFDATA_DIRTY_info;
         recordClosureMutated(cap, (StgClosure*)str);
     }
 }
@@ -686,6 +690,7 @@ verify_consistency_block (StgCompactNFData *str, StgCompactNFDataBlock *block)
         ASSERT(LOOKS_LIKE_CLOSURE_PTR(q));
 
         info = get_itbl(q);
+        load_load_barrier();
         switch (info->type) {
         case CONSTR_1_0:
             check_object_in_compact(str, UNTAG_CLOSURE(q->payload[0]));
@@ -925,6 +930,7 @@ fixup_block(StgCompactNFDataBlock *block, StgWord *fixup_table, uint32_t count)
     while (p < bd->free) {
         ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
         info = get_itbl((StgClosure*)p);
+        load_load_barrier();
 
         switch (info->type) {
         case CONSTR_1_0:
diff --git a/rts/sm/Compact.c b/rts/sm/Compact.c
index 3bfefa7ceb5674ab3a5b53976b21949483ebedc6..177803596227d488fea5ba4743496f0bd0803c5e 100644
--- a/rts/sm/Compact.c
+++ b/rts/sm/Compact.c
@@ -160,6 +160,7 @@ get_threaded_info( StgPtr p )
     StgWord q;
 
     q = (W_)GET_INFO(UNTAG_CLOSURE((StgClosure *)p));
+    load_load_barrier();
 
 loop:
     switch (GET_CLOSURE_TAG((StgClosure *)q))
@@ -361,6 +362,7 @@ thread_stack(StgPtr p, StgPtr stack_end)
 
             fun_info = FUN_INFO_PTR_TO_STRUCT((StgInfoTable *)UNTAG_CLOSURE((StgClosure *)
                            get_threaded_info((StgPtr)ret_fun->fun)));
+            load_load_barrier();
                  // *before* threading it!
             thread(&ret_fun->fun);
             p = thread_arg_block(fun_info, ret_fun->payload);
@@ -383,6 +385,7 @@ thread_PAP_payload (StgClosure *fun, StgClosure **payload, StgWord size)
 
     fun_info = FUN_INFO_PTR_TO_STRUCT((StgInfoTable *)UNTAG_CLOSURE((StgClosure *)
                         get_threaded_info((StgPtr)fun)));
+    load_load_barrier();
     ASSERT(fun_info->i.type != PAP);
 
     p = (StgPtr)payload;
@@ -550,6 +553,8 @@ update_fwd_large( bdescr *bd )
 static /* STATIC_INLINE */ StgPtr
 thread_obj (const StgInfoTable *info, StgPtr p)
 {
+    load_load_barrier();
+
     switch (info->type) {
     case THUNK_0_1:
         return p + sizeofW(StgThunk) + 1;
@@ -823,6 +828,7 @@ update_fwd_compact( bdescr *blocks )
             // definitely have enough room.  Also see bug #1147.
             iptr = get_threaded_info(p);
             info = INFO_PTR_TO_STRUCT((StgInfoTable *)UNTAG_CLOSURE((StgClosure *)iptr));
+            load_load_barrier();
 
             q = p;
 
diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c
index b4050512a6dd0140f422853372fa8e20a4c24caf..c0a31af7de6738b90cd0d4f7450e2b31afa46b17 100644
--- a/rts/sm/Evac.c
+++ b/rts/sm/Evac.c
@@ -111,6 +111,7 @@ copy_tag(StgClosure **p, const StgInfoTable *info,
     {
         const StgInfoTable *new_info;
         new_info = (const StgInfoTable *)cas((StgPtr)&src->header.info, (W_)info, MK_FORWARDING_PTR(to));
+        load_load_barrier();
         if (new_info != info) {
 #if defined(PROFILING)
             // We copied this object at the same time as another
@@ -129,8 +130,11 @@ copy_tag(StgClosure **p, const StgInfoTable *info,
         }
     }
 #else
-    src->header.info = (const StgInfoTable *)MK_FORWARDING_PTR(to);
+    // if somebody else reads the forwarding pointer, we better make
+    // sure there's a closure at the end of it.
+    write_barrier();
     *p = TAG_CLOSURE(tag,(StgClosure*)to);
+    src->header.info = (const StgInfoTable *)MK_FORWARDING_PTR(to);
 #endif
 
 #if defined(PROFILING)
@@ -205,6 +209,7 @@ spin:
     }
 #else
     info = (W_)src->header.info;
+    load_load_barrier();
 #endif
 
     to = alloc_for_copy(size_to_reserve, gen_no);
@@ -216,8 +221,8 @@ spin:
     }
 
     write_barrier();
-    src->header.info = (const StgInfoTable*)MK_FORWARDING_PTR(to);
     *p = (StgClosure *)to;
+    src->header.info = (const StgInfoTable*)MK_FORWARDING_PTR(to);
 
 #if defined(PROFILING)
     // We store the size of the just evacuated object in the LDV word so that
@@ -610,6 +615,7 @@ loop:
   gen_no = bd->dest_no;
 
   info = q->header.info;
+  load_load_barrier();
   if (IS_FORWARDING_PTR(info))
   {
     /* Already evacuated, just return the forwarding address.
@@ -720,11 +726,14 @@ loop:
       StgClosure *r;
       const StgInfoTable *i;
       r = ((StgInd*)q)->indirectee;
+      load_load_barrier();
       if (GET_CLOSURE_TAG(r) == 0) {
           i = r->header.info;
+          load_load_barrier();
           if (IS_FORWARDING_PTR(i)) {
               r = (StgClosure *)UN_FORWARDING_PTR(i);
               i = r->header.info;
+              load_load_barrier();
           }
           if (i == &stg_TSO_info
               || i == &stg_WHITEHOLE_info
@@ -917,6 +926,7 @@ evacuate_BLACKHOLE(StgClosure **p)
     }
     gen_no = bd->dest_no;
     info = q->header.info;
+    load_load_barrier();
     if (IS_FORWARDING_PTR(info))
     {
         StgClosure *e = (StgClosure*)UN_FORWARDING_PTR(info);
@@ -1099,6 +1109,7 @@ selector_chain:
             //     need the write-barrier stuff.
             //   - undo the chain we've built to point to p.
             SET_INFO((StgClosure *)p, (const StgInfoTable *)info_ptr);
+            write_barrier();
             *q = (StgClosure *)p;
             if (evac) evacuate(q);
             unchain_thunk_selectors(prev_thunk_selector, (StgClosure *)p);
@@ -1108,6 +1119,7 @@ selector_chain:
 #else
     // Save the real info pointer (NOTE: not the same as get_itbl()).
     info_ptr = (StgWord)p->header.info;
+    load_load_barrier();
     SET_INFO((StgClosure *)p,&stg_WHITEHOLE_info);
 #endif
 
@@ -1126,6 +1138,7 @@ selector_loop:
     // that evacuate() doesn't mind if it gets passed a to-space pointer.
 
     info = (StgInfoTable*)selectee->header.info;
+    load_load_barrier();
 
     if (IS_FORWARDING_PTR(info)) {
         // We don't follow pointers into to-space; the constructor
@@ -1135,6 +1148,7 @@ selector_loop:
     }
 
     info = INFO_PTR_TO_STRUCT(info);
+    load_load_barrier();
     switch (info->type) {
       case WHITEHOLE:
           goto bale_out; // about to be evacuated by another thread (or a loop).
@@ -1165,6 +1179,7 @@ selector_loop:
                   SET_INFO((StgClosure*)p, (StgInfoTable *)info_ptr);
                   OVERWRITING_CLOSURE((StgClosure*)p);
                   SET_INFO((StgClosure*)p, &stg_WHITEHOLE_info);
+                  write_barrier();
               }
 #endif
 
@@ -1178,6 +1193,7 @@ selector_loop:
               if (!IS_FORWARDING_PTR(info_ptr))
               {
                   info = INFO_PTR_TO_STRUCT((StgInfoTable *)info_ptr);
+                  load_load_barrier();
                   switch (info->type) {
                   case IND:
                   case IND_STATIC:
@@ -1229,9 +1245,11 @@ selector_loop:
           // indirection, as in evacuate().
           if (GET_CLOSURE_TAG(r) == 0) {
               i = r->header.info;
+              load_load_barrier();
               if (IS_FORWARDING_PTR(i)) {
                   r = (StgClosure *)UN_FORWARDING_PTR(i);
                   i = r->header.info;
+                  load_load_barrier();
               }
               if (i == &stg_TSO_info
                   || i == &stg_WHITEHOLE_info
diff --git a/rts/sm/GCAux.c b/rts/sm/GCAux.c
index 23ed3f0622f77ea3060d637c1cbbf06ed5bc6851..9d0709c8d0e7b5c589ae4775d515bef8dc54ecbc 100644
--- a/rts/sm/GCAux.c
+++ b/rts/sm/GCAux.c
@@ -76,6 +76,7 @@ isAlive(StgClosure *p)
     }
 
     info = q->header.info;
+    load_load_barrier();
 
     if (IS_FORWARDING_PTR(info)) {
         // alive!
@@ -83,6 +84,7 @@ isAlive(StgClosure *p)
     }
 
     info = INFO_PTR_TO_STRUCT(info);
+    load_load_barrier();
 
     switch (info->type) {
 
@@ -121,8 +123,10 @@ revertCAFs( void )
          c = (StgIndStatic *)c->static_link)
     {
         c = (StgIndStatic *)UNTAG_STATIC_LIST_PTR(c);
-        SET_INFO((StgClosure *)c, c->saved_info);
-        c->saved_info = NULL;
+        const StgInfoTable *saved_info = c->saved_info;
+        c->saved_info = NULL;
+        write_barrier();
+        SET_INFO((StgClosure *)c, saved_info);
         // could, but not necessary: c->static_link = NULL;
     }
     revertible_caf_list = (StgIndStatic*)END_OF_CAF_LIST;
diff --git a/rts/sm/MarkWeak.c b/rts/sm/MarkWeak.c
index d5982e2f64686cb8ba91445c96931053a0bd34b5..c6c345b6fb3ac1fe22bdfccdfe590eb237d906ff 100644
--- a/rts/sm/MarkWeak.c
+++ b/rts/sm/MarkWeak.c
@@ -234,16 +234,19 @@ static bool tidyWeakList(generation *gen)
     last_w = &gen->old_weak_ptr_list;
     for (w = gen->old_weak_ptr_list; w != NULL; w = next_w) {
 
+        // Load the raw info pointer for the DEAD_WEAK identity test;
+        // get_itbl() below gives the struct pointer for the type switch.
+        info = w->header.info;
+        load_load_barrier();
+
        /* There might be a DEAD_WEAK on the list if finalizeWeak# was
         * called on a live weak pointer object.  Just remove it.
         */
-        if (w->header.info == &stg_DEAD_WEAK_info) {
+        if (info == &stg_DEAD_WEAK_info) {
            next_w = w->link;
            *last_w = next_w;
            continue;
        }
 
        info = get_itbl((StgClosure *)w);
 
         case WEAK:
diff --git a/rts/sm/Sanity.c b/rts/sm/Sanity.c
index 8082b7e6d08978edd10a351de40b1d7f4d934a58..ff76f747c98780a4067bba54a1d20738c673d3d2 100644
--- a/rts/sm/Sanity.c
+++ b/rts/sm/Sanity.c
@@ -233,6 +233,7 @@ checkClosure( const StgClosure* p )
     p = UNTAG_CONST_CLOSURE(p);
 
     info = p->header.info;
+    load_load_barrier();
 
     if (IS_FORWARDING_PTR(info)) {
         barf("checkClosure: found EVACUATED closure %d", info->type);
@@ -243,6 +244,7 @@ checkClosure( const StgClosure* p )
 #endif
 
     info = INFO_PTR_TO_STRUCT(info);
+    load_load_barrier();
 
     switch (info->type) {
 
@@ -564,6 +566,7 @@ checkTSO(StgTSO *tso)
 
     next = tso->_link;
     info = (const StgInfoTable*) tso->_link->header.info;
+    load_load_barrier();
 
     ASSERT(next == END_TSO_QUEUE ||
            info == &stg_MVAR_TSO_QUEUE_info ||
diff --git a/rts/sm/Scav.c b/rts/sm/Scav.c
index 73a790e9eda2ca5e28735d250f5b30d0646d132e..a27158cd6708705e9f3e35edf63d8a181c02b522 100644
--- a/rts/sm/Scav.c
+++ b/rts/sm/Scav.c
@@ -187,6 +187,7 @@ scavenge_compact(StgCompactNFData *str)
                str, Bdescr((P_)str)->gen_no, str->totalW * sizeof(W_))
 
     gct->eager_promotion = saved_eager;
+    write_barrier();
     if (gct->failed_to_evac) {
         ((StgClosure *)str)->header.info = &stg_COMPACT_NFDATA_DIRTY_info;
     } else {
@@ -374,6 +375,7 @@ scavenge_thunk_srt(const StgInfoTable *info)
     if (!major_gc) return;
 
     thunk_info = itbl_to_thunk_itbl(info);
+    load_load_barrier();
     if (thunk_info->i.srt) {
         StgClosure *srt = (StgClosure*)GET_SRT(thunk_info);
         evacuate(&srt);
@@ -388,6 +390,7 @@ scavenge_fun_srt(const StgInfoTable *info)
     if (!major_gc) return;
 
     fun_info = itbl_to_fun_itbl(info);
+    load_load_barrier();
     if (fun_info->i.srt) {
         StgClosure *srt = (StgClosure*)GET_FUN_SRT(fun_info);
         evacuate(&srt);
@@ -431,7 +434,7 @@ scavenge_block (bdescr *bd)
   // time around the loop.
   while (p < bd->free || (bd == ws->todo_bd && p < ws->todo_free)) {
 
-      ASSERT(bd->link == NULL);
+    ASSERT(bd->link == NULL);
     ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
     info = get_itbl((StgClosure *)p);
 
@@ -450,6 +453,7 @@ scavenge_block (bdescr *bd)
         evacuate((StgClosure **)&mvar->value);
         gct->eager_promotion = saved_eager_promotion;
 
+        write_barrier();
         if (gct->failed_to_evac) {
             mvar->header.info = &stg_MVAR_DIRTY_info;
         } else {
@@ -467,6 +471,7 @@ scavenge_block (bdescr *bd)
         evacuate((StgClosure **)&tvar->first_watch_queue_entry);
         gct->eager_promotion = saved_eager_promotion;
 
+        write_barrier();
         if (gct->failed_to_evac) {
             tvar->header.info = &stg_TVAR_DIRTY_info;
         } else {
@@ -601,6 +606,7 @@ scavenge_block (bdescr *bd)
         evacuate(&((StgMutVar *)p)->var);
         gct->eager_promotion = saved_eager_promotion;
 
+        write_barrier();
         if (gct->failed_to_evac) {
             ((StgClosure *)q)->header.info = &stg_MUT_VAR_DIRTY_info;
         } else {
@@ -620,6 +626,7 @@ scavenge_block (bdescr *bd)
         evacuate((StgClosure**)&bq->link);
         gct->eager_promotion = saved_eager_promotion;
 
+        write_barrier();
         if (gct->failed_to_evac) {
             bq->header.info = &stg_BLOCKING_QUEUE_DIRTY_info;
         } else {
@@ -672,6 +679,7 @@ scavenge_block (bdescr *bd)
 
         p = scavenge_mut_arr_ptrs((StgMutArrPtrs*)p);
 
+        write_barrier();
         if (gct->failed_to_evac) {
             ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info;
         } else {
@@ -689,6 +697,7 @@ scavenge_block (bdescr *bd)
     {
         p = scavenge_mut_arr_ptrs((StgMutArrPtrs*)p);
 
+        write_barrier();
         if (gct->failed_to_evac) {
             ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_DIRTY_info;
         } else {
@@ -714,6 +723,7 @@ scavenge_block (bdescr *bd)
         }
         gct->eager_promotion = saved_eager_promotion;
 
+        write_barrier();
         if (gct->failed_to_evac) {
             ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_DIRTY_info;
         } else {
@@ -735,6 +745,7 @@ scavenge_block (bdescr *bd)
             evacuate((StgClosure **)p);
         }
 
+        write_barrier();
         if (gct->failed_to_evac) {
             ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_DIRTY_info;
         } else {
@@ -875,6 +886,7 @@ scavenge_mark_stack(void)
             evacuate((StgClosure **)&mvar->value);
             gct->eager_promotion = saved_eager_promotion;
 
+            write_barrier();
             if (gct->failed_to_evac) {
                 mvar->header.info = &stg_MVAR_DIRTY_info;
             } else {
@@ -891,6 +903,7 @@ scavenge_mark_stack(void)
             evacuate((StgClosure **)&tvar->first_watch_queue_entry);
             gct->eager_promotion = saved_eager_promotion;
 
+            write_barrier();
             if (gct->failed_to_evac) {
                 tvar->header.info = &stg_TVAR_DIRTY_info;
             } else {
@@ -997,6 +1010,7 @@ scavenge_mark_stack(void)
             evacuate(&((StgMutVar *)p)->var);
             gct->eager_promotion = saved_eager_promotion;
 
+            write_barrier();
             if (gct->failed_to_evac) {
                 ((StgClosure *)q)->header.info = &stg_MUT_VAR_DIRTY_info;
             } else {
@@ -1016,6 +1030,7 @@ scavenge_mark_stack(void)
             evacuate((StgClosure**)&bq->link);
             gct->eager_promotion = saved_eager_promotion;
 
+            write_barrier();
             if (gct->failed_to_evac) {
                 bq->header.info = &stg_BLOCKING_QUEUE_DIRTY_info;
             } else {
@@ -1064,6 +1079,7 @@ scavenge_mark_stack(void)
 
             scavenge_mut_arr_ptrs((StgMutArrPtrs *)p);
 
+            write_barrier();
             if (gct->failed_to_evac) {
                 ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info;
             } else {
@@ -1083,6 +1099,7 @@ scavenge_mark_stack(void)
 
             scavenge_mut_arr_ptrs((StgMutArrPtrs *)p);
 
+            write_barrier();
             if (gct->failed_to_evac) {
                 ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_DIRTY_info;
             } else {
@@ -1110,6 +1127,7 @@ scavenge_mark_stack(void)
             }
             gct->eager_promotion = saved_eager;
 
+            write_barrier();
             if (gct->failed_to_evac) {
                 ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_DIRTY_info;
             } else {
@@ -1131,6 +1149,7 @@ scavenge_mark_stack(void)
                 evacuate((StgClosure **)p);
             }
 
+            write_barrier();
             if (gct->failed_to_evac) {
                 ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_DIRTY_info;
             } else {
@@ -1237,6 +1256,7 @@ scavenge_one(StgPtr p)
         evacuate((StgClosure **)&mvar->value);
         gct->eager_promotion = saved_eager_promotion;
 
+        write_barrier();
         if (gct->failed_to_evac) {
             mvar->header.info = &stg_MVAR_DIRTY_info;
         } else {
@@ -1253,6 +1273,7 @@ scavenge_one(StgPtr p)
         evacuate((StgClosure **)&tvar->first_watch_queue_entry);
         gct->eager_promotion = saved_eager_promotion;
 
+        write_barrier();
         if (gct->failed_to_evac) {
             tvar->header.info = &stg_TVAR_DIRTY_info;
         } else {
@@ -1317,6 +1338,7 @@ scavenge_one(StgPtr p)
         evacuate(&((StgMutVar *)p)->var);
         gct->eager_promotion = saved_eager_promotion;
 
+        write_barrier();
         if (gct->failed_to_evac) {
             ((StgClosure *)q)->header.info = &stg_MUT_VAR_DIRTY_info;
         } else {
@@ -1336,6 +1358,7 @@ scavenge_one(StgPtr p)
         evacuate((StgClosure**)&bq->link);
         gct->eager_promotion = saved_eager_promotion;
 
+        write_barrier();
         if (gct->failed_to_evac) {
             bq->header.info = &stg_BLOCKING_QUEUE_DIRTY_info;
         } else {
@@ -1384,6 +1407,7 @@ scavenge_one(StgPtr p)
 
         scavenge_mut_arr_ptrs((StgMutArrPtrs *)p);
 
+        write_barrier();
         if (gct->failed_to_evac) {
             ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info;
         } else {
@@ -1401,6 +1425,7 @@ scavenge_one(StgPtr p)
         // follow everything
         scavenge_mut_arr_ptrs((StgMutArrPtrs *)p);
 
+        write_barrier();
         if (gct->failed_to_evac) {
             ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_FROZEN_DIRTY_info;
         } else {
@@ -1428,6 +1453,7 @@ scavenge_one(StgPtr p)
         }
         gct->eager_promotion = saved_eager;
 
+        write_barrier();
         if (gct->failed_to_evac) {
             ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_DIRTY_info;
         } else {
@@ -1449,6 +1475,7 @@ scavenge_one(StgPtr p)
             evacuate((StgClosure **)p);
         }
 
+        write_barrier();
         if (gct->failed_to_evac) {
             ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_DIRTY_info;
         } else {
@@ -1572,6 +1599,10 @@ scavenge_mutable_list(bdescr *bd, generation *gen)
     StgPtr p, q;
     uint32_t gen_no;
 
+#if defined(DEBUG)
+    const StgInfoTable *pinfo;
+#endif
+
     gen_no = gen->no;
     gct->evac_gen_no = gen_no;
     for (; bd != NULL; bd = bd->link) {
@@ -1599,9 +1630,11 @@ scavenge_mutable_list(bdescr *bd, generation *gen)
             case TREC_CHUNK:
                 mutlist_TREC_CHUNK++; break;
             case MUT_PRIM:
-                if (((StgClosure*)p)->header.info == &stg_TVAR_WATCH_QUEUE_info)
+                pinfo = ((StgClosure*)p)->header.info;
+                load_load_barrier();
+                if (pinfo == &stg_TVAR_WATCH_QUEUE_info)
                     mutlist_TVAR_WATCH_QUEUE++;
-                else if (((StgClosure*)p)->header.info == &stg_TREC_HEADER_info)
+                else if (pinfo == &stg_TREC_HEADER_info)
                     mutlist_TREC_HEADER++;
                 else
                     mutlist_OTHERS++;
@@ -1631,6 +1664,7 @@ scavenge_mutable_list(bdescr *bd, generation *gen)
 
                 scavenge_mut_arr_ptrs_marked((StgMutArrPtrs *)p);
 
+                write_barrier();
                 if (gct->failed_to_evac) {
                     ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info;
                 } else {
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index f889e2262beec19c89dbfefb7314dc8e161b7c23..34d103e52f4d8e1a74ed4106d96f9a9e2e813bb9 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -407,8 +407,9 @@ lockCAF (StgRegTable *reg, StgIndStatic *caf)
 
     // Allocate the blackhole indirection closure
     bh = (StgInd *)allocate(cap, sizeofW(*bh));
-    SET_HDR(bh, &stg_CAF_BLACKHOLE_info, caf->header.prof.ccs);
     bh->indirectee = (StgClosure *)cap->r.rCurrentTSO;
+    write_barrier();
+    SET_HDR(bh, &stg_CAF_BLACKHOLE_info, caf->header.prof.ccs);
 
     caf->indirectee = (StgClosure *)bh;
     write_barrier();
@@ -1081,6 +1082,8 @@ void
 dirty_MUT_VAR(StgRegTable *reg, StgClosure *p)
 {
     Capability *cap = regTableToCapability(reg);
+    // No barrier required here as no other heap object fields are read. See
+    // note [Heap memory barriers] in SMP.h.
     if (p->header.info == &stg_MUT_VAR_CLEAN_info) {
         p->header.info = &stg_MUT_VAR_DIRTY_info;
         recordClosureMutated(cap,p);
@@ -1090,6 +1093,8 @@ dirty_MUT_VAR(StgRegTable *reg, StgClosure *p)
 void
 dirty_TVAR(Capability *cap, StgTVar *p)
 {
+    // No barrier required here as no other heap object fields are read. See
+    // note [Heap memory barriers] in SMP.h.
     if (p->header.info == &stg_TVAR_CLEAN_info) {
         p->header.info = &stg_TVAR_DIRTY_info;
         recordClosureMutated(cap,(StgClosure*)p);