diff --git a/compiler/GHC/StgToCmm/Bind.hs b/compiler/GHC/StgToCmm/Bind.hs
index 094d6ff94e04e13defbb047afc53f482a63f9424..60857e294e6d72d0ae46286f7a4ee604776bf450 100644
--- a/compiler/GHC/StgToCmm/Bind.hs
+++ b/compiler/GHC/StgToCmm/Bind.hs
@@ -721,11 +721,19 @@ emitBlackHoleCode node = do
 
   when eager_blackholing $ do
     whenUpdRemSetEnabled $ emitUpdRemSetPushThunk node
-    emitStore (cmmOffsetW platform node (fixedHdrSizeW profile)) (currentTSOExpr platform)
+    emitAtomicStore platform MemOrderRelease
+        (cmmOffsetW platform node (fixedHdrSizeW profile))
+        (currentTSOExpr platform)
     -- See Note [Heap memory barriers] in SMP.h.
-    let w = wordWidth platform
-    emitPrimCall [] (MO_AtomicWrite w MemOrderRelease)
-        [node, CmmReg (CmmGlobal $ GlobalRegUse EagerBlackholeInfo $ bWord platform)]
+    emitAtomicStore platform MemOrderRelease
+        node
+        (CmmReg (CmmGlobal $ GlobalRegUse EagerBlackholeInfo $ bWord platform))
+
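+-- | Emit a store of the given value to the given address with the given
+-- memory ordering. The width of the store is taken from the Cmm type of
+-- the value being stored.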
+emitAtomicStore :: Platform -> MemoryOrdering -> CmmExpr -> CmmExpr -> FCode ()
+emitAtomicStore platform mord addr val =
+    emitPrimCall [] (MO_AtomicWrite w mord) [addr, val]
+  where
+    w = typeWidth $ cmmExprType platform val
 
 setupUpdate :: ClosureInfo -> LocalReg -> FCode () -> FCode ()
         -- Nota Bene: this function does not change Node (even if it's a CAF),
diff --git a/rts/Apply.cmm b/rts/Apply.cmm
index b338b4c387341156f9e5e7e8ffa65c968a9c32f0..e561f4409fd28bdec2968d85d0dbc23c126c24d4 100644
--- a/rts/Apply.cmm
+++ b/rts/Apply.cmm
@@ -108,7 +108,7 @@ again:
             IND,
             IND_STATIC:
         {
-            fun = StgInd_indirectee(fun);
+            fun = %acquire StgInd_indirectee(fun);
             goto again;
         }
         case BCO:
@@ -693,7 +693,7 @@ INFO_TABLE(stg_AP_STACK,/*special layout*/0,0,AP_STACK,"AP_STACK","AP_STACK")
   }
   // Can't add StgInd_indirectee(ap) to UpdRemSet here because the old value is
   // not reachable.
-  StgInd_indirectee(ap) = CurrentTSO;
+  %release StgInd_indirectee(ap) = CurrentTSO;
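+  // This release-store pairs with the %acquire load of the indirectee in
+  // stg_BLACKHOLE; see Note [Heap memory barriers] in SMP.h.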
   SET_INFO_RELEASE(ap, __stg_EAGER_BLACKHOLE_info);
 
   /* ensure there is at least AP_STACK_SPLIM words of headroom available
diff --git a/rts/Compact.cmm b/rts/Compact.cmm
index ecb694cf5c8e8250d29663c3314966bb0aa7395e..3a6444a710bed2aa3331102d5ffa1f44e047ceaa 100644
--- a/rts/Compact.cmm
+++ b/rts/Compact.cmm
@@ -100,7 +100,7 @@ eval:
 
     // Follow indirections:
     case IND, IND_STATIC: {
-        p = StgInd_indirectee(p);
+        p = %acquire StgInd_indirectee(p);
         goto eval;
     }
 
diff --git a/rts/Heap.c b/rts/Heap.c
index 0935b66c272936ffebdefeb0a03f9f03ea005f49..1ecf1b6e1456af1daa35c9c0e5d1e3e45764a43c 100644
--- a/rts/Heap.c
+++ b/rts/Heap.c
@@ -173,7 +173,7 @@ StgWord collect_pointers(StgClosure *closure, StgClosure *ptrs[]) {
         case IND:
         case IND_STATIC:
         case BLACKHOLE:
-            ptrs[nptrs++] = (StgClosure *)(((StgInd *)closure)->indirectee);
+            ptrs[nptrs++] = (StgClosure *) ACQUIRE_LOAD(&((StgInd *)closure)->indirectee);
             break;
 
         case MUT_ARR_PTRS_CLEAN:
diff --git a/rts/Interpreter.c b/rts/Interpreter.c
index 9530ff826ba47dc11884169c2b18f1d80a6723c7..1be9cf832cb8c46e3768b020e405bec51428f791 100644
--- a/rts/Interpreter.c
+++ b/rts/Interpreter.c
@@ -420,7 +420,7 @@ eval_obj:
     case IND:
     case IND_STATIC:
     {
-        tagged_obj = ((StgInd*)obj)->indirectee;
+        tagged_obj = ACQUIRE_LOAD(&((StgInd*)obj)->indirectee);
         goto eval_obj;
     }
 
diff --git a/rts/Messages.c b/rts/Messages.c
index 139f61c3c857d289092b33ae59e21ad3729a3740..50df0ab41fefb012d62f3b155e294bf03492832a 100644
--- a/rts/Messages.c
+++ b/rts/Messages.c
@@ -191,9 +191,6 @@ uint32_t messageBlackHole(Capability *cap, MessageBlackHole *msg)
     StgClosure *p;
     const StgInfoTable *info;
     do {
-        // If we are being called from stg_BLACKHOLE then TSAN won't know about the
-        // previous read barrier that makes the following access safe.
-        TSAN_ANNOTATE_BENIGN_RACE(&((StgInd*)bh)->indirectee, "messageBlackHole");
         p = UNTAG_CLOSURE(ACQUIRE_LOAD(&((StgInd*)bh)->indirectee));
         info = RELAXED_LOAD(&p->header.info);
     } while (info == &stg_IND_info);
@@ -291,7 +288,7 @@ uint32_t messageBlackHole(Capability *cap, MessageBlackHole *msg)
             // makes it into the update remembered set
             updateRemembSetPushClosure(cap, (StgClosure*)bq->queue);
         }
-        RELAXED_STORE(&msg->link, bq->queue);
+        msg->link = bq->queue;
         bq->queue = msg;
         // No barrier is necessary here: we are only exposing the
         // closure to the GC. See Note [Heap memory barriers] in SMP.h.
diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm
index bccb12a0538932ea8fcb24029ad83593dddce1b0..11117c0ed82def6f91e1a8a427b3ebf1490d6821 100644
--- a/rts/PrimOps.cmm
+++ b/rts/PrimOps.cmm
@@ -1753,7 +1753,7 @@ loop:
     qinfo = GET_INFO_ACQUIRE(q);
     if (qinfo == stg_IND_info ||
         qinfo == stg_MSG_NULL_info) {
-        q = StgInd_indirectee(q);
+        q = %acquire StgInd_indirectee(q);
         goto loop;
     }
 
@@ -1821,7 +1821,7 @@ loop:
 
     if (qinfo == stg_IND_info ||
         qinfo == stg_MSG_NULL_info) {
-        q = StgInd_indirectee(q);
+        q = %acquire StgInd_indirectee(q);
         goto loop;
     }
 
@@ -1923,7 +1923,7 @@ loop:
 
     if (qinfo == stg_IND_info ||
         qinfo == stg_MSG_NULL_info) {
-        q = StgInd_indirectee(q);
+        q = %acquire StgInd_indirectee(q);
         goto loop;
     }
 
@@ -2012,7 +2012,7 @@ loop:
 
     if (qinfo == stg_IND_info ||
         qinfo == stg_MSG_NULL_info) {
-        q = StgInd_indirectee(q);
+        q = %acquire StgInd_indirectee(q);
         goto loop;
     }
 
@@ -2293,7 +2293,7 @@ loop:
     //Possibly IND added by removeFromMVarBlockedQueue
     if (StgHeader_info(q) == stg_IND_info ||
         StgHeader_info(q) == stg_MSG_NULL_info) {
-        q = StgInd_indirectee(q);
+        q = %acquire StgInd_indirectee(q);
         goto loop;
     }
 
diff --git a/rts/StableName.c b/rts/StableName.c
index 5d4f2002ad92c21846ceb4e56dd95e4b15ac8d4a..15ecb57f65963afba9f1f4078c2e0fc53486b7b1 100644
--- a/rts/StableName.c
+++ b/rts/StableName.c
@@ -156,11 +156,11 @@ removeIndirections (StgClosure* p)
         switch (get_itbl(q)->type) {
         case IND:
         case IND_STATIC:
-            p = ((StgInd *)q)->indirectee;
+            p = ACQUIRE_LOAD(&((StgInd *)q)->indirectee);
             continue;
 
         case BLACKHOLE:
-            p = ((StgInd *)q)->indirectee;
+            p = ACQUIRE_LOAD(&((StgInd *)q)->indirectee);
             if (GET_CLOSURE_TAG(p) != 0) {
                 continue;
             } else {
diff --git a/rts/StgMiscClosures.cmm b/rts/StgMiscClosures.cmm
index ea9fe77ed55e20aca64f7a605859d18d65a461fc..bd899d27711e9834f99ebb5d2243b31d4cb70afd 100644
--- a/rts/StgMiscClosures.cmm
+++ b/rts/StgMiscClosures.cmm
@@ -520,8 +520,9 @@ INFO_TABLE(stg_IND,1,0,IND,"IND","IND")
     (P_ node)
 {
     TICK_ENT_DYN_IND(); /* tick */
-    ACQUIRE_FENCE;
-    node = UNTAG(StgInd_indirectee(node));
+    ACQUIRE_FENCE_ON(node + OFFSET_StgHeader_info);
+    node = %acquire StgInd_indirectee(node);
+    node = UNTAG(node);
     TICK_ENT_VIA_NODE();
     jump %GET_ENTRY(node) (node);
 }
@@ -529,8 +530,10 @@ INFO_TABLE(stg_IND,1,0,IND,"IND","IND")
     /* explicit stack */
 {
     TICK_ENT_DYN_IND(); /* tick */
-    ACQUIRE_FENCE;
-    R1 = UNTAG(StgInd_indirectee(R1));
+    ACQUIRE_FENCE_ON(R1 + OFFSET_StgHeader_info);
+    P_ p;
+    p = %acquire StgInd_indirectee(R1);
+    R1 = UNTAG(p);
     TICK_ENT_VIA_NODE();
     jump %GET_ENTRY(R1) [R1];
 }
@@ -540,8 +543,10 @@ INFO_TABLE(stg_IND_STATIC,1,0,IND_STATIC,"IND_STATIC","IND_STATIC")
     /* explicit stack */
 {
     TICK_ENT_STATIC_IND(); /* tick */
-    ACQUIRE_FENCE;
-    R1 = UNTAG(StgInd_indirectee(R1));
+    ACQUIRE_FENCE_ON(R1 + OFFSET_StgHeader_info);
+    P_ p;
+    p = %acquire StgInd_indirectee(R1);
+    R1 = UNTAG(p);
     TICK_ENT_VIA_NODE();
     jump %GET_ENTRY(R1) [R1];
 }
@@ -564,14 +569,11 @@ INFO_TABLE(stg_BLACKHOLE,1,0,BLACKHOLE,"BLACKHOLE","BLACKHOLE")
     TICK_ENT_DYN_IND(); /* tick */
 
 retry:
-#if defined(TSAN_ENABLED)
-    // See Note [ThreadSanitizer and fences]
-    W_ unused; unused = %acquire GET_INFO(node);
-#endif
     // Synchronizes with the release-store in updateWithIndirection.
     // See Note [Heap memory barriers] in SMP.h.
-    ACQUIRE_FENCE;
-    p = %relaxed StgInd_indirectee(node);
+    ACQUIRE_FENCE_ON(node + OFFSET_StgHeader_info);
+    p = %acquire StgInd_indirectee(node);
     if (GETTAG(p) != 0) {
         return (p);
     }
@@ -656,7 +658,7 @@ INFO_TABLE(stg_WHITEHOLE, 0,0, WHITEHOLE, "WHITEHOLE", "WHITEHOLE")
     i = 0;
 loop:
     // spin until the WHITEHOLE is updated
-    info = StgHeader_info(node);
+    info = %relaxed StgHeader_info(node);
     if (info == stg_WHITEHOLE_info) {
 #if defined(PROF_SPIN)
         W_[whitehole_lockClosure_spin] =
@@ -675,6 +677,7 @@ loop:
         // defined in CMM.
         goto loop;
     }
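+    // Synchronizes with the info-table write of the thread that unlocked the
+    // closure, ensuring that its writes to the payload are visible to us.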
+    ACQUIRE_FENCE_ON(node + OFFSET_StgHeader_info);
     jump %ENTRY_CODE(info) (node);
 #else
     ccall barf("WHITEHOLE object (%p) entered!", R1) never returns;
diff --git a/rts/ThreadPaused.c b/rts/ThreadPaused.c
index d350d3fbed83f93966fc4145e0b54afe5236b240..8e8d6051a65162bfc321236a6f5778281726033f 100644
--- a/rts/ThreadPaused.c
+++ b/rts/ThreadPaused.c
@@ -352,7 +352,7 @@ threadPaused(Capability *cap, StgTSO *tso)
             OVERWRITING_CLOSURE_SIZE(bh, closure_sizeW_(bh, INFO_PTR_TO_STRUCT(bh_info)));
 
             // The payload of the BLACKHOLE points to the TSO
-            ((StgInd *)bh)->indirectee = (StgClosure *)tso;
+            RELEASE_STORE(&((StgInd *)bh)->indirectee, (StgClosure *)tso);
             SET_INFO_RELEASE(bh,&stg_BLACKHOLE_info);
 
             // .. and we need a write barrier, since we just mutated the closure:
diff --git a/rts/Threads.c b/rts/Threads.c
index 031ac21247cc00434bdfd957a7685b607997b77d..46a70a11a8bf3a5cb68d26fc839f01c16aa87dff 100644
--- a/rts/Threads.c
+++ b/rts/Threads.c
@@ -437,7 +437,7 @@ checkBlockingQueues (Capability *cap, StgTSO *tso)
         p = UNTAG_CLOSURE(bq->bh);
         const StgInfoTable *pinfo = ACQUIRE_LOAD(&p->header.info);
         if (pinfo != &stg_BLACKHOLE_info ||
-            ((StgInd *)p)->indirectee != (StgClosure*)bq)
+            (RELAXED_LOAD(&((StgInd *)p)->indirectee) != (StgClosure*)bq))
         {
             wakeBlockingQueue(cap,bq);
         }
@@ -468,7 +468,7 @@ updateThunk (Capability *cap, StgTSO *tso, StgClosure *thunk, StgClosure *val)
         return;
     }
 
-    v = UNTAG_CLOSURE(((StgInd*)thunk)->indirectee);
+    v = UNTAG_CLOSURE(ACQUIRE_LOAD(&((StgInd*)thunk)->indirectee));
 
     updateWithIndirection(cap, thunk, val);
 
@@ -808,7 +808,7 @@ loop:
     qinfo = ACQUIRE_LOAD(&q->header.info);
     if (qinfo == &stg_IND_info ||
         qinfo == &stg_MSG_NULL_info) {
-        q = (StgMVarTSOQueue*)((StgInd*)q)->indirectee;
+        q = (StgMVarTSOQueue*) ACQUIRE_LOAD(&((StgInd*)q)->indirectee);
         goto loop;
     }
 
diff --git a/rts/Updates.cmm b/rts/Updates.cmm
index c1de06c5e4529a88bdf1ef55dd54d59dea12003a..d8c6d625b48597d41e07da4eb785e70c7eb70f8c 100644
--- a/rts/Updates.cmm
+++ b/rts/Updates.cmm
@@ -59,7 +59,7 @@ INFO_TABLE_RET ( stg_marked_upd_frame, UPDATE_FRAME,
     ASSERT(HpAlloc == 0); // Note [HpAlloc]
 
     // we know the closure is a BLACKHOLE
-    v = StgInd_indirectee(updatee);
+    v = %acquire StgInd_indirectee(updatee);
 
     if (GETTAG(v) != 0) (likely: False) {
         // updated by someone else: discard our value and use the
diff --git a/rts/Updates.h b/rts/Updates.h
index bcb3ea8cc4c0b8e260d780070982116d92bdf69b..58967c332131597ab932b043fd664b463ef5b315 100644
--- a/rts/Updates.h
+++ b/rts/Updates.h
@@ -261,6 +261,66 @@
  * `tso_1` and other blocked threads may be unblocked more quickly.
  *
  *
+ * Waking up blocking queues
+ * -------------------------
+ * As noted above, when a thread updates a `BLACKHOLE`'d thunk it may find that
+ * some threads have added themselves to the thunk's blocking queue. Naturally,
+ * we must ensure that these threads are woken up. However, this gets a bit
+ * subtle since multiple threads may have raced to enter the thunk.
+ *
+ * That is, we may end up in a situation like one of these (TODO audit):
+ *
+ * ### Race A
+ *
+ *     Thread 0                      Thread 1                     Thread 2
+ *     --------------------------    --------------------------   ----------------------
+ *     enter thnk
+ *                                   enter thnk
+ *     thnk.indirectee := tso_0
+ *                                   thnk.indirectee := tso_1
+ *     thnk.info := BLACKHOLE
+ *                                   thnk.info := BLACKHOLE
+ *                                                                enter, block on thnk
+ *                                                                send MSG_BLACKHOLE to tso_1->cap
+ *     finishes evaluation
+ *     thnk.indirectee := result
+ *                                   handle MSG_BLACKHOLE
+ *                                   add
+ *
+ * ### Race B
+ *
+ *     Thread 0                      Thread 1                     Thread 2
+ *     --------------------------    --------------------------   ----------------------
+ *     enter thnk
+ *                                   enter thnk
+ *     thnk.indirectee := tso_0
+ *                                   thnk.indirectee := tso_1
+ *     thnk.info := BLACKHOLE
+ *                                   thnk.info := BLACKHOLE
+ *                                                                enter, block on thnk
+ *                                                                send MSG_BLACKHOLE to tso_1->cap
+ *                                   handle MSG_BLACKHOLE
+ *                                   add
+ *     finishes evaluation
+ *     thnk.indirectee := result
+ *
+ * ### Race C
+ *
+ *     Thread 0                      Thread 1                     Thread 2
+ *     --------------------------    --------------------------   ----------------------
+ *     enter thnk
+ *                                   enter thnk
+ *     thnk.indirectee := tso_0
+ *     thnk.info := BLACKHOLE
+ *                                                                enter, block on thnk
+ *                                                                send MSG_BLACKHOLE to tso_0->cap
+ *     handle MSG_BLACKHOLE
+ *     thnk.indirectee := new BLOCKING_QUEUE
+ *
+ *                                   thnk.indirectee := tso_1
+ *                                   thnk.info := BLACKHOLE
+ *
+ *
  * Exception handling
  * ------------------
  * When an exception is thrown to a thread which is evaluating a thunk, it is
@@ -400,8 +460,8 @@
     }                                                           \
                                                                 \
     OVERWRITING_CLOSURE(p1);                                    \
-    %relaxed StgInd_indirectee(p1) = p2;                        \
-    SET_INFO_RELEASE(p1, stg_BLACKHOLE_info);                   \
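+    /* steps (U1) and (U2) of the thunk-update protocol;                     \
+       see Note [Heap memory barriers] in SMP.h */                           \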
+    %release StgInd_indirectee(p1) = p2;                        \
+    %release SET_INFO(p1, stg_BLACKHOLE_info);                  \
     LDV_RECORD_CREATE(p1);                                      \
     and_then;
 
diff --git a/rts/include/Cmm.h b/rts/include/Cmm.h
index a19e3629309f211e4922c090056597b6045fdada..0f054290e43d3e5d919c52f72d0978c7e3015c72 100644
--- a/rts/include/Cmm.h
+++ b/rts/include/Cmm.h
@@ -35,6 +35,7 @@
 #define CMINUSMINUS 1
 
 #include "ghcconfig.h"
+#include "rts/TSANUtils.h"
 
 /* -----------------------------------------------------------------------------
    Types
@@ -311,7 +312,7 @@
 #define ENTER(x) ENTER_(return,x)
 #endif
 
-#define ENTER_R1() ENTER_(RET_R1,R1)
+#define ENTER_R1() P_ _r1; _r1 = R1; ENTER_(RET_R1, _r1)
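+// N.B. the indirection case of ENTER_ performs an %acquire load whose result,
+// being the result of a Cmm prim call, must be placed in a local register;
+// hence ENTER_R1 copies R1 into a local first.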
 
 #define RET_R1(x) jump %ENTRY_CODE(Sp(0)) [R1]
 
@@ -326,7 +327,7 @@
     IND,                                                \
     IND_STATIC:                                         \
    {                                                    \
-      x = StgInd_indirectee(x);                         \
+      x = %acquire StgInd_indirectee(x);                \
       goto again;                                       \
    }                                                    \
   case                                                  \
@@ -446,9 +447,17 @@
    HP_CHK_P(bytes);                             \
    TICK_ALLOC_RTS(bytes);
 
+// Load a field out of a structure with relaxed ordering.
+#define RELAXED_LOAD_FIELD(fld, ptr) \
+    REP_##fld[(ptr) + OFFSET_##fld]
+
+// Load a field out of an StgClosure with relaxed ordering.
+#define RELAXED_LOAD_CLOSURE_FIELD(fld, ptr) \
+    REP_##fld[(ptr) + SIZEOF_StgHeader + OFFSET_##fld]
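+//
+// For example, CHECK_GC below uses RELAXED_LOAD_FIELD to read
+// generation_n_new_large_words.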
+
 #define CHECK_GC()                                                      \
   (bdescr_link(CurrentNursery) == NULL ||                               \
-   generation_n_new_large_words(W_[g0]) >= TO_W_(CLong[large_alloc_lim]))
+   RELAXED_LOAD_FIELD(generation_n_new_large_words, W_[g0]) >= TO_W_(CLong[large_alloc_lim]))
 
 // allocate() allocates from the nursery, so we check to see
 // whether the nursery is nearly empty in any function that uses
@@ -688,9 +697,13 @@
 #define RELEASE_FENCE prim %fence_release();
 #define ACQUIRE_FENCE prim %fence_acquire();
 
-// TODO
-#if 1
+#if defined(TSAN_ENABLED)
+// This may be less efficient than a fence, but TSAN can reason about it.
+#if WORD_SIZE_IN_BITS == 64
 #define ACQUIRE_FENCE_ON(x) if (1) { W_ tmp; (tmp) = prim %load_acquire64(x); }
+#elif WORD_SIZE_IN_BITS == 32
+#define ACQUIRE_FENCE_ON(x) if (1) { W_ tmp; (tmp) = prim %load_acquire32(x); }
+#endif
 #else
 #define ACQUIRE_FENCE_ON(x) ACQUIRE_FENCE
 #endif
diff --git a/rts/include/rts/TSANUtils.h b/rts/include/rts/TSANUtils.h
index 7052c05a55fecc18b11e130df091ff782147a377..68b223012e296d4e191d486a1070281d74a24c10 100644
--- a/rts/include/rts/TSANUtils.h
+++ b/rts/include/rts/TSANUtils.h
@@ -73,6 +73,7 @@
 #endif
 #endif
 
+#if !defined(CMINUSMINUS)
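+// N.B. this header is now included into Cmm sources via rts/include/Cmm.h;
+// everything below is only visible to C.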
 #if defined(TSAN_ENABLED)
 #if !defined(HAVE_C11_ATOMICS)
 #error TSAN cannot be enabled without C11 atomics support.
@@ -106,3 +107,4 @@ uint32_t ghc_tsan_atomic32_compare_exchange(uint32_t *ptr, uint32_t expected, ui
 uint16_t ghc_tsan_atomic16_compare_exchange(uint16_t *ptr, uint16_t expected, uint16_t new_value, int success_memorder, int failure_memorder);
 uint8_t ghc_tsan_atomic8_compare_exchange(uint8_t *ptr, uint8_t expected, uint8_t new_value, int success_memorder, int failure_memorder);
 
+#endif /* !CMINUSMINUS */
diff --git a/rts/include/stg/SMP.h b/rts/include/stg/SMP.h
index ecd3b49186e7904909c0d9b5aeee95e1ca54732b..897d19665999dfe3b3827d18f19bd90e263944c3 100644
--- a/rts/include/stg/SMP.h
+++ b/rts/include/stg/SMP.h
@@ -118,31 +118,40 @@ EXTERN_INLINE void busy_wait_nop(void);
  * stores which formed the new object are visible (e.g. stores are flushed from
  * cache and the relevant cachelines invalidated in other cores).
  *
- * To ensure this we must use memory barriers. Which barriers are required to
- * access a field depends upon the type of the field. In general, fields come
- * in three flavours:
- *
- *  * Mutable GC Pointers (C type StgClosure*, Cmm type StgPtr)
- *  * Immutable GC Pointers (C type MUT_FIELD StgClosure*, Cmm type StgPtr)
- *  * Non-pointers (C type StgWord, Cmm type StdWord)
+ * To ensure this we must issue memory barriers when accessing closures and
+ * their fields. Since reasoning about concurrent memory access with barriers
+ * tends to be subtle and platform dependent, it is more common to instead
+ * write programs in terms of an abstract memory model and let the compiler
+ * (GHC and the system's C compiler) worry about what barriers are needed to
+ * realize the requested semantics on the target system. GHC relies on the
+ * widely used C11 memory model for this.
+ *
+ * Also note that the majority of this Note is concerned only with mutation
+ * by the mutator. The GC is free to change nearly any field (which is
+ * necessary for a moving GC). Naturally, doing this safely requires care which
+ * we discuss in the "Barriers during GC" section below.
+ *
+ * Field access
+ * ------------
+ * Which barriers are required to access a field of a closure depends upon the
+ * identity of the field. In general, fields come in three flavours:
+ *
+ *  * Mutable GC Pointers (C type `StgClosure*`, Cmm type `StgPtr`)
+ *  * Immutable GC Pointers (C type `MUT_FIELD StgClosure*`, Cmm type `StgPtr`)
+ *  * Non-pointers (C type `StgWord`, Cmm type `StgWord`)
  *
  * Note that Addr# fields are *not* GC pointers and therefore are classified
- * as non-pointers. Responsibility for barriers lies with the party
- * dereferencing the pointer.
- *
- * Also note that we are only concerned with mutation by the mutator. The GC
- * is free to change nearly any field as this is necessary for a moving GC.
- * Naturally, doing this safely requires care which we discuss in section
- * below.
+ * as non-pointers. In this case responsibility for barriers lies with the
+ * party dereferencing the Addr#.
  *
  * Immutable pointer fields are those which the mutator cannot change after
  * an object is made visible on the heap. Most objects' fields are of this
  * flavour (e.g. all data constructor fields). As these fields are written
  * precisely once, no write barriers are needed on writes nor reads. This is
  * safe due to an argument hinging on causality: Consider an immutable field F
- * of an object O refers to object O'. Naturally, O' must have been visible to
- * the creator of O when O was constructed. Consequently, if O is visible to a
- * reader, O' must also be visible.
+ * of an object O which refers to object O'. Naturally, O' must have been
+ * visible to the creator of O when O was constructed. Consequently, if O is
+ * visible to a reader, O' must also be visible to the same reader.
  *
  * Mutable pointer fields are those which can be modified by the mutator. These
  * require a bit more care as they may break the causality argument given
@@ -151,6 +160,10 @@ EXTERN_INLINE void busy_wait_nop(void);
  * into F. Without explicit synchronization O' may not be visible to another
  * thread attempting to dereference F.
  *
+ * To ensure the visibility of the referent, writing to a mutable pointer field
+ * must be done via a release-store. Conversely, reading from such a field is
+ * done via an acquire-load.
+ *
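+ * For example, in C (a sketch; here `mv` is assumed to point to an
+ * StgMutVar):
+ *
+ *     // writer: publish the new referent; fields written before this
+ *     // release-store are visible to any reader that observes it
+ *     RELEASE_STORE(&mv->var, (StgClosure *) v);
+ *
+ *     // reader: pairs with the release-store above
+ *     StgClosure *v2 = ACQUIRE_LOAD(&mv->var);
+ *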
  * Mutable fields include:
  *
  *   - StgMutVar: var
@@ -163,64 +176,102 @@ EXTERN_INLINE void busy_wait_nop(void);
  *   - StgMutArrPtrs: payload
  *   - StgSmallMutArrPtrs: payload
  *   - StgThunk although this is a somewhat special case; see below
- *
- * Writing to a mutable pointer field must be done via a release-store.
- * Reading from such a field is done via an acquire-load.
+ *   - StgInd: indirectee
  *
  * Finally, non-pointer fields can be safely mutated without barriers as
- * they do not refer to other memory. Technically, concurrent accesses to
- * non-pointer fields still do need to be atomic in many cases to avoid torn
- * accesses. However, this is something that we generally avoid by locking
- * closures prior to mutating non-pointer fields (see Locking closures below).
- *
- * Note that MUT_VARs offer both synchronized and unsynchronized primops.
- * Consequently, in these cases there is a burden on the user to ensure that
- * synchronization is provided where necessary.
+ * they do not refer to other memory locations. Technically, concurrent
+ * accesses to non-pointer fields still do need to be atomic in many cases to
+ * avoid torn accesses. However, this is something that we generally avoid by
+ * locking closures prior to mutating non-pointer fields (see Locking closures
+ * below).
  *
  * Locking closures
  * ----------------
  * Several primops temporarily turn closures into WHITEHOLEs to ensure that
  * they have exclusive access (see SMPClosureOps.h:reallyLockClosure).
+ * These include,
+ *
+ *   - takeMVar#, tryTakeMVar#
+ *   - putMVar#, tryPutMVar#
+ *   - readMVar#, tryReadMVar#
+ *   - readIOPort#
+ *   - writeIOPort#
+ *   - addCFinalizerToWeak#
+ *   - finalizeWeak#
+ *   - deRefWeak#
+ *
  * Locking is done via an atomic exchange operation on the closure's info table
  * pointer with sequential consistency (although only acquire ordering is
- * needed). This acquire ensures that we synchronize with any previous thread
- * that had locked the closure. Consequently, it is important that we take great
- * care in examining the mutable fields of a lockable closure prior to having
- * locked it.
- *
- * Naturally, unlocking is done via a release-store to restore the closure's
- * original info table pointer.
+ * needed). Similarly, unlocking is done via an atomic exchange to restore
+ * the closure's original info table pointer (although here only release
+ * ordering is needed). This ensures that we synchronize with any previous
+ * thread that had locked the closure.
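+ *
+ * In C this looks roughly like the following (a simplified sketch; see
+ * SMPClosureOps.h:reallyLockClosure for the real definition):
+ *
+ *     // lock: spin until we manage to swap in WHITEHOLE
+ *     do {
+ *         info = xchg((StgPtr)&p->header.info, (StgWord)&stg_WHITEHOLE_info);
+ *     } while (info == (StgWord)&stg_WHITEHOLE_info);
+ *     // ... examine and mutate the payload ...
+ *     // unlock: restore the original info table pointer
+ *     unlockClosure(p, (const StgInfoTable *) info);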
  *
  * Thunks
  * ------
  * As noted above, thunks are a rather special (yet quite common) case. In
- * particular, they have the unique property of being updatable, transforming
- * from a thunk to an indirection. This transformation requires its own
- * synchronization protocol. In particular, we must ensure that a reader
- * examining a thunk being updated can see the indirectee. Consequently, a
- * thunk update (see rts/Updates.h) does the following:
+ * particular, they have the unique property of being updatable (that is, they
+ * can be transformed from a thunk into an indirection after evaluation). This
+ * transformation requires its own synchronization protocol to mediate the
+ * interaction between the updater and the reader. In particular, we
+ * must ensure that a reader examining a thunk being updated by another core
+ * can see the indirectee. Consequently, a thunk update (see rts/Updates.h)
+ * does the following:
+ *
+ *  U1. use a release-store to place the new indirectee into the thunk's
+ *      indirectee field
  *
- *  1. Use a relaxed-store to place the new indirectee into the thunk's
- *     indirectee field
- *  2. use a release-store to set the info table to stg_BLACKHOLE (which
- *     represents an indirection)
+ *  U2. use a release-store to set the info table to stg_BLACKHOLE (which
+ *      represents an indirection)
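+ *
+ * In Cmm (cf. updateWithIndirection in rts/Updates.h) this amounts to:
+ *
+ *     %release StgInd_indirectee(p1) = p2;        // (U1)
+ *     %release SET_INFO(p1, stg_BLACKHOLE_info);  // (U2)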
  *
  * Blackholing a thunk (either eagerly, by GHC.StgToCmm.Bind.emitBlackHoleCode,
  * or lazily, by ThreadPaused.c:threadPaused) is done similarly.
  *
- * Conversely, indirection entry (see the entry code of stg_BLACKHOLE, stg_IND,
- * and stg_IND_STATIC in rts/StgMiscClosure.cmm) does the following:
- *
- *  1. We jump into the entry code for, e.g., stg_BLACKHOLE; this of course
- *     implies that we have already read the thunk's info table pointer, which
- *     is done with a relaxed load.
- *  2. use an acquire-fence to ensure that our view on the thunk is
- *     up-to-date. This synchronizes with step (2) in the update
- *     procedure.
- *  3. relaxed-load the indirectee. Since thunks are updated at most
- *     once we know that the fence in the last step has given us
- *     an up-to-date view of the indirectee closure.
- *  4. enter the indirectee (or block if the indirectee is a TSO)
+ * Conversely, entering an indirection (see the entry code of stg_BLACKHOLE,
+ * stg_IND, and stg_IND_STATIC in rts/StgMiscClosure.cmm) does the
+ * following:
+ *
+ *  E1. jump into the entry code of the indirection (e.g. stg_BLACKHOLE);
+ *      this of course implies that we have already read the thunk's info table
+ *      pointer, which is done with a relaxed load.
+ *
+ *  E2. issue an acquire-fence to ensure that our view of the thunk is
+ *      up-to-date; this synchronizes with the release-store (U2) in the
+ *      update procedure
+ *
+ *  E3. acquire-load the indirectee. Since thunks are updated at most
+ *      once we know that the fence in the last step has given us
+ *      an up-to-date view of the indirectee closure.
+ *
+ *  E4. enter the indirectee (or block if the indirectee is a TSO)
+ *
+ * The release/acquire pair (U2)/(E2) is somewhat surprising but is necessary
+ * as the C11 memory model does not guarantee that the store (U1) is visible
+ * to (E3), despite (U1) preceding (U2) in program order (due to the relaxed
+ * ordering of (E1)). This is demonstrated by the following CppMem model:
+ *
+ *     int main() {
+ *       atomic_int x = 0;    // info table pointer
+ *       atomic_int y = 0;    // indirectee
+ *       {{{
+ *         {    // blackhole update
+ *           y.store(1, memory_order_release);                // U1
+ *           x.store(2, memory_order_release);                // U2
+ *         }
+ *       |||
+ *         {    // blackhole entry
+ *           r1=x.load(memory_order_relaxed).readsvalue(2);   // E1
+ *           //fence(memory_order_acquire);                   // E2
+ *           r2=y.load(memory_order_acquire);                 // E3
+ *         }
+ *       }}};
+ *       return 0;
+ *     }
+ *
+ * Under the C11 memory model this program admits an execution in which the
+ * indirectee load (E3) yields `r2=0`.
+ *
+ * Of course, this could also be addressed by strengthening the ordering of (E1)
+ * to acquire, but this would incur a significant cost on every closure entry
+ * (including non-blackholes).
  *
  * Other closures
  * --------------
@@ -328,6 +379,12 @@ EXTERN_INLINE void busy_wait_nop(void);
  * The work-stealing queue (WSDeque) also requires barriers; these are
  * documented in WSDeque.c.
  *
+ * Verifying memory ordering
+ * -------------------------
+ * To verify that GHC's RTS and the code produced by the compiler are free of
+ * data races we employ ThreadSanitizer. See Note [ThreadSanitizer] in
+ * TSANUtils.h for details on this facility.
+ *
  */
 
 /* ----------------------------------------------------------------------------
diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c
index 5c97d449628f45204dfc6fa8aa606a0df2ea6507..4eba066928314ecda90b644a1df502b60bfe27b5 100644
--- a/rts/sm/Evac.c
+++ b/rts/sm/Evac.c
@@ -1542,7 +1542,7 @@ selector_loop:
 bale_out:
     // We didn't manage to evaluate this thunk; restore the old info
     // pointer.  But don't forget: we still need to evacuate the thunk itself.
-    SET_INFO((StgClosure *)p, (const StgInfoTable *)info_ptr);
+    SET_INFO_RELAXED((StgClosure *)p, (const StgInfoTable *)info_ptr);
     // THREADED_RTS: we just unlocked the thunk, so another thread
     // might get in and update it.  copy() will lock it again and
     // check whether it was updated in the meantime.
diff --git a/rts/sm/NonMovingMark.c b/rts/sm/NonMovingMark.c
index 87dc4112f68bab099a2e659abd56d8d445ff7f2b..ab485f2d0c9b73af289ff9a67a356b3f20f8f1cd 100644
--- a/rts/sm/NonMovingMark.c
+++ b/rts/sm/NonMovingMark.c
@@ -688,8 +688,9 @@ void updateRemembSetPushThunkEager(Capability *cap,
     case IND:
     {
         StgInd *ind = (StgInd *) thunk;
-        if (check_in_nonmoving_heap(ind->indirectee)) {
-            push_closure(queue, ind->indirectee, NULL);
+        StgClosure *indirectee = ACQUIRE_LOAD(&ind->indirectee);
+        if (check_in_nonmoving_heap(indirectee)) {
+            push_closure(queue, indirectee, NULL);
         }
         break;
     }
@@ -1587,7 +1588,7 @@ mark_closure (MarkQueue *queue, const StgClosure *p0, StgClosure **origin)
         // Synchronizes with the release-store in updateWithIndirection.
         // See Note [Heap memory barriers] in SMP.h.
         StgInd *ind = (StgInd *) p;
-        ACQUIRE_FENCE();
+        ACQUIRE_FENCE_ON(&p->header.info);
         StgClosure *indirectee = RELAXED_LOAD(&ind->indirectee);
         markQueuePushClosure(queue, indirectee, &ind->indirectee);
         if (GET_CLOSURE_TAG(indirectee) == 0 || origin == NULL) {
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index 12be375692dc5da5dd6d4b18bb397c0b4d136da0..8aa52df00dee1b28adcb9808fa1a5152f060faf8 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -596,8 +596,6 @@ lockCAF (StgRegTable *reg, StgIndStatic *caf)
     bh->indirectee = (StgClosure *)cap->r.rCurrentTSO;
     SET_HDR(bh, &stg_CAF_BLACKHOLE_info, caf->header.prof.ccs);
 
-    // RELEASE ordering to ensure that above writes are visible before we
-    // introduce reference as CAF indirectee.
     RELEASE_STORE(&caf->indirectee, (StgClosure *) bh);
     SET_INFO_RELEASE((StgClosure*)caf, &stg_IND_STATIC_info);
 
diff --git a/utils/genapply/Main.hs b/utils/genapply/Main.hs
index fd5557cfedf3b10064eaa27cb4b5631bbfb25971..7575bb7ce9fd4e192699621b44c5e2147226bfec 100644
--- a/utils/genapply/Main.hs
+++ b/utils/genapply/Main.hs
@@ -783,7 +783,11 @@ genApply regstatus args =
         text "case IND,",
         text "     IND_STATIC: {",
         nest 4 (vcat [
-          text "R1 = StgInd_indirectee(R1);",
+          -- N.B. annoyingly the %acquire syntax must place its result in a local register
+          -- as it is a Cmm prim call node.
+          text "P_ p;",
+          text "p = %acquire StgInd_indirectee(R1);",
+          text "R1 = p;",
             -- An indirection node might contain a tagged pointer
           text "goto again;"
          ]),