diff --git a/compiler/GHC/StgToCmm/Bind.hs b/compiler/GHC/StgToCmm/Bind.hs
index 094d6ff94e04e13defbb047afc53f482a63f9424..60857e294e6d72d0ae46286f7a4ee604776bf450 100644
--- a/compiler/GHC/StgToCmm/Bind.hs
+++ b/compiler/GHC/StgToCmm/Bind.hs
@@ -721,11 +721,19 @@ emitBlackHoleCode node = do
   when eager_blackholing $ do
     whenUpdRemSetEnabled $ emitUpdRemSetPushThunk node
-    emitStore (cmmOffsetW platform node (fixedHdrSizeW profile)) (currentTSOExpr platform)
+    emitAtomicStore platform MemOrderRelease
+        (cmmOffsetW platform node (fixedHdrSizeW profile))
+        (currentTSOExpr platform)
     -- See Note [Heap memory barriers] in SMP.h.
-    let w = wordWidth platform
-    emitPrimCall [] (MO_AtomicWrite w MemOrderRelease)
-        [node, CmmReg (CmmGlobal $ GlobalRegUse EagerBlackholeInfo $ bWord platform)]
+    emitAtomicStore platform MemOrderRelease
+        node
+        (CmmReg (CmmGlobal $ GlobalRegUse EagerBlackholeInfo $ bWord platform))
+
+emitAtomicStore :: Platform -> MemoryOrdering -> CmmExpr -> CmmExpr -> FCode ()
+emitAtomicStore platform mord addr val =
+    emitPrimCall [] (MO_AtomicWrite w mord) [addr, val]
+  where
+    w = typeWidth $ cmmExprType platform val
 
 setupUpdate :: ClosureInfo -> LocalReg -> FCode () -> FCode ()
         -- Nota Bene: this function does not change Node (even if it's a CAF),
diff --git a/rts/Apply.cmm b/rts/Apply.cmm
index b338b4c387341156f9e5e7e8ffa65c968a9c32f0..e561f4409fd28bdec2968d85d0dbc23c126c24d4 100644
--- a/rts/Apply.cmm
+++ b/rts/Apply.cmm
@@ -108,7 +108,7 @@ again:
     IND,
     IND_STATIC:
     {
-        fun = StgInd_indirectee(fun);
+        fun = %acquire StgInd_indirectee(fun);
         goto again;
     }
     case BCO:
@@ -693,7 +693,7 @@ INFO_TABLE(stg_AP_STACK,/*special layout*/0,0,AP_STACK,"AP_STACK","AP_STACK")
     }
     // Can't add StgInd_indirectee(ap) to UpdRemSet here because the old value is
     // not reachable.
-    StgInd_indirectee(ap) = CurrentTSO;
+    %release StgInd_indirectee(ap) = CurrentTSO;
     SET_INFO_RELEASE(ap, __stg_EAGER_BLACKHOLE_info);
 
     /* ensure there is at least AP_STACK_SPLIM words of headroom available
diff --git a/rts/Compact.cmm b/rts/Compact.cmm
index ecb694cf5c8e8250d29663c3314966bb0aa7395e..3a6444a710bed2aa3331102d5ffa1f44e047ceaa 100644
--- a/rts/Compact.cmm
+++ b/rts/Compact.cmm
@@ -100,7 +100,7 @@ eval:
     // Follow indirections:
     case IND, IND_STATIC:
     {
-        p = StgInd_indirectee(p);
+        p = %acquire StgInd_indirectee(p);
         goto eval;
     }
diff --git a/rts/Heap.c b/rts/Heap.c
index 0935b66c272936ffebdefeb0a03f9f03ea005f49..1ecf1b6e1456af1daa35c9c0e5d1e3e45764a43c 100644
--- a/rts/Heap.c
+++ b/rts/Heap.c
@@ -173,7 +173,7 @@ StgWord collect_pointers(StgClosure *closure, StgClosure *ptrs[]) {
         case IND:
         case IND_STATIC:
         case BLACKHOLE:
-            ptrs[nptrs++] = (StgClosure *)(((StgInd *)closure)->indirectee);
+            ptrs[nptrs++] = (StgClosure *) ACQUIRE_LOAD(&((StgInd *)closure)->indirectee);
             break;
         case MUT_ARR_PTRS_CLEAN:
diff --git a/rts/Interpreter.c b/rts/Interpreter.c
index 9530ff826ba47dc11884169c2b18f1d80a6723c7..1be9cf832cb8c46e3768b020e405bec51428f791 100644
--- a/rts/Interpreter.c
+++ b/rts/Interpreter.c
@@ -420,7 +420,7 @@ eval_obj:
     case IND:
     case IND_STATIC:
     {
-        tagged_obj = ((StgInd*)obj)->indirectee;
+        tagged_obj = ACQUIRE_LOAD(&((StgInd*)obj)->indirectee);
         goto eval_obj;
     }
diff --git a/rts/Messages.c b/rts/Messages.c
index 139f61c3c857d289092b33ae59e21ad3729a3740..50df0ab41fefb012d62f3b155e294bf03492832a 100644
--- a/rts/Messages.c
+++ b/rts/Messages.c
@@ -191,9 +191,6 @@ uint32_t messageBlackHole(Capability *cap, MessageBlackHole *msg)
     StgClosure *p;
     const StgInfoTable *info;
     do {
-        // If we are being called from stg_BLACKHOLE then TSAN won't know about the
-        // previous read barrier that makes the following access safe.
-        TSAN_ANNOTATE_BENIGN_RACE(&((StgInd*)bh)->indirectee, "messageBlackHole");
         p = UNTAG_CLOSURE(ACQUIRE_LOAD(&((StgInd*)bh)->indirectee));
         info = RELAXED_LOAD(&p->header.info);
     } while (info == &stg_IND_info);
@@ -291,7 +288,7 @@ uint32_t messageBlackHole(Capability *cap, MessageBlackHole *msg)
             // makes it into the update remembered set
             updateRemembSetPushClosure(cap, (StgClosure*)bq->queue);
         }
-        RELAXED_STORE(&msg->link, bq->queue);
+        msg->link = bq->queue;
         bq->queue = msg;
         // No barrier is necessary here: we are only exposing the
         // closure to the GC. See Note [Heap memory barriers] in SMP.h.
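Every hunk in this patch applies the same discipline: reads of an StgInd's
indirectee become acquire-loads (ACQUIRE_LOAD in C, %acquire in Cmm) that pair
with the release-store performed when a thunk is updated or blackholed. As a
minimal sketch of the pairing in Cmm (condensed from updateWithIndirection in
rts/Updates.h and the stg_BLACKHOLE entry code changed below; `thunk` and
`result` are placeholder names):

    // updater: publish the indirectee, then the BLACKHOLE info pointer
    %release StgInd_indirectee(thunk) = result;
    %release SET_INFO(thunk, stg_BLACKHOLE_info);

    // reader: synchronize on the info pointer, then acquire-load the indirectee
    ACQUIRE_FENCE_ON(thunk + OFFSET_StgHeader_info);
    P_ p;
    p = %acquire StgInd_indirectee(thunk);

A reader whose acquire-load observes the updater's release-store is guaranteed
to also observe every store that constructed the result; the plain loads being
replaced made no such guarantee.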
diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm
index bccb12a0538932ea8fcb24029ad83593dddce1b0..11117c0ed82def6f91e1a8a427b3ebf1490d6821 100644
--- a/rts/PrimOps.cmm
+++ b/rts/PrimOps.cmm
@@ -1753,7 +1753,7 @@ loop:
     qinfo = GET_INFO_ACQUIRE(q);
     if (qinfo == stg_IND_info ||
         qinfo == stg_MSG_NULL_info)
     {
-        q = StgInd_indirectee(q);
+        q = %acquire StgInd_indirectee(q);
         goto loop;
     }
@@ -1821,7 +1821,7 @@ loop:
     if (qinfo == stg_IND_info ||
         qinfo == stg_MSG_NULL_info)
     {
-        q = StgInd_indirectee(q);
+        q = %acquire StgInd_indirectee(q);
         goto loop;
     }
@@ -1923,7 +1923,7 @@ loop:
     if (qinfo == stg_IND_info ||
         qinfo == stg_MSG_NULL_info)
     {
-        q = StgInd_indirectee(q);
+        q = %acquire StgInd_indirectee(q);
         goto loop;
     }
@@ -2012,7 +2012,7 @@ loop:
     if (qinfo == stg_IND_info ||
         qinfo == stg_MSG_NULL_info)
     {
-        q = StgInd_indirectee(q);
+        q = %acquire StgInd_indirectee(q);
         goto loop;
     }
@@ -2293,7 +2293,7 @@ loop:
     //Possibly IND added by removeFromMVarBlockedQueue
     if (StgHeader_info(q) == stg_IND_info ||
         StgHeader_info(q) == stg_MSG_NULL_info)
     {
-        q = StgInd_indirectee(q);
+        q = %acquire StgInd_indirectee(q);
         goto loop;
     }
diff --git a/rts/StableName.c b/rts/StableName.c
index 5d4f2002ad92c21846ceb4e56dd95e4b15ac8d4a..15ecb57f65963afba9f1f4078c2e0fc53486b7b1 100644
--- a/rts/StableName.c
+++ b/rts/StableName.c
@@ -156,11 +156,11 @@ removeIndirections (StgClosure* p)
         switch (get_itbl(q)->type) {
             case IND:
             case IND_STATIC:
-                p = ((StgInd *)q)->indirectee;
+                p = ACQUIRE_LOAD(&((StgInd *)q)->indirectee);
                 continue;
 
             case BLACKHOLE:
-                p = ((StgInd *)q)->indirectee;
+                p = ACQUIRE_LOAD(&((StgInd *)q)->indirectee);
                 if (GET_CLOSURE_TAG(p) != 0) {
                     continue;
                 } else {
diff --git a/rts/StgMiscClosures.cmm b/rts/StgMiscClosures.cmm
index ea9fe77ed55e20aca64f7a605859d18d65a461fc..bd899d27711e9834f99ebb5d2243b31d4cb70afd 100644
--- a/rts/StgMiscClosures.cmm
+++ b/rts/StgMiscClosures.cmm
@@ -520,8 +520,9 @@ INFO_TABLE(stg_IND,1,0,IND,"IND","IND")
     (P_ node)
 {
     TICK_ENT_DYN_IND(); /* tick */
-    ACQUIRE_FENCE;
-    node = UNTAG(StgInd_indirectee(node));
+    ACQUIRE_FENCE_ON(node + OFFSET_StgHeader_info);
+    node = %acquire StgInd_indirectee(node);
+    node = UNTAG(node);
     TICK_ENT_VIA_NODE();
     jump %GET_ENTRY(node) (node);
 }
@@ -529,8 +530,10 @@ INFO_TABLE(stg_IND,1,0,IND,"IND","IND")
     /* explicit stack */
 {
     TICK_ENT_DYN_IND(); /* tick */
-    ACQUIRE_FENCE;
-    R1 = UNTAG(StgInd_indirectee(R1));
+    ACQUIRE_FENCE_ON(R1 + OFFSET_StgHeader_info);
+    P_ p;
+    p = %acquire StgInd_indirectee(R1);
+    R1 = UNTAG(p);
     TICK_ENT_VIA_NODE();
     jump %GET_ENTRY(R1) [R1];
 }
@@ -540,8 +543,10 @@ INFO_TABLE(stg_IND_STATIC,1,0,IND_STATIC,"IND_STATIC","IND_STATIC")
     /* explicit stack */
 {
     TICK_ENT_STATIC_IND(); /* tick */
-    ACQUIRE_FENCE;
-    R1 = UNTAG(StgInd_indirectee(R1));
+    ACQUIRE_FENCE_ON(R1 + OFFSET_StgHeader_info);
+    P_ p;
+    p = %acquire StgInd_indirectee(R1);
+    R1 = UNTAG(p);
     TICK_ENT_VIA_NODE();
     jump %GET_ENTRY(R1) [R1];
 }
@@ -564,14 +569,11 @@ INFO_TABLE(stg_BLACKHOLE,1,0,BLACKHOLE,"BLACKHOLE","BLACKHOLE")
     TICK_ENT_DYN_IND(); /* tick */
 
 retry:
-#if defined(TSAN_ENABLED)
-    // See Note [ThreadSanitizer and fences]
-    W_ unused; unused = %acquire GET_INFO(node);
-#endif
-    // Synchronizes with the release-store in updateWithIndirection.
+    // Synchronizes with the release-store in
+    // updateWithIndirection.
     // See Note [Heap memory barriers] in SMP.h.
-    ACQUIRE_FENCE;
-    p = %relaxed StgInd_indirectee(node);
+    ACQUIRE_FENCE_ON(node + OFFSET_StgHeader_info);
+    p = %acquire StgInd_indirectee(node);
     if (GETTAG(p) != 0) {
         return (p);
     }
@@ -656,7 +658,7 @@ INFO_TABLE(stg_WHITEHOLE, 0,0, WHITEHOLE, "WHITEHOLE", "WHITEHOLE")
     i = 0;
 loop:
     // spin until the WHITEHOLE is updated
-    info = StgHeader_info(node);
+    info = %relaxed StgHeader_info(node);
     if (info == stg_WHITEHOLE_info) {
 #if defined(PROF_SPIN)
         W_[whitehole_lockClosure_spin] =
@@ -675,6 +677,7 @@ loop:
         // defined in CMM.
         goto loop;
     }
+    ACQUIRE_FENCE_ON(node + OFFSET_StgHeader_info);
     jump %ENTRY_CODE(info) (node);
 #else
     ccall barf("WHITEHOLE object (%p) entered!", R1) never returns;
diff --git a/rts/ThreadPaused.c b/rts/ThreadPaused.c
index d350d3fbed83f93966fc4145e0b54afe5236b240..8e8d6051a65162bfc321236a6f5778281726033f 100644
--- a/rts/ThreadPaused.c
+++ b/rts/ThreadPaused.c
@@ -352,7 +352,7 @@ threadPaused(Capability *cap, StgTSO *tso)
             OVERWRITING_CLOSURE_SIZE(bh, closure_sizeW_(bh, INFO_PTR_TO_STRUCT(bh_info)));
 
             // The payload of the BLACKHOLE points to the TSO
-            ((StgInd *)bh)->indirectee = (StgClosure *)tso;
+            RELEASE_STORE(&((StgInd *)bh)->indirectee, (StgClosure *)tso);
             SET_INFO_RELEASE(bh,&stg_BLACKHOLE_info);
 
             // .. and we need a write barrier, since we just mutated the closure:
diff --git a/rts/Threads.c b/rts/Threads.c
index 031ac21247cc00434bdfd957a7685b607997b77d..46a70a11a8bf3a5cb68d26fc839f01c16aa87dff 100644
--- a/rts/Threads.c
+++ b/rts/Threads.c
@@ -437,7 +437,7 @@ checkBlockingQueues (Capability *cap, StgTSO *tso)
         p = UNTAG_CLOSURE(bq->bh);
         const StgInfoTable *pinfo = ACQUIRE_LOAD(&p->header.info);
         if (pinfo != &stg_BLACKHOLE_info ||
-            ((StgInd *)p)->indirectee != (StgClosure*)bq)
+            (RELAXED_LOAD(&((StgInd *)p)->indirectee) != (StgClosure*)bq))
         {
             wakeBlockingQueue(cap,bq);
         }
@@ -468,7 +468,7 @@ updateThunk (Capability *cap, StgTSO *tso, StgClosure *thunk, StgClosure *val)
         return;
     }
 
-    v = UNTAG_CLOSURE(((StgInd*)thunk)->indirectee);
+    v = UNTAG_CLOSURE(ACQUIRE_LOAD(&((StgInd*)thunk)->indirectee));
 
     updateWithIndirection(cap, thunk, val);
 
@@ -808,7 +808,7 @@ loop:
     qinfo = ACQUIRE_LOAD(&q->header.info);
     if (qinfo == &stg_IND_info ||
         qinfo == &stg_MSG_NULL_info)
     {
-        q = (StgMVarTSOQueue*)((StgInd*)q)->indirectee;
+        q = (StgMVarTSOQueue*) ACQUIRE_LOAD(&((StgInd*)q)->indirectee);
         goto loop;
     }
diff --git a/rts/Updates.cmm b/rts/Updates.cmm
index c1de06c5e4529a88bdf1ef55dd54d59dea12003a..d8c6d625b48597d41e07da4eb785e70c7eb70f8c 100644
--- a/rts/Updates.cmm
+++ b/rts/Updates.cmm
@@ -59,7 +59,7 @@ INFO_TABLE_RET ( stg_marked_upd_frame, UPDATE_FRAME,
     ASSERT(HpAlloc == 0); // Note [HpAlloc]
 
     // we know the closure is a BLACKHOLE
-    v = StgInd_indirectee(updatee);
+    v = %acquire StgInd_indirectee(updatee);
 
     if (GETTAG(v) != 0) (likely: False) {
         // updated by someone else: discard our value and use the
diff --git a/rts/Updates.h b/rts/Updates.h
index bcb3ea8cc4c0b8e260d780070982116d92bdf69b..58967c332131597ab932b043fd664b463ef5b315 100644
--- a/rts/Updates.h
+++ b/rts/Updates.h
@@ -261,6 +261,66 @@
  *   `tso_1` and other blocked threads may be unblocked more quickly.
  *
  *
+ * Waking up blocking queues
+ * -------------------------
+ * As noted above, when a thread updates a `BLACKHOLE`'d thunk it may find that
+ * some threads have added themselves to the thunk's blocking queue. Naturally,
+ * we must ensure that these threads are woken up. However, this gets a bit
+ * subtle since multiple threads may have raced to enter the thunk.
+ *
+ * That is, we may end up in a situation like one of these (TODO audit):
+ *
+ * ### Race A
+ *
+ * Thread 0                     Thread 1                     Thread 2
+ * --------------------------   --------------------------   ----------------------
+ * enter thnk
+ *                              enter thnk
+ * thnk.indirectee := tso_0
+ *                              thnk.indirectee := tso_1
+ * thnk.info := BLACKHOLE
+ *                              thnk.info := BLACKHOLE
+ *                                                           enter, block on thnk
+ *                                                           send MSG_BLACKHOLE to tso_1->cap
+ *                              finishes evaluation
+ *                              thnk.indirectee := result
+ *                              handle MSG_BLACKHOLE
+ *                              add msg to thnk's blocking queue
+ *
+ * ### Race B
+ *
+ * Thread 0                     Thread 1                     Thread 2
+ * --------------------------   --------------------------   ----------------------
+ * enter thnk
+ *                              enter thnk
+ * thnk.indirectee := tso_0
+ *                              thnk.indirectee := tso_1
+ * thnk.info := BLACKHOLE
+ *                              thnk.info := BLACKHOLE
+ *                                                           enter, block on thnk
+ *                                                           send MSG_BLACKHOLE to tso_1->cap
+ *                              handle MSG_BLACKHOLE
+ *                              add msg to thnk's blocking queue
+ *                              finishes evaluation
+ *                              thnk.indirectee := result
+ *
+ * ### Race C
+ *
+ * Thread 0                     Thread 1                     Thread 2
+ * --------------------------   --------------------------   ----------------------
+ * enter thnk
+ *                              enter thnk
+ * thnk.indirectee := tso_0
+ * thnk.info := BLACKHOLE
+ *                                                           enter, block on thnk
+ *                                                           send MSG_BLACKHOLE to tso_0->cap
+ * handle MSG_BLACKHOLE
+ * thnk.indirectee := new BLOCKING_QUEUE
+ *
+ *                              thnk.indirectee := tso_1
+ *                              thnk.info := BLACKHOLE
+ *
+ *
  * Exception handling
  * ------------------
  * When an exception is thrown to a thread which is evaluating a thunk, it is
@@ -400,8 +460,8 @@
     }                                                           \
                                                                 \
     OVERWRITING_CLOSURE(p1);                                    \
-    %relaxed StgInd_indirectee(p1) = p2;                        \
-    SET_INFO_RELEASE(p1, stg_BLACKHOLE_info);                   \
+    %release StgInd_indirectee(p1) = p2;                        \
+    %release SET_INFO(p1, stg_BLACKHOLE_info);                  \
     LDV_RECORD_CREATE(p1);                                      \
     and_then;
diff --git a/rts/include/Cmm.h b/rts/include/Cmm.h
index a19e3629309f211e4922c090056597b6045fdada..0f054290e43d3e5d919c52f72d0978c7e3015c72 100644
--- a/rts/include/Cmm.h
+++ b/rts/include/Cmm.h
@@ -35,6 +35,7 @@
 #define CMINUSMINUS 1
 
 #include "ghcconfig.h"
+#include "rts/TSANUtils.h"
 
 /* -----------------------------------------------------------------------------
    Types
@@ -311,7 +312,7 @@
 #define ENTER(x) ENTER_(return,x)
 #endif
 
-#define ENTER_R1() ENTER_(RET_R1,R1)
+#define ENTER_R1() P_ _r1; _r1 = R1; ENTER_(RET_R1, _r1)
 
 #define RET_R1(x) jump %ENTRY_CODE(Sp(0)) [R1]
 
@@ -326,7 +327,7 @@
   IND,                                          \
   IND_STATIC:                                   \
   {                                             \
-      x = StgInd_indirectee(x);                 \
+      x = %acquire StgInd_indirectee(x);        \
       goto again;                               \
   }                                             \
   case                                          \
@@ -446,9 +447,17 @@
     HP_CHK_P(bytes);                            \
     TICK_ALLOC_RTS(bytes);
 
+// Load a field out of a structure with relaxed ordering.
+#define RELAXED_LOAD_FIELD(fld, ptr) \
+    REP_##fld[(ptr) + OFFSET_##fld]
+
+// Load a field out of an StgClosure with relaxed ordering.
+#define RELAXED_LOAD_CLOSURE_FIELD(fld, ptr) \
+    REP_##fld[(ptr) + SIZEOF_StgHeader + OFFSET_##fld]
+
 #define CHECK_GC()                                                      \
   (bdescr_link(CurrentNursery) == NULL ||                               \
-   generation_n_new_large_words(W_[g0]) >= TO_W_(CLong[large_alloc_lim]))
+   RELAXED_LOAD_FIELD(generation_n_new_large_words, W_[g0]) >= TO_W_(CLong[large_alloc_lim]))
 
 // allocate() allocates from the nursery, so we check to see
 // whether the nursery is nearly empty in any function that uses
@@ -688,9 +697,13 @@
 #define RELEASE_FENCE prim %fence_release();
 #define ACQUIRE_FENCE prim %fence_acquire();
 
-// TODO
-#if 1
+#if TSAN_ENABLED
+// This may be less efficient than a fence, but TSAN can reason about it.
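+// For example, ACQUIRE_FENCE_ON(node + OFFSET_StgHeader_info) compiles to an
+// acquire-load of the info pointer whose result is discarded; unlike a bare
+// %fence_acquire, this lets TSAN attribute the ordering to a specific
+// location and pair it with the corresponding release-store.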
+#if WORD_SIZE_IN_BITS == 64
 #define ACQUIRE_FENCE_ON(x) if (1) { W_ tmp; (tmp) = prim %load_acquire64(x); }
+#elif WORD_SIZE_IN_BITS == 32
+#define ACQUIRE_FENCE_ON(x) if (1) { W_ tmp; (tmp) = prim %load_acquire32(x); }
+#endif
 #else
 #define ACQUIRE_FENCE_ON(x) ACQUIRE_FENCE
 #endif
diff --git a/rts/include/rts/TSANUtils.h b/rts/include/rts/TSANUtils.h
index 7052c05a55fecc18b11e130df091ff782147a377..68b223012e296d4e191d486a1070281d74a24c10 100644
--- a/rts/include/rts/TSANUtils.h
+++ b/rts/include/rts/TSANUtils.h
@@ -73,6 +73,7 @@
 #endif
 #endif
 
+#if !defined(CMINUSMINUS)
 #if defined(TSAN_ENABLED)
 #if !defined(HAVE_C11_ATOMICS)
 #error TSAN cannot be enabled without C11 atomics support.
@@ -106,3 +107,4 @@ uint32_t ghc_tsan_atomic32_compare_exchange(uint32_t *ptr, uint32_t expected, ui
 uint16_t ghc_tsan_atomic16_compare_exchange(uint16_t *ptr, uint16_t expected, uint16_t new_value, int success_memorder, int failure_memorder);
 uint8_t ghc_tsan_atomic8_compare_exchange(uint8_t *ptr, uint8_t expected, uint8_t new_value, int success_memorder, int failure_memorder);
+#endif
diff --git a/rts/include/stg/SMP.h b/rts/include/stg/SMP.h
index ecd3b49186e7904909c0d9b5aeee95e1ca54732b..897d19665999dfe3b3827d18f19bd90e263944c3 100644
--- a/rts/include/stg/SMP.h
+++ b/rts/include/stg/SMP.h
@@ -118,31 +118,40 @@ EXTERN_INLINE void busy_wait_nop(void);
  * stores which formed the new object are visible (e.g. stores are flushed from
  * cache and the relevant cachelines invalidated in other cores).
  *
- * To ensure this we must use memory barriers. Which barriers are required to
- * access a field depends upon the type of the field. In general, fields come
- * in three flavours:
- *
- *  * Mutable GC Pointers (C type StgClosure*, Cmm type StgPtr)
- *  * Immutable GC Pointers (C type MUT_FIELD StgClosure*, Cmm type StgPtr)
- *  * Non-pointers (C type StgWord, Cmm type StdWord)
+ * To ensure this we must issue memory barriers when accessing closures and
+ * their fields. Since reasoning about concurrent memory access with barriers
+ * tends to be subtle and platform-dependent, it is more common to instead
+ * write programs in terms of an abstract memory model and let the compiler
+ * (GHC and the system's C compiler) worry about what barriers are needed to
+ * realize the requested semantics on the target system. GHC relies on the
+ * widely used C11 memory model for this.
+ *
+ * Also note that the majority of this Note is concerned only with mutation
+ * by the mutator. The GC is free to change nearly any field (which is
+ * necessary for a moving GC). Naturally, doing this safely requires care,
+ * which we discuss in the "Barriers during GC" section below.
+ *
+ * Field access
+ * ------------
+ * Which barriers are required to access a field of a closure depends upon the
+ * identity of the field. In general, fields come in three flavours:
+ *
+ *  * Mutable GC Pointers (C type `StgClosure*`, Cmm type `StgPtr`)
+ *  * Immutable GC Pointers (C type `MUT_FIELD StgClosure*`, Cmm type `StgPtr`)
+ *  * Non-pointers (C type `StgWord`, Cmm type `StgWord`)
  *
  * Note that Addr# fields are *not* GC pointers and therefore are classified
- * as non-pointers. Responsibility for barriers lies with the party
- * dereferencing the pointer.
- *
- * Also note that we are only concerned with mutation by the mutator. The GC
- * is free to change nearly any field as this is necessary for a moving GC.
- * Naturally, doing this safely requires care which we discuss in section
- * below.
+ * as non-pointers.
+ * In this case responsibility for barriers lies with the party dereferencing
+ * the Addr#.
  *
  * Immutable pointer fields are those which the mutator cannot change after
  * an object is made visible on the heap. Most objects' fields are of this
  * flavour (e.g. all data constructor fields). As these fields are written
  * precisely once, no write barriers are needed on writes nor reads. This is
  * safe due to an argument hinging on causality: Consider an immutable field F
- * of an object O refers to object O'. Naturally, O' must have been visible to
- * the creator of O when O was constructed. Consequently, if O is visible to a
- * reader, O' must also be visible.
+ * of an object O which refers to object O'. Naturally, O' must have been
+ * visible to the creator of O when O was constructed. Consequently, if O is
+ * visible to a reader, O' must also be visible to the same reader.
  *
  * Mutable pointer fields are those which can be modified by the mutator. These
  * require a bit more care as they may break the causality argument given
@@ -151,6 +160,10 @@ EXTERN_INLINE void busy_wait_nop(void);
  * into F. Without explicit synchronization O' may not be visible to another
  * thread attempting to dereference F.
  *
+ * To ensure the visibility of the referent, writing to a mutable pointer field
+ * must be done via a release-store. Conversely, reading from such a field is
+ * done via an acquire-load.
+ *
  * Mutable fields include:
  *
  *  - StgMutVar: var
@@ -163,64 +176,102 @@ EXTERN_INLINE void busy_wait_nop(void);
  *  - StgMutArrPtrs: payload
  *  - StgSmallMutArrPtrs: payload
  *  - StgThunk although this is a somewhat special case; see below
- *
- * Writing to a mutable pointer field must be done via a release-store.
- * Reading from such a field is done via an acquire-load.
+ *  - StgInd: indirectee
  *
  * Finally, non-pointer fields can be safely mutated without barriers as
- * they do not refer to other memory. Technically, concurrent accesses to
- * non-pointer fields still do need to be atomic in many cases to avoid torn
- * accesses. However, this is something that we generally avoid by locking
- * closures prior to mutating non-pointer fields (see Locking closures below).
- *
- * Note that MUT_VARs offer both synchronized and unsynchronized primops.
- * Consequently, in these cases there is a burden on the user to ensure that
- * synchronization is provided where necessary.
+ * they do not refer to other memory locations. Technically, concurrent
+ * accesses to non-pointer fields still do need to be atomic in many cases to
+ * avoid torn accesses. However, this is something that we generally avoid by
+ * locking closures prior to mutating non-pointer fields (see Locking closures
+ * below).
  *
  * Locking closures
  * ----------------
  * Several primops temporarily turn closures into WHITEHOLEs to ensure that
  * they have exclusive access (see SMPClosureOps.h:reallyLockClosure).
+ * These include:
+ *
+ *  - takeMVar#, tryTakeMVar#
+ *  - putMVar#, tryPutMVar#
+ *  - readMVar#, tryReadMVar#
+ *  - readIOPort#
+ *  - writeIOPort#
+ *  - addCFinalizerToWeak#
+ *  - finalizeWeak#
+ *  - deRefWeak#
+ *
  * Locking is done via an atomic exchange operation on the closure's info table
  * pointer with sequential consistency (although only acquire ordering is
- * needed). This acquire ensures that we synchronize with any previous thread
- * that had locked the closure. Consequently, it is important that we take great
- * care in examining the mutable fields of a lockable closure prior to having
- * locked it.
- *
- * Naturally, unlocking is done via a release-store to restore the closure's
- * original info table pointer.
+ * needed). Similarly, unlocking is also done with an atomic exchange to
+ * restore the closure's original info table pointer (although this time only
+ * release ordering is needed). This ensures that we synchronize with any
+ * previous thread that had locked the closure.
  *
  * Thunks
  * ------
  * As noted above, thunks are a rather special (yet quite common) case. In
- * particular, they have the unique property of being updatable, transforming
- * from a thunk to an indirection. This transformation requires its own
- * synchronization protocol. In particular, we must ensure that a reader
- * examining a thunk being updated can see the indirectee. Consequently, a
- * thunk update (see rts/Updates.h) does the following:
+ * particular, they have the unique property of being updatable (that is, can
+ * be transformed from a thunk into an indirection after evaluation). This
+ * transformation requires its own synchronization protocol to mediate the
+ * interaction between the updater and the reader. In particular, we must
+ * ensure that a reader examining a thunk being updated by another core can
+ * see the indirectee. Consequently, a thunk update (see rts/Updates.h) does
+ * the following:
+ *
+ * U1. use a release-store to place the new indirectee into the thunk's
+ *     indirectee field
  *
- * 1. Use a relaxed-store to place the new indirectee into the thunk's
- *    indirectee field
- * 2. use a release-store to set the info table to stg_BLACKHOLE (which
- *    represents an indirection)
+ * U2. use a release-store to set the info table to stg_BLACKHOLE (which
+ *     represents an indirection)
  *
  * Blackholing a thunk (either eagerly, by GHC.StgToCmm.Bind.emitBlackHoleCode,
  * or lazily, by ThreadPaused.c:threadPaused) is done similarly.
  *
- * Conversely, indirection entry (see the entry code of stg_BLACKHOLE, stg_IND,
- * and stg_IND_STATIC in rts/StgMiscClosure.cmm) does the following:
- *
- * 1. We jump into the entry code for, e.g., stg_BLACKHOLE; this of course
- *    implies that we have already read the thunk's info table pointer, which
- *    is done with a relaxed load.
- * 2. use an acquire-fence to ensure that our view on the thunk is
- *    up-to-date. This synchronizes with step (2) in the update
- *    procedure.
- * 3. relaxed-load the indirectee. Since thunks are updated at most
- *    once we know that the fence in the last step has given us
- *    an up-to-date view of the indirectee closure.
- * 4. enter the indirectee (or block if the indirectee is a TSO)
+ * Conversely, entering an indirection (see the entry code of stg_BLACKHOLE,
+ * stg_IND, and stg_IND_STATIC in rts/StgMiscClosure.cmm) does the following:
+ *
+ * E1. jump into the entry code of the indirection (e.g. stg_BLACKHOLE);
+ *     this of course implies that we have already read the thunk's info table
+ *     pointer, which is done with a relaxed load.
+ *
+ * E2. acquire-fence
+ *
+ * E3. acquire-load the indirectee. Since thunks are updated at most once we
+ *     know that the fence in the previous step has given us an up-to-date
+ *     view of the indirectee closure.
+ *
+ * E4. enter the indirectee (or block if the indirectee is a TSO)
+ *
+ * The release/acquire pair (U2)/(E2) is somewhat surprising but is necessary
+ * as the C11 memory model does not guarantee that the store (U1) is visible
+ * to (E3), despite (U1) preceding (U2) in program order (due to the relaxed
+ * ordering of (E1)).
+ * This is demonstrated by the following CppMem model:
+ *
+ *     int main() {
+ *       atomic_int x = 0; // info table pointer
+ *       atomic_int y = 0; // indirectee
+ *       {{{
+ *         { // blackhole update
+ *           y.store(1, memory_order_release); // U1
+ *           x.store(2, memory_order_release); // U2
+ *         }
+ *       |||
+ *         { // blackhole entry
+ *           r1=x.load(memory_order_relaxed).readsvalue(2); // E1
+ *           //fence(memory_order_acquire); // E2
+ *           r2=y.load(memory_order_acquire); // E3
+ *         }
+ *       }}};
+ *       return 0;
+ *     }
+ *
+ * Under the C11 memory model this program admits an execution where the
+ * indirectee read yields `r2=0`.
+ *
+ * Of course, this could also be addressed by strengthening the ordering of
+ * (E1) to acquire, but this would incur a significant cost on every closure
+ * entry (including non-blackholes).
  *
  * Other closures
  * --------------
@@ -328,6 +379,12 @@ EXTERN_INLINE void busy_wait_nop(void);
  * The work-stealing queue (WSDeque) also requires barriers; these are
  * documented in WSDeque.c.
  *
+ * Verifying memory ordering
+ * -------------------------
+ * To verify that GHC's RTS and the code produced by the compiler are free of
+ * data races, we employ ThreadSanitizer. See Note [ThreadSanitizer] in
+ * TSANUtils.h for details on this facility.
+ *
  */
 
 /* ----------------------------------------------------------------------------
diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c
index 5c97d449628f45204dfc6fa8aa606a0df2ea6507..4eba066928314ecda90b644a1df502b60bfe27b5 100644
--- a/rts/sm/Evac.c
+++ b/rts/sm/Evac.c
@@ -1542,7 +1542,7 @@ selector_loop:
 bale_out:
     // We didn't manage to evaluate this thunk; restore the old info
     // pointer.  But don't forget: we still need to evacuate the thunk itself.
-    SET_INFO((StgClosure *)p, (const StgInfoTable *)info_ptr);
+    SET_INFO_RELAXED((StgClosure *)p, (const StgInfoTable *)info_ptr);
     // THREADED_RTS: we just unlocked the thunk, so another thread
     // might get in and update it.  copy() will lock it again and
     // check whether it was updated in the meantime.
diff --git a/rts/sm/NonMovingMark.c b/rts/sm/NonMovingMark.c
index 87dc4112f68bab099a2e659abd56d8d445ff7f2b..ab485f2d0c9b73af289ff9a67a356b3f20f8f1cd 100644
--- a/rts/sm/NonMovingMark.c
+++ b/rts/sm/NonMovingMark.c
@@ -688,8 +688,9 @@ void updateRemembSetPushThunkEager(Capability *cap,
         case IND:
         {
             StgInd *ind = (StgInd *) thunk;
-            if (check_in_nonmoving_heap(ind->indirectee)) {
-                push_closure(queue, ind->indirectee, NULL);
+            StgClosure *indirectee = ACQUIRE_LOAD(&ind->indirectee);
+            if (check_in_nonmoving_heap(indirectee)) {
+                push_closure(queue, indirectee, NULL);
             }
             break;
         }
@@ -1587,7 +1588,7 @@ mark_closure (MarkQueue *queue, const StgClosure *p0, StgClosure **origin)
         // Synchronizes with the release-store in updateWithIndirection.
         // See Note [Heap memory barriers] in SMP.h.
         StgInd *ind = (StgInd *) p;
-        ACQUIRE_FENCE();
+        ACQUIRE_FENCE_ON(&p->header.info);
         StgClosure *indirectee = RELAXED_LOAD(&ind->indirectee);
         markQueuePushClosure(queue, indirectee, &ind->indirectee);
         if (GET_CLOSURE_TAG(indirectee) == 0 || origin == NULL) {
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index 12be375692dc5da5dd6d4b18bb397c0b4d136da0..8aa52df00dee1b28adcb9808fa1a5152f060faf8 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -596,8 +596,6 @@ lockCAF (StgRegTable *reg, StgIndStatic *caf)
     bh->indirectee = (StgClosure *)cap->r.rCurrentTSO;
     SET_HDR(bh, &stg_CAF_BLACKHOLE_info, caf->header.prof.ccs);
 
-    // RELEASE ordering to ensure that above writes are visible before we
-    // introduce reference as CAF indirectee.
     RELEASE_STORE(&caf->indirectee, (StgClosure *) bh);
     SET_INFO_RELEASE((StgClosure*)caf, &stg_IND_STATIC_info);
 
diff --git a/utils/genapply/Main.hs b/utils/genapply/Main.hs
index fd5557cfedf3b10064eaa27cb4b5631bbfb25971..7575bb7ce9fd4e192699621b44c5e2147226bfec 100644
--- a/utils/genapply/Main.hs
+++ b/utils/genapply/Main.hs
@@ -783,7 +783,11 @@ genApply regstatus args =
        text "case IND,",
        text "     IND_STATIC: {",
        nest 4 (vcat [
-         text "R1 = StgInd_indirectee(R1);",
+         -- N.B. annoyingly, the %acquire syntax must place its result in a
+         -- local register as it is a Cmm prim call node.
+         text "P_ p;",
+         text "p = %acquire StgInd_indirectee(R1);",
+         text "R1 = p;",
          -- An indirection node might contain a tagged pointer
          text "goto again;"
        ]),
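For reference, the indirection-following code that genapply now emits reads
roughly as follows (a sketch assembled from the text combinators above; the
surrounding generated function is omitted):

    case IND,
         IND_STATIC: {
        P_ p;
        p = %acquire StgInd_indirectee(R1);
        R1 = p;
        // An indirection node might contain a tagged pointer
        goto again;
    }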