diff --git a/compiler/cmm/CmmMachOp.hs b/compiler/cmm/CmmMachOp.hs index 053b425ea1ba7ef9fd243ddbd09631d850910157..9740d21bef7b3123619c213491c3eb6541c3fef9 100644 --- a/compiler/cmm/CmmMachOp.hs +++ b/compiler/cmm/CmmMachOp.hs @@ -593,6 +593,7 @@ data CallishMachOp | MO_SubIntC Width | MO_U_Mul2 Width + | MO_ReadBarrier | MO_WriteBarrier | MO_Touch -- Keep variables live (when using interior pointers) diff --git a/compiler/cmm/CmmParse.y b/compiler/cmm/CmmParse.y index f83fb6b00006df02cacd910d21892224f45f6917..f5631452505cfdc3b2c5e7764fd363d991b54891 100644 --- a/compiler/cmm/CmmParse.y +++ b/compiler/cmm/CmmParse.y @@ -1001,6 +1001,7 @@ machOps = listToUFM $ callishMachOps :: UniqFM ([CmmExpr] -> (CallishMachOp, [CmmExpr])) callishMachOps = listToUFM $ map (\(x, y) -> (mkFastString x, y)) [ + ( "read_barrier", (MO_ReadBarrier,)), ( "write_barrier", (MO_WriteBarrier,)), ( "memcpy", memcpyLikeTweakArgs MO_Memcpy ), ( "memset", memcpyLikeTweakArgs MO_Memset ), diff --git a/compiler/cmm/PprC.hs b/compiler/cmm/PprC.hs index f68496879533bff5677784091b736da0a33d370b..7227edd57eb67b62bd0e88a516d755d2391adf6a 100644 --- a/compiler/cmm/PprC.hs +++ b/compiler/cmm/PprC.hs @@ -812,6 +812,7 @@ pprCallishMachOp_for_C mop MO_F32_ExpM1 -> text "expm1f" MO_F32_Sqrt -> text "sqrtf" MO_F32_Fabs -> text "fabsf" + MO_ReadBarrier -> text "load_load_barrier" MO_WriteBarrier -> text "write_barrier" MO_Memcpy _ -> text "memcpy" MO_Memset _ -> text "memset" diff --git a/compiler/codeGen/StgCmmBind.hs b/compiler/codeGen/StgCmmBind.hs index 6e6e6ac79192403b84b67cb984421bcb85d79fe2..68a79878d3b390e57c123db92212569a294a534e 100644 --- a/compiler/codeGen/StgCmmBind.hs +++ b/compiler/codeGen/StgCmmBind.hs @@ -632,6 +632,7 @@ emitBlackHoleCode node = do when eager_blackholing $ do emitStore (cmmOffsetW dflags node (fixedHdrSizeW dflags)) currentTSOExpr + -- See Note [Heap memory barriers] in SMP.h. emitPrimCall [] MO_WriteBarrier [] emitStore node (CmmReg (CmmGlobal EagerBlackholeInfo)) diff --git a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs index bdf6a2642f27213a39bbb44c7203fd88643a73ad..86a59381b283e3992938c35934d989014bfd4348 100644 --- a/compiler/llvmGen/LlvmCodeGen/CodeGen.hs +++ b/compiler/llvmGen/LlvmCodeGen/CodeGen.hs @@ -169,17 +169,25 @@ barrier = do let s = Fence False SyncSeqCst return (unitOL s, []) +-- | Insert a 'barrier', unless the target platform is in the provided list of +-- exceptions (where no code will be emitted instead). +barrierUnless :: [Arch] -> LlvmM StmtData +barrierUnless exs = do + platform <- getLlvmPlatform + if platformArch platform `elem` exs + then return (nilOL, []) + else barrier + -- | Foreign Calls genCall :: ForeignTarget -> [CmmFormal] -> [CmmActual] -> LlvmM StmtData --- Write barrier needs to be handled specially as it is implemented as an LLVM --- intrinsic function. +-- Barriers need to be handled specially as they are implemented as LLVM +-- intrinsic functions. +genCall (PrimTarget MO_ReadBarrier) _ _ = + barrierUnless [ArchX86, ArchX86_64, ArchSPARC] genCall (PrimTarget MO_WriteBarrier) _ _ = do - platform <- getLlvmPlatform - if platformArch platform `elem` [ArchX86, ArchX86_64, ArchSPARC] - then return (nilOL, []) - else barrier + barrierUnless [ArchX86, ArchX86_64, ArchSPARC] genCall (PrimTarget MO_Touch) _ _ = return (nilOL, []) @@ -831,6 +839,7 @@ cmmPrimOpFunctions mop = do -- We support MO_U_Mul2 through ordinary LLVM mul instruction, see the -- appropriate case of genCall. 
MO_U_Mul2 {} -> unsupported + MO_ReadBarrier -> unsupported MO_WriteBarrier -> unsupported MO_Touch -> unsupported MO_UF_Conv _ -> unsupported diff --git a/compiler/nativeGen/PPC/CodeGen.hs b/compiler/nativeGen/PPC/CodeGen.hs index 516cda0eb3c243d11434b34d26324267139148f0..a49526c93a3ca2ba9f144a3473039d9b3d8ef301 100644 --- a/compiler/nativeGen/PPC/CodeGen.hs +++ b/compiler/nativeGen/PPC/CodeGen.hs @@ -1123,6 +1123,8 @@ genCCall :: ForeignTarget -- function to call -> [CmmFormal] -- where to put the result -> [CmmActual] -- arguments (of mixed type) -> NatM InstrBlock +genCCall (PrimTarget MO_ReadBarrier) _ _ + = return $ unitOL LWSYNC genCCall (PrimTarget MO_WriteBarrier) _ _ = return $ unitOL LWSYNC @@ -2030,6 +2032,7 @@ genCCall' dflags gcp target dest_regs args MO_AddIntC {} -> unsupported MO_SubIntC {} -> unsupported MO_U_Mul2 {} -> unsupported + MO_ReadBarrier -> unsupported MO_WriteBarrier -> unsupported MO_Touch -> unsupported MO_Prefetch_Data _ -> unsupported diff --git a/compiler/nativeGen/SPARC/CodeGen.hs b/compiler/nativeGen/SPARC/CodeGen.hs index 30a4d6979b0008c3b81afcfe28fbe55328d553e8..056d0c6fbf0059a0226b3d57aa6d7797f9ebdf3f 100644 --- a/compiler/nativeGen/SPARC/CodeGen.hs +++ b/compiler/nativeGen/SPARC/CodeGen.hs @@ -401,6 +401,8 @@ genCCall -- -- In the SPARC case we don't need a barrier. -- +genCCall (PrimTarget MO_ReadBarrier) _ _ + = return $ nilOL genCCall (PrimTarget MO_WriteBarrier) _ _ = return $ nilOL @@ -691,6 +693,7 @@ outOfLineMachOp_table mop MO_AddIntC {} -> unsupported MO_SubIntC {} -> unsupported MO_U_Mul2 {} -> unsupported + MO_ReadBarrier -> unsupported MO_WriteBarrier -> unsupported MO_Touch -> unsupported (MO_Prefetch_Data _) -> unsupported diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs index 73cfb28d461678915f63a32fb76e8e8c90d583be..13662f6807c59b73fb7ad7b916bf459f8d6b5d77 100644 --- a/compiler/nativeGen/X86/CodeGen.hs +++ b/compiler/nativeGen/X86/CodeGen.hs @@ -1891,8 +1891,9 @@ genCCall dflags _ (PrimTarget (MO_Memset align)) _ possibleWidth = minimum [left, sizeBytes] dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left)) +genCCall _ _ (PrimTarget MO_ReadBarrier) _ _ _ = return nilOL genCCall _ _ (PrimTarget MO_WriteBarrier) _ _ _ = return nilOL - -- write barrier compiles to no code on x86/x86-64; + -- barriers compile to no code on x86/x86-64; -- we keep it this long in order to prevent earlier optimisations. genCCall _ _ (PrimTarget MO_Touch) _ _ _ = return nilOL @@ -2948,6 +2949,7 @@ outOfLineCmmOp bid mop res args MO_AddWordC {} -> unsupported MO_SubWordC {} -> unsupported MO_U_Mul2 {} -> unsupported + MO_ReadBarrier -> unsupported MO_WriteBarrier -> unsupported MO_Touch -> unsupported (MO_Prefetch_Data _ ) -> unsupported diff --git a/includes/Cmm.h b/includes/Cmm.h index 99f5233ab522de4a8e395009aa3cfae27934f26d..e7d4b4944fdb1eb1dacc27ed11ed2491f7c9fb1b 100644 --- a/includes/Cmm.h +++ b/includes/Cmm.h @@ -308,7 +308,9 @@ #define ENTER_(ret,x) \ again: \ W_ info; \ - LOAD_INFO(ret,x) \ + LOAD_INFO(ret,x) \ + /* See Note [Heap memory barriers] in SMP.h */ \ + prim_read_barrier; \ switch [INVALID_OBJECT .. N_CLOSURE_TYPES] \ (TO_W_( %INFO_TYPE(%STD_INFO(info)) )) { \ case \ @@ -631,6 +633,14 @@ #define OVERWRITING_CLOSURE_OFS(c,n) /* nothing */ #endif +// Memory barriers. +// For discussion of how these are used to fence heap object +// accesses see Note [Heap memory barriers] in SMP.h. 
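To picture the reader-side discipline that ENTER_'s new prim_read_barrier (defined just below) enforces — read the info pointer first, issue a read barrier, and only then read the closure's other fields — here is a minimal, self-contained C sketch. The closure struct, IND_info and follow_if_indirection are illustrative stand-ins rather than the RTS's actual definitions, and atomic_thread_fence(memory_order_acquire) plays the role of load_load_barrier():

    #include <stdatomic.h>

    /* Stand-in for a heap closure: an info pointer followed by a payload. */
    struct closure {
        const void     *info;        /* written last by an updater          */
        struct closure *payload[1];  /* e.g. an indirectee, written first   */
    };

    const void *IND_info;            /* stand-in for the IND info table     */

    /* Observer: info pointer, fence, then payload. Without the fence a
     * weakly-ordered CPU may read payload[0] before info and see an IND
     * whose indirectee is still uninitialised. */
    struct closure *follow_if_indirection(struct closure *c)
    {
        const void *info = c->info;
        atomic_thread_fence(memory_order_acquire);  /* load_load_barrier() */
        if (info == IND_info) {
            return c->payload[0];    /* safe: written before the info pointer */
        }
        return c;
    }

(The RTS itself uses its own barrier functions and plain loads rather than C11 atomics; the fence here only models the required ordering.)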
+#if defined(THREADED_RTS) +#define prim_read_barrier prim %read_barrier() +#else +#define prim_read_barrier /* nothing */ +#endif #if defined(THREADED_RTS) #define prim_write_barrier prim %write_barrier() #else diff --git a/includes/stg/SMP.h b/includes/stg/SMP.h index 16761516ac7436bd38c0c60b2522e855cc98f645..5487a9a4d679aafffa52555fc8e7bfa6c264a887 100644 --- a/includes/stg/SMP.h +++ b/includes/stg/SMP.h @@ -96,6 +96,151 @@ EXTERN_INLINE void write_barrier(void); EXTERN_INLINE void store_load_barrier(void); EXTERN_INLINE void load_load_barrier(void); +/* + * Note [Heap memory barriers] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * Machines with weak memory ordering semantics have consequences for how + * closures are observed and mutated. For example, consider a thunk that needs + * to be updated to an indirection. In order for the indirection to be safe for + * concurrent observers to enter, said observers must read the indirection's + * info table before they read the indirectee. Furthermore, the indirectee must + * be set before the info table pointer. This ensures that if the observer sees + * an IND info table then the indirectee is valid. + * + * When a closure is updated with an indirection, both its info table and its + * indirectee must be written. With weak memory ordering, these two writes can + * be arbitrarily reordered, and perhaps even interleaved with other threads' + * reads and writes (in the absence of memory barrier instructions). Consider + * this example of a bad reordering: + * + * - An updater writes to a closure's info table (INFO_TYPE is now IND). + * - A concurrent observer branches upon reading the closure's INFO_TYPE as IND. + * - A concurrent observer reads the closure's indirectee and enters it. + * - An updater writes the closure's indirectee. + * + * Here the update to the indirectee comes too late and the concurrent observer + * has jumped off into the abyss. Speculative execution can also cause us + * issues, consider: + * + * - an observer is about to case on a value in closure's info table. + * - the observer speculatively reads one or more of closure's fields. + * - an updater writes to closure's info table. + * - the observer takes a branch based on the new info table value, but with the + * old closure fields! + * - the updater writes to the closure's other fields, but its too late. + * + * Because of these effects, reads and writes to a closure's info table must be + * ordered carefully with respect to reads and writes to the closure's other + * fields, and memory barriers must be placed to ensure that reads and writes + * occur in program order. Specifically, updates to an already existing closure + * must follow the following pattern: + * + * - Update the closure's (non-info table) fields. + * - Write barrier. + * - Update the closure's info table. + * + * Observing the fields of an updateable closure (e.g. a THUNK) must follow the + * following pattern: + * + * - Read the closure's info pointer. + * - Read barrier. + * - Read the closure's (non-info table) fields. + * + * We must also take care when we expose a newly-allocated closure to other cores + * by writing a pointer to it to some shared data structure (e.g. an MVar#, a Message, + * or MutVar#). 
Specifically, we need to ensure that all writes constructing the + * closure are visible *before* the write exposing the new closure is made visible: + * + * - Allocate memory for the closure + * - Write the closure's info pointer and fields (ordering betweeen this doesn't + * matter since the closure isn't yet visible to anyone else). + * - Write barrier + * - Make closure visible to other cores + * + * Note that thread stacks are inherently thread-local and consequently allocating an + * object and introducing a reference to it to our stack needs no barrier. + * + * There are several ways in which the mutator may make a newly-allocated + * closure visible to other cores: + * + * - Eager blackholing a THUNK: + * This is protected by an explicit write barrier in the eager blackholing + * code produced by the codegen. See StgCmmBind.emitBlackHoleCode. + * + * - Lazy blackholing a THUNK: + * This is is protected by an explicit write barrier in the thread suspension + * code. See ThreadPaused.c:threadPaused. + * + * - Updating a BLACKHOLE: + * This case is protected by explicit write barriers in the the update frame + * entry code (see rts/Updates.h). + * + * - Blocking on an MVar# (e.g. takeMVar#): + * In this case the appropriate MVar primops (e.g. stg_takeMVarzh). include + * explicit memory barriers to ensure that the the newly-allocated + * MVAR_TSO_QUEUE is visible to other cores. + * + * - Write to an MVar# (e.g. putMVar#): + * This protected by the full barrier implied by the CAS in putMVar#. + * + * - Write to a TVar#: + * This is protected by the full barrier implied by the CAS in STM.c:lock_stm. + * + * - Write to an Array#, ArrayArray#, or SmallArray#: + * This case is protected by an explicit write barrier in the code produced + * for this primop by the codegen. See StgCmmPrim.doWritePtrArrayOp and + * StgCmmPrim.doWriteSmallPtrArrayOp. Relevant issue: #12469. + * + * - Write to MutVar# via writeMutVar#: + * This case is protected by an explicit write barrier in the code produced + * for this primop by the codegen. + * + * - Write to MutVar# via atomicModifyMutVar# or casMutVar#: + * This is protected by the full barrier implied by the cmpxchg operations + * in this primops. + * + * - Sending a Message to another capability: + * This is protected by the acquition and release of the target capability's + * lock in Messages.c:sendMessage. + * + * Finally, we must ensure that we flush all cores store buffers before + * entering and leaving GC, since stacks may be read by other cores. This + * happens as a side-effect of taking and release mutexes (which implies + * acquire and release barriers, respectively). + * + * N.B. recordClosureMutated places a reference to the mutated object on + * the capability-local mut_list. Consequently this does not require any memory + * barrier. + * + * During parallel GC we need to be careful during evacuation: before replacing + * a closure with a forwarding pointer we must commit a write barrier to ensure + * that the copy we made in to-space is visible to other cores. + * + * However, we can be a bit lax when *reading* during GC. Specifically, the GC + * can only make a very limited set of changes to existing closures: + * + * - it can replace a closure's info table with stg_WHITEHOLE. 
+ * - it can replace a previously-whitehole'd closure's info table with a + * forwarding pointer + * - it can replace a previously-whitehole'd closure's info table with a + * valid info table pointer (done in eval_thunk_selector) + * - it can update the value of a pointer field after evacuating it + * + * This is quite nice since we don't need to worry about an interleaving + * of writes producing an invalid state: a closure's fields remain valid after + * an update of its info table pointer and vice-versa. + * + * After a round of parallel scavenging we must also ensure that any writes the + * GC thread workers made are visible to the main GC thread. This is ensured by + * the full barrier implied by the atomic decrement in + * GC.c:scavenge_until_all_done. + * + * The work-stealing queue (WSDeque) also requires barriers; these are + * documented in WSDeque.c. + * + */ + /* ---------------------------------------------------------------------------- Implementations ------------------------------------------------------------------------- */ diff --git a/rts/Apply.cmm b/rts/Apply.cmm index 13eb135412c6cef3756fc799960dbf9bcbb69554..8d7fc3c01224a42fccb321a79ed700c076a1e0f8 100644 --- a/rts/Apply.cmm +++ b/rts/Apply.cmm @@ -62,6 +62,8 @@ again: W_ info; P_ untaggedfun; W_ arity; + // We must obey the correct heap object observation pattern in + // Note [Heap memory barriers] in SMP.h. untaggedfun = UNTAG(fun); info = %INFO_PTR(untaggedfun); switch [INVALID_OBJECT .. N_CLOSURE_TYPES] diff --git a/rts/Compact.cmm b/rts/Compact.cmm index 061646846d612c93bb5797bfcea48535bcb731f7..bae94a03cd8c3189d1368af61baffd1fdaabe5f5 100644 --- a/rts/Compact.cmm +++ b/rts/Compact.cmm @@ -53,6 +53,9 @@ import CLOSURE base_GHCziIOziException_cannotCompactPinned_closure; // data structure. It takes the location to store the address of the // compacted object as an argument, so that it can be tail-recursive. // +// N.B. No memory barrier (see Note [Heap memory barriers] in SMP.h) is needed +// here since this is essentially an allocation of a new object which won't +// be visible to other cores until after we return. stg_compactAddWorkerzh ( P_ compact, // The Compact# object P_ p, // The object to compact diff --git a/rts/Interpreter.c b/rts/Interpreter.c index e4b9d5696e46701075008c1c3453e6fa74a8cac9..2a886ff8a4758d49ed1160cb362ab88f798a2ca0 100644 --- a/rts/Interpreter.c +++ b/rts/Interpreter.c @@ -266,7 +266,6 @@ StgClosure * copyPAP (Capability *cap, StgPAP *oldpap) uint32_t size = PAP_sizeW(oldpap->n_args); StgPAP *pap = (StgPAP *)allocate(cap, size); enterFunCCS(&cap->r, oldpap->header.prof.ccs); - SET_HDR(pap, &stg_PAP_info, cap->r.rCCCS); pap->arity = oldpap->arity; pap->n_args = oldpap->n_args; pap->fun = oldpap->fun; @@ -274,6 +273,8 @@ StgClosure * copyPAP (Capability *cap, StgPAP *oldpap) for (i = 0; i < ((StgPAP *)pap)->n_args; i++) { pap->payload[i] = oldpap->payload[i]; } + // No write barrier is needed here as this is a new allocation + SET_HDR(pap, &stg_PAP_info, cap->r.rCCCS); return (StgClosure *)pap; } @@ -799,7 +800,6 @@ do_apply: // build a new PAP and return it. 
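The SET_HDR reorderings in this and the following hunks all follow the same recipe: fill in a freshly allocated closure's fields first and write its header last, with no barrier needed while the object is still thread-local; a barrier is only required at the point where the closure is published to other cores. A minimal sketch, using stand-in types (struct pap, PAP_info and shared_slot are not the RTS's real definitions) and a release fence in place of write_barrier():

    #include <stdatomic.h>
    #include <stdlib.h>

    struct pap {
        const void *info;
        unsigned    n_args;
        void       *payload[];
    };

    const void *PAP_info;                 /* stand-in info table            */
    struct pap *volatile shared_slot;     /* some location other cores read */

    struct pap *make_pap(unsigned n, void **args)
    {
        struct pap *p = malloc(sizeof *p + n * sizeof(void *));
        /* No barrier needed yet: the object is private to this thread.
         * Writing the header last mirrors the moved SET_HDR calls. */
        p->n_args = n;
        for (unsigned i = 0; i < n; i++) p->payload[i] = args[i];
        p->info = PAP_info;
        return p;
    }

    void publish_pap(struct pap *p)
    {
        /* Only when the closure escapes to shared memory must the
         * constructing writes be made visible before the pointer is. */
        atomic_thread_fence(memory_order_release);  /* write_barrier() */
        shared_slot = p;
    }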
StgPAP *new_pap; new_pap = (StgPAP *)allocate(cap, PAP_sizeW(pap->n_args + m)); - SET_HDR(new_pap,&stg_PAP_info,cap->r.rCCCS); new_pap->arity = pap->arity - n; new_pap->n_args = pap->n_args + m; new_pap->fun = pap->fun; @@ -809,6 +809,8 @@ do_apply: for (i = 0; i < m; i++) { new_pap->payload[pap->n_args + i] = (StgClosure *)SpW(i); } + // No write barrier is needed here as this is a new allocation + SET_HDR(new_pap,&stg_PAP_info,cap->r.rCCCS); tagged_obj = (StgClosure *)new_pap; Sp_addW(m); goto do_return; @@ -844,13 +846,14 @@ do_apply: StgPAP *pap; uint32_t i; pap = (StgPAP *)allocate(cap, PAP_sizeW(m)); - SET_HDR(pap, &stg_PAP_info,cap->r.rCCCS); pap->arity = arity - n; pap->fun = obj; pap->n_args = m; for (i = 0; i < m; i++) { pap->payload[i] = (StgClosure *)SpW(i); } + // No write barrier is needed here as this is a new allocation + SET_HDR(pap, &stg_PAP_info,cap->r.rCCCS); tagged_obj = (StgClosure *)pap; Sp_addW(m); goto do_return; @@ -1081,7 +1084,6 @@ run_BCO: // the BCO size_words = BCO_BITMAP_SIZE(obj) + 2; new_aps = (StgAP_STACK *) allocate(cap, AP_STACK_sizeW(size_words)); - SET_HDR(new_aps,&stg_AP_STACK_info,cap->r.rCCCS); new_aps->size = size_words; new_aps->fun = &stg_dummy_ret_closure; @@ -1095,6 +1097,9 @@ run_BCO: new_aps->payload[i] = (StgClosure *)SpW(i-2); } + // No write barrier is needed here as this is a new allocation + SET_HDR(new_aps,&stg_AP_STACK_info,cap->r.rCCCS); + // Arrange the stack to call the breakpoint IO action, and // continue execution of this BCO when the IO action returns. // @@ -1423,6 +1428,8 @@ run_BCO: ap = (StgAP*)allocate(cap, AP_sizeW(n_payload)); SpW(-1) = (W_)ap; ap->n_args = n_payload; + // No write barrier is needed here as this is a new allocation + // visible only from our stack SET_HDR(ap, &stg_AP_info, cap->r.rCCCS) Sp_subW(1); goto nextInsn; @@ -1434,6 +1441,8 @@ run_BCO: ap = (StgAP*)allocate(cap, AP_sizeW(n_payload)); SpW(-1) = (W_)ap; ap->n_args = n_payload; + // No write barrier is needed here as this is a new allocation + // visible only from our stack SET_HDR(ap, &stg_AP_NOUPD_info, cap->r.rCCCS) Sp_subW(1); goto nextInsn; @@ -1447,6 +1456,8 @@ run_BCO: SpW(-1) = (W_)pap; pap->n_args = n_payload; pap->arity = arity; + // No write barrier is needed here as this is a new allocation + // visible only from our stack SET_HDR(pap, &stg_PAP_info, cap->r.rCCCS) Sp_subW(1); goto nextInsn; @@ -1522,12 +1533,14 @@ run_BCO: itbl->layout.payload.nptrs ); StgClosure* con = (StgClosure*)allocate_NONUPD(cap,request); ASSERT( itbl->layout.payload.ptrs + itbl->layout.payload.nptrs > 0); - SET_HDR(con, (StgInfoTable*)BCO_LIT(o_itbl), cap->r.rCCCS); for (i = 0; i < n_words; i++) { con->payload[i] = (StgClosure*)SpW(i); } Sp_addW(n_words); Sp_subW(1); + // No write barrier is needed here as this is a new allocation + // visible only from our stack + SET_HDR(con, (StgInfoTable*)BCO_LIT(o_itbl), cap->r.rCCCS); SpW(0) = (W_)con; IF_DEBUG(interpreter, debugBelch("\tBuilt "); diff --git a/rts/Messages.c b/rts/Messages.c index 2b13b6306cd48436aecc46278264cfeebf859e5d..d878db5eda2ea9a4ad9b9f9daa29273e4772f3e9 100644 --- a/rts/Messages.c +++ b/rts/Messages.c @@ -173,6 +173,7 @@ uint32_t messageBlackHole(Capability *cap, MessageBlackHole *msg) "blackhole %p", (W_)msg->tso->id, msg->bh); info = bh->header.info; + load_load_barrier(); // See Note [Heap memory barriers] in SMP.h // If we got this message in our inbox, it might be that the // BLACKHOLE has already been updated, and GC has shorted out the @@ -196,6 +197,7 @@ loop: // and turns this into an 
infinite loop. p = UNTAG_CLOSURE((StgClosure*)VOLATILE_LOAD(&((StgInd*)bh)->indirectee)); info = p->header.info; + load_load_barrier(); // See Note [Heap memory barriers] in SMP.h if (info == &stg_IND_info) { @@ -226,7 +228,6 @@ loop: bq = (StgBlockingQueue*)allocate(cap, sizeofW(StgBlockingQueue)); // initialise the BLOCKING_QUEUE object - SET_HDR(bq, &stg_BLOCKING_QUEUE_DIRTY_info, CCS_SYSTEM); bq->bh = bh; bq->queue = msg; bq->owner = owner; @@ -238,6 +239,11 @@ loop: // a collision to update a BLACKHOLE and a BLOCKING_QUEUE // becomes orphaned (see updateThunk()). bq->link = owner->bq; + SET_HDR(bq, &stg_BLOCKING_QUEUE_DIRTY_info, CCS_SYSTEM); + // We are about to make the newly-constructed message visible to other cores; + // a barrier is necessary to ensure that all writes are visible. + // See Note [Heap memory barriers] in SMP.h. + write_barrier(); owner->bq = bq; dirty_TSO(cap, owner); // we modified owner->bq @@ -255,7 +261,7 @@ loop: } // point to the BLOCKING_QUEUE from the BLACKHOLE - write_barrier(); // make the BQ visible + write_barrier(); // make the BQ visible, see Note [Heap memory barriers]. ((StgInd*)bh)->indirectee = (StgClosure *)bq; recordClosureMutated(cap,bh); // bh was mutated @@ -286,10 +292,14 @@ loop: msg->link = bq->queue; bq->queue = msg; + // No barrier is necessary here: we are only exposing the + // closure to the GC. See Note [Heap memory barriers] in SMP.h. recordClosureMutated(cap,(StgClosure*)msg); if (info == &stg_BLOCKING_QUEUE_CLEAN_info) { bq->header.info = &stg_BLOCKING_QUEUE_DIRTY_info; + // No barrier is necessary here: we are only exposing the + // closure to the GC. See Note [Heap memory barriers] in SMP.h. recordClosureMutated(cap,(StgClosure*)bq); } diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm index d89f0a952b228f5dc02392e50af121a1264755c1..afb990dda50a77dc29fb8e74b326bb5b1bc09da0 100644 --- a/rts/PrimOps.cmm +++ b/rts/PrimOps.cmm @@ -102,6 +102,7 @@ stg_newPinnedByteArrayzh ( W_ n ) to BA_ALIGN bytes: */ p = p + ((-p - SIZEOF_StgArrBytes) & BA_MASK); + /* No write barrier needed since this is a new allocation. */ SET_HDR(p, stg_ARR_WORDS_info, CCCS); StgArrBytes_bytes(p) = n; return (p); @@ -144,6 +145,7 @@ stg_newAlignedPinnedByteArrayzh ( W_ n, W_ alignment ) <alignment> is a power of 2, which is technically not guaranteed */ p = p + ((-p - SIZEOF_StgArrBytes) & (alignment - 1)); + /* No write barrier needed since this is a new allocation. */ SET_HDR(p, stg_ARR_WORDS_info, CCCS); StgArrBytes_bytes(p) = n; return (p); @@ -254,6 +256,7 @@ stg_newArrayzh ( W_ n /* words */, gcptr init ) } TICK_ALLOC_PRIM(SIZEOF_StgMutArrPtrs, WDS(size), 0); + /* No write barrier needed since this is a new allocation. */ SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, CCCS); StgMutArrPtrs_ptrs(arr) = n; StgMutArrPtrs_size(arr) = size; @@ -405,6 +408,7 @@ stg_newSmallArrayzh ( W_ n /* words */, gcptr init ) } TICK_ALLOC_PRIM(SIZEOF_StgSmallMutArrPtrs, WDS(n), 0); + /* No write barrier needed since this is a new allocation. */ SET_HDR(arr, stg_SMALL_MUT_ARR_PTRS_DIRTY_info, CCCS); StgSmallMutArrPtrs_ptrs(arr) = n; @@ -522,6 +526,7 @@ stg_newMutVarzh ( gcptr init ) ALLOC_PRIM_P (SIZEOF_StgMutVar, stg_newMutVarzh, init); mv = Hp - SIZEOF_StgMutVar + WDS(1); + /* No write barrier needed since this is a new allocation. */ SET_HDR(mv,stg_MUT_VAR_DIRTY_info,CCCS); StgMutVar_var(mv) = init; @@ -700,6 +705,7 @@ stg_mkWeakzh ( gcptr key, ALLOC_PRIM (SIZEOF_StgWeak) w = Hp - SIZEOF_StgWeak + WDS(1); + // No memory barrier needed as this is a new allocation. 
SET_HDR(w, stg_WEAK_info, CCCS); StgWeak_key(w) = key; @@ -815,6 +821,7 @@ stg_deRefWeakzh ( gcptr w ) gcptr val; info = GET_INFO(w); + prim_read_barrier; if (info == stg_WHITEHOLE_info) { // w is locked by another thread. Now it's not immediately clear if w is @@ -1385,11 +1392,13 @@ stg_readTVarzh (P_ tvar) stg_readTVarIOzh ( P_ tvar /* :: TVar a */ ) { - W_ result; + W_ result, resultinfo; again: result = StgTVar_current_value(tvar); - if (%INFO_PTR(result) == stg_TREC_HEADER_info) { + resultinfo = %INFO_PTR(result); + prim_read_barrier; + if (resultinfo == stg_TREC_HEADER_info) { goto again; } return (result); @@ -1458,6 +1467,7 @@ stg_newMVarzh () ALLOC_PRIM_ (SIZEOF_StgMVar, stg_newMVarzh); mvar = Hp - SIZEOF_StgMVar + WDS(1); + // No memory barrier needed as this is a new allocation. SET_HDR(mvar,stg_MVAR_DIRTY_info,CCCS); // MVARs start dirty: generation 0 has no mutable list StgMVar_head(mvar) = stg_END_TSO_QUEUE_closure; @@ -1482,7 +1492,7 @@ stg_newMVarzh () stg_takeMVarzh ( P_ mvar /* :: MVar a */ ) { - W_ val, info, tso, q; + W_ val, info, tso, q, qinfo; LOCK_CLOSURE(mvar, info); @@ -1504,9 +1514,12 @@ stg_takeMVarzh ( P_ mvar /* :: MVar a */ ) q = Hp - SIZEOF_StgMVarTSOQueue + WDS(1); - SET_HDR(q, stg_MVAR_TSO_QUEUE_info, CCS_SYSTEM); StgMVarTSOQueue_link(q) = END_TSO_QUEUE; StgMVarTSOQueue_tso(q) = CurrentTSO; + SET_HDR(q, stg_MVAR_TSO_QUEUE_info, CCS_SYSTEM); + // Write barrier before we make the new MVAR_TSO_QUEUE + // visible to other cores. + prim_write_barrier; if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) { StgMVar_head(mvar) = q; @@ -1536,8 +1549,10 @@ loop: unlockClosure(mvar, info); return (val); } - if (StgHeader_info(q) == stg_IND_info || - StgHeader_info(q) == stg_MSG_NULL_info) { + qinfo = StgHeader_info(q); + prim_read_barrier; + if (qinfo == stg_IND_info || + qinfo == stg_MSG_NULL_info) { q = StgInd_indirectee(q); goto loop; } @@ -1575,7 +1590,7 @@ loop: stg_tryTakeMVarzh ( P_ mvar /* :: MVar a */ ) { - W_ val, info, tso, q; + W_ val, info, tso, q, qinfo; LOCK_CLOSURE(mvar, info); @@ -1602,8 +1617,11 @@ loop: return (1, val); } - if (StgHeader_info(q) == stg_IND_info || - StgHeader_info(q) == stg_MSG_NULL_info) { + qinfo = StgHeader_info(q); + prim_read_barrier; + + if (qinfo == stg_IND_info || + qinfo == stg_MSG_NULL_info) { q = StgInd_indirectee(q); goto loop; } @@ -1642,7 +1660,7 @@ loop: stg_putMVarzh ( P_ mvar, /* :: MVar a */ P_ val, /* :: a */ ) { - W_ info, tso, q; + W_ info, tso, q, qinfo; LOCK_CLOSURE(mvar, info); @@ -1662,10 +1680,12 @@ stg_putMVarzh ( P_ mvar, /* :: MVar a */ q = Hp - SIZEOF_StgMVarTSOQueue + WDS(1); - SET_HDR(q, stg_MVAR_TSO_QUEUE_info, CCS_SYSTEM); StgMVarTSOQueue_link(q) = END_TSO_QUEUE; StgMVarTSOQueue_tso(q) = CurrentTSO; + SET_HDR(q, stg_MVAR_TSO_QUEUE_info, CCS_SYSTEM); + prim_write_barrier; + if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) { StgMVar_head(mvar) = q; } else { @@ -1692,8 +1712,12 @@ loop: unlockClosure(mvar, stg_MVAR_DIRTY_info); return (); } - if (StgHeader_info(q) == stg_IND_info || - StgHeader_info(q) == stg_MSG_NULL_info) { + + qinfo = StgHeader_info(q); + prim_read_barrier; + + if (qinfo == stg_IND_info || + qinfo == stg_MSG_NULL_info) { q = StgInd_indirectee(q); goto loop; } @@ -1750,7 +1774,7 @@ loop: stg_tryPutMVarzh ( P_ mvar, /* :: MVar a */ P_ val, /* :: a */ ) { - W_ info, tso, q; + W_ info, tso, q, qinfo; LOCK_CLOSURE(mvar, info); @@ -1773,8 +1797,12 @@ loop: unlockClosure(mvar, stg_MVAR_DIRTY_info); return (1); } - if (StgHeader_info(q) == stg_IND_info || - StgHeader_info(q) == 
stg_MSG_NULL_info) { + + qinfo = StgHeader_info(q); + prim_read_barrier; + + if (qinfo == stg_IND_info || + qinfo == stg_MSG_NULL_info) { q = StgInd_indirectee(q); goto loop; } @@ -1845,10 +1873,12 @@ stg_readMVarzh ( P_ mvar, /* :: MVar a */ ) // readMVars are pushed to the front of the queue, so // they get handled immediately - SET_HDR(q, stg_MVAR_TSO_QUEUE_info, CCS_SYSTEM); StgMVarTSOQueue_link(q) = StgMVar_head(mvar); StgMVarTSOQueue_tso(q) = CurrentTSO; + SET_HDR(q, stg_MVAR_TSO_QUEUE_info, CCS_SYSTEM); + prim_write_barrier; + StgTSO__link(CurrentTSO) = q; StgTSO_block_info(CurrentTSO) = mvar; StgTSO_why_blocked(CurrentTSO) = BlockedOnMVarRead::I16; @@ -1913,6 +1943,10 @@ stg_makeStableNamezh ( P_ obj ) BYTES_TO_WDS(SIZEOF_StgStableName)); SET_HDR(sn_obj, stg_STABLE_NAME_info, CCCS); StgStableName_sn(sn_obj) = index; + // This will make the StableName# object visible to other threads; + // be sure that its completely visible to other cores. + // See Note [Heap memory barriers] in SMP.h. + prim_write_barrier; snEntry_sn_obj(W_[stable_name_table] + index*SIZEOF_snEntry) = sn_obj; } else { sn_obj = snEntry_sn_obj(W_[stable_name_table] + index*SIZEOF_snEntry); @@ -1954,6 +1988,7 @@ stg_newBCOzh ( P_ instrs, ALLOC_PRIM (bytes); bco = Hp - bytes + WDS(1); + // No memory barrier necessary as this is a new allocation. SET_HDR(bco, stg_BCO_info, CCS_MAIN); StgBCO_instrs(bco) = instrs; @@ -1990,6 +2025,7 @@ stg_mkApUpd0zh ( P_ bco ) CCCS_ALLOC(SIZEOF_StgAP); ap = Hp - SIZEOF_StgAP + WDS(1); + // No memory barrier necessary as this is a new allocation. SET_HDR(ap, stg_AP_info, CCS_MAIN); StgAP_n_args(ap) = HALF_W_(0); @@ -2002,6 +2038,7 @@ stg_unpackClosurezh ( P_ closure ) { W_ info, ptrs, nptrs, p, ptrs_arr, dat_arr; info = %GET_STD_INFO(UNTAG(closure)); + prim_read_barrier; ptrs = TO_W_(%INFO_PTRS(info)); nptrs = TO_W_(%INFO_NPTRS(info)); @@ -2330,7 +2367,10 @@ stg_noDuplicatezh /* no arg list: explicit stack layout */ stg_getApStackValzh ( P_ ap_stack, W_ offset ) { - if (%INFO_PTR(UNTAG(ap_stack)) == stg_AP_STACK_info) { + W_ ap_stackinfo; + ap_stackinfo = %INFO_PTR(UNTAG(ap_stack)); + prim_read_barrier; + if (ap_stackinfo == stg_AP_STACK_info) { return (1,StgAP_STACK_payload(ap_stack,offset)); } else { return (0,ap_stack); diff --git a/rts/RaiseAsync.c b/rts/RaiseAsync.c index f58f9177c8c31fb14b984e9ee35827b7622b90bb..807c3e3d30a2783c3234ab758312980674947e7f 100644 --- a/rts/RaiseAsync.c +++ b/rts/RaiseAsync.c @@ -870,6 +870,7 @@ raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception, ap->payload[i] = (StgClosure *)*sp++; } + write_barrier(); // XXX: Necessary? 
SET_HDR(ap,&stg_AP_STACK_info, ((StgClosure *)frame)->header.prof.ccs /* ToDo */); TICK_ALLOC_UP_THK(WDS(words+1),0); diff --git a/rts/Sparks.c b/rts/Sparks.c index bd5e120863054a41178c0af23696fb092d3269ff..4022691da2a10a8ff25434e3386c97fcb8183eb3 100644 --- a/rts/Sparks.c +++ b/rts/Sparks.c @@ -182,6 +182,7 @@ pruneSparkQueue (Capability *cap) traceEventSparkFizzle(cap); } else { info = spark->header.info; + load_load_barrier(); if (IS_FORWARDING_PTR(info)) { tmp = (StgClosure*)UN_FORWARDING_PTR(info); /* if valuable work: shift inside the pool */ diff --git a/rts/StgMiscClosures.cmm b/rts/StgMiscClosures.cmm index fdd9f1565eb592ee6ffc5633f9801830585d1656..e80ce45172cc9e7d3d63a8c90946b106281c0612 100644 --- a/rts/StgMiscClosures.cmm +++ b/rts/StgMiscClosures.cmm @@ -292,12 +292,14 @@ INFO_TABLE(stg_BLACKHOLE,1,0,BLACKHOLE,"BLACKHOLE","BLACKHOLE") TICK_ENT_DYN_IND(); /* tick */ retry: + prim_read_barrier; p = StgInd_indirectee(node); if (GETTAG(p) != 0) { return (p); } info = StgHeader_info(p); + prim_read_barrier; if (info == stg_IND_info) { // This could happen, if e.g. we got a BLOCKING_QUEUE that has // just been replaced with an IND by another thread in @@ -313,9 +315,11 @@ retry: ("ptr" msg) = ccall allocate(MyCapability() "ptr", BYTES_TO_WDS(SIZEOF_MessageBlackHole)); - SET_HDR(msg, stg_MSG_BLACKHOLE_info, CCS_SYSTEM); MessageBlackHole_tso(msg) = CurrentTSO; MessageBlackHole_bh(msg) = node; + SET_HDR(msg, stg_MSG_BLACKHOLE_info, CCS_SYSTEM); + // messageBlackHole has appropriate memory barriers when this object is exposed. + // See Note [Heap memory barriers]. (r) = ccall messageBlackHole(MyCapability() "ptr", msg "ptr"); diff --git a/rts/ThreadPaused.c b/rts/ThreadPaused.c index a916891aa8cc37b29c2157de3e17e924adc2f7a6..cccc7ad0b0a241be52eba7f8fca97163b80a5e41 100644 --- a/rts/ThreadPaused.c +++ b/rts/ThreadPaused.c @@ -195,6 +195,7 @@ threadPaused(Capability *cap, StgTSO *tso) const StgRetInfoTable *info; const StgInfoTable *bh_info; const StgInfoTable *cur_bh_info USED_IF_THREADS; + const StgInfoTable *frame_info; StgClosure *bh; StgPtr stack_end; uint32_t words_to_squeeze = 0; @@ -218,6 +219,8 @@ threadPaused(Capability *cap, StgTSO *tso) frame = (StgClosure *)tso->stackobj->sp; + // N.B. We know that the TSO is owned by the current capability so no + // memory barriers are needed here. while ((P_)frame < stack_end) { info = get_ret_itbl(frame); @@ -226,7 +229,8 @@ threadPaused(Capability *cap, StgTSO *tso) case UPDATE_FRAME: // If we've already marked this frame, then stop here. 
- if (frame->header.info == (StgInfoTable *)&stg_marked_upd_frame_info) { + frame_info = frame->header.info; + if (frame_info == (StgInfoTable *)&stg_marked_upd_frame_info) { if (prev_was_update_frame) { words_to_squeeze += sizeofW(StgUpdateFrame); weight += weight_pending; diff --git a/rts/Threads.c b/rts/Threads.c index 977635322d82defac33875606c1a1b5df974cbd9..2bdcea1c0056c0f96597b1d8d1abdce339321d1a 100644 --- a/rts/Threads.c +++ b/rts/Threads.c @@ -126,6 +126,8 @@ createThread(Capability *cap, W_ size) ACQUIRE_LOCK(&sched_mutex); tso->id = next_thread_id++; // while we have the mutex tso->global_link = g0->threads; + /* Mutations above need no memory barrier since this lock will provide + * a release barrier */ g0->threads = tso; RELEASE_LOCK(&sched_mutex); @@ -257,8 +259,10 @@ tryWakeupThread (Capability *cap, StgTSO *tso) { MessageWakeup *msg; msg = (MessageWakeup *)allocate(cap,sizeofW(MessageWakeup)); - SET_HDR(msg, &stg_MSG_TRY_WAKEUP_info, CCS_SYSTEM); msg->tso = tso; + SET_HDR(msg, &stg_MSG_TRY_WAKEUP_info, CCS_SYSTEM); + // Ensure that writes constructing Message are committed before sending. + write_barrier(); sendMessage(cap, tso->cap, (Message*)msg); debugTraceCap(DEBUG_sched, cap, "message: try wakeup thread %ld on cap %d", (W_)tso->id, tso->cap->no); @@ -363,6 +367,7 @@ wakeBlockingQueue(Capability *cap, StgBlockingQueue *bq) for (msg = bq->queue; msg != (MessageBlackHole*)END_TSO_QUEUE; msg = msg->link) { i = msg->header.info; + load_load_barrier(); if (i != &stg_IND_info) { ASSERT(i == &stg_MSG_BLACKHOLE_info); tryWakeupThread(cap,msg->tso); @@ -392,15 +397,18 @@ checkBlockingQueues (Capability *cap, StgTSO *tso) for (bq = tso->bq; bq != (StgBlockingQueue*)END_TSO_QUEUE; bq = next) { next = bq->link; - if (bq->header.info == &stg_IND_info) { + const StgInfoTable *bqinfo = bq->header.info; + load_load_barrier(); // XXX: Is this needed? + if (bqinfo == &stg_IND_info) { // ToDo: could short it out right here, to avoid // traversing this IND multiple times. continue; } p = bq->bh; - - if (p->header.info != &stg_BLACKHOLE_info || + const StgInfoTable *pinfo = p->header.info; + load_load_barrier(); + if (pinfo != &stg_BLACKHOLE_info || ((StgInd *)p)->indirectee != (StgClosure*)bq) { wakeBlockingQueue(cap,bq); @@ -424,6 +432,7 @@ updateThunk (Capability *cap, StgTSO *tso, StgClosure *thunk, StgClosure *val) const StgInfoTable *i; i = thunk->header.info; + load_load_barrier(); if (i != &stg_BLACKHOLE_info && i != &stg_CAF_BLACKHOLE_info && i != &__stg_EAGER_BLACKHOLE_info && @@ -444,6 +453,7 @@ updateThunk (Capability *cap, StgTSO *tso, StgClosure *thunk, StgClosure *val) } i = v->header.info; + load_load_barrier(); if (i == &stg_TSO_info) { checkBlockingQueues(cap, tso); return; @@ -667,6 +677,8 @@ threadStackOverflow (Capability *cap, StgTSO *tso) new_stack->sp -= chunk_words; } + // No write barriers needed; all of the writes above are to structured + // owned by our capability. 
tso->stackobj = new_stack; // we're about to run it, better mark it dirty @@ -738,6 +750,7 @@ threadStackUnderflow (Capability *cap, StgTSO *tso) bool performTryPutMVar(Capability *cap, StgMVar *mvar, StgClosure *value) { const StgInfoTable *info; + const StgInfoTable *qinfo; StgMVarTSOQueue *q; StgTSO *tso; @@ -762,8 +775,11 @@ loop: unlockClosure((StgClosure*)mvar, &stg_MVAR_DIRTY_info); return true; } - if (q->header.info == &stg_IND_info || - q->header.info == &stg_MSG_NULL_info) { + + qinfo = q->header.info; + load_load_barrier(); + if (qinfo == &stg_IND_info || + qinfo == &stg_MSG_NULL_info) { q = (StgMVarTSOQueue*)((StgInd*)q)->indirectee; goto loop; } diff --git a/rts/TopHandler.c b/rts/TopHandler.c index c0ac936b85ea5db318115711b2c9c1122e9ea740..d5175015e776b2ba92c7b930ac64857267efa427 100644 --- a/rts/TopHandler.c +++ b/rts/TopHandler.c @@ -29,6 +29,7 @@ StgTSO *getTopHandlerThread(void) { StgWeak *weak = (StgWeak*)deRefStablePtr(topHandlerPtr); RELEASE_LOCK(&m); const StgInfoTable *info = weak->header.info; + load_load_barrier(); if (info == &stg_WEAK_info) { StgClosure *key = ((StgWeak*)weak)->key; diff --git a/rts/Updates.h b/rts/Updates.h index 1ba398bd354d35540faf8526e1dd8a38d2ed231d..1bd3e065afbb11348f03d805b0857f71526f36ce 100644 --- a/rts/Updates.h +++ b/rts/Updates.h @@ -39,10 +39,16 @@ PROF_HDR_FIELDS(w_,ccs,p2) \ p_ updatee - +/* + * Getting the memory barriers correct here is quite tricky. Essentially + * the write barrier ensures that any writes to the new indirectee are visible + * before we introduce the indirection. + * See Note [Heap memory barriers] in SMP.h. + */ #define updateWithIndirection(p1, p2, and_then) \ W_ bd; \ \ + prim_write_barrier; \ OVERWRITING_CLOSURE(p1); \ StgInd_indirectee(p1) = p2; \ prim_write_barrier; \ @@ -69,6 +75,8 @@ INLINE_HEADER void updateWithIndirection (Capability *cap, ASSERT( (P_)p1 != (P_)p2 ); /* not necessarily true: ASSERT( !closure_IND(p1) ); */ /* occurs in RaiseAsync.c:raiseAsync() */ + /* See Note [Heap memory barriers] in SMP.h */ + write_barrier(); OVERWRITING_CLOSURE(p1); ((StgInd *)p1)->indirectee = p2; write_barrier(); diff --git a/rts/Weak.c b/rts/Weak.c index a322d822afccfe575cd073a8a535450e4c6c1032..ec998c214f46e9194ea02fa11d7cf183392b61e3 100644 --- a/rts/Weak.c +++ b/rts/Weak.c @@ -57,7 +57,9 @@ runAllCFinalizers(StgWeak *list) // If there's no major GC between the time that the finalizer for the // object from the oldest generation is manually called and shutdown // we end up running the same finalizer twice. See #7170. - if (w->header.info != &stg_DEAD_WEAK_info) { + const StgInfoTable *winfo = w->header.info; + load_load_barrier(); + if (winfo != &stg_DEAD_WEAK_info) { runCFinalizers((StgCFinalizerList *)w->cfinalizers); } } @@ -138,6 +140,7 @@ scheduleFinalizers(Capability *cap, StgWeak *list) size = n + mutArrPtrsCardTableSize(n); arr = (StgMutArrPtrs *)allocate(cap, sizeofW(StgMutArrPtrs) + size); TICK_ALLOC_PRIM(sizeofW(StgMutArrPtrs), n, 0); + // No write barrier needed here; this array is only going to referred to by this core. 
SET_HDR(arr, &stg_MUT_ARR_PTRS_FROZEN_CLEAN_info, CCS_SYSTEM); arr->ptrs = n; arr->size = size; diff --git a/rts/sm/CNF.c b/rts/sm/CNF.c index d0447f867c02a265e950aaa70c57fe7a4e715748..0432505cd2f23221e12c017d01b334c2db867154 100644 --- a/rts/sm/CNF.c +++ b/rts/sm/CNF.c @@ -542,8 +542,9 @@ insertCompactHash (Capability *cap, StgClosure *p, StgClosure *to) { insertHashTable(str->hash, (StgWord)p, (const void*)to); - if (str->header.info == &stg_COMPACT_NFDATA_CLEAN_info) { - str->header.info = &stg_COMPACT_NFDATA_DIRTY_info; + const StgInfoTable *strinfo = str->header.info; + if (strinfo == &stg_COMPACT_NFDATA_CLEAN_info) { + strinfo = &stg_COMPACT_NFDATA_DIRTY_info; recordClosureMutated(cap, (StgClosure*)str); } } diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c index 2a2a26ec091b8e7aaf62ab396f7ad7d538efd2bf..7c82caa185610c6aa763bef60201451114259cc3 100644 --- a/rts/sm/Evac.c +++ b/rts/sm/Evac.c @@ -131,7 +131,7 @@ copy_tag(StgClosure **p, const StgInfoTable *info, #else src->header.info = (const StgInfoTable *)MK_FORWARDING_PTR(to); *p = TAG_CLOSURE(tag,(StgClosure*)to); -#endif +#endif /* defined(PARALLEL_GC) */ #if defined(PROFILING) // We store the size of the just evacuated object in the LDV word so that @@ -194,7 +194,7 @@ spin: if (info == (W_)&stg_WHITEHOLE_info) { #if defined(PROF_SPIN) whitehole_gc_spin++; -#endif +#endif /* PROF_SPIN */ busy_wait_nop(); goto spin; } @@ -205,7 +205,7 @@ spin: } #else info = (W_)src->header.info; -#endif +#endif /* PARALLEL_GC */ to = alloc_for_copy(size_to_reserve, gen_no); @@ -216,8 +216,8 @@ spin: } write_barrier(); - src->header.info = (const StgInfoTable*)MK_FORWARDING_PTR(to); *p = (StgClosure *)to; + src->header.info = (const StgInfoTable*)MK_FORWARDING_PTR(to); #if defined(PROFILING) // We store the size of the just evacuated object in the LDV word so that @@ -1099,6 +1099,7 @@ selector_chain: // need the write-barrier stuff. // - undo the chain we've built to point to p. SET_INFO((StgClosure *)p, (const StgInfoTable *)info_ptr); + write_barrier(); *q = (StgClosure *)p; if (evac) evacuate(q); unchain_thunk_selectors(prev_thunk_selector, (StgClosure *)p); @@ -1109,7 +1110,7 @@ selector_chain: // Save the real info pointer (NOTE: not the same as get_itbl()). info_ptr = (StgWord)p->header.info; SET_INFO((StgClosure *)p,&stg_WHITEHOLE_info); -#endif +#endif /* THREADED_RTS */ field = INFO_PTR_TO_STRUCT((StgInfoTable *)info_ptr)->layout.selector_offset; @@ -1165,6 +1166,7 @@ selector_loop: SET_INFO((StgClosure*)p, (StgInfoTable *)info_ptr); OVERWRITING_CLOSURE((StgClosure*)p); SET_INFO((StgClosure*)p, &stg_WHITEHOLE_info); + write_barrier(); } #endif diff --git a/rts/sm/GC.c b/rts/sm/GC.c index 4bf540a4a77ffdd62e83c362f03374a6b5899a96..92a5e229a1c209dffa123d429c4f6838d1948d6a 100644 --- a/rts/sm/GC.c +++ b/rts/sm/GC.c @@ -1104,6 +1104,8 @@ loop: // scavenge_loop() only exits when there's no work to do + // This atomic decrement also serves as a full barrier to ensure that any + // writes we made during scavenging are visible to other threads. 
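A minimal sketch of why that decrement is enough, expressed with C11 atomics rather than the RTS's own primitives (gc_running_threads here is a stand-in counter, not necessarily the exact field that dec_running() updates):

    #include <stdatomic.h>

    static atomic_uint gc_running_threads;   /* stand-in for the RTS counter */

    /* Worker: all scavenging writes happen-before this read-modify-write,
     * which acts as a full barrier, like the RTS's atomic decrement. */
    void finish_scavenging(void)
    {
        atomic_fetch_sub(&gc_running_threads, 1);
    }

    /* Main GC thread: once it observes the counter reach zero, every
     * worker's scavenging writes are guaranteed to be visible. */
    void wait_for_workers(void)
    {
        while (atomic_load(&gc_running_threads) != 0) { /* spin */ }
    }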
#if defined(DEBUG) r = dec_running(); #else diff --git a/rts/sm/GCAux.c b/rts/sm/GCAux.c index e8ca0c4002a0dc8aec615e20637d7bfda2972307..650dc2c1dfbab2bc0d6c103a27522a502e2a1d2a 100644 --- a/rts/sm/GCAux.c +++ b/rts/sm/GCAux.c @@ -83,6 +83,7 @@ isAlive(StgClosure *p) } info = INFO_PTR_TO_STRUCT(info); + load_load_barrier(); switch (info->type) { diff --git a/rts/sm/MarkWeak.c b/rts/sm/MarkWeak.c index d5982e2f64686cb8ba91445c96931053a0bd34b5..7475b5e6256d3b51203a8a9e2370468f96e649e2 100644 --- a/rts/sm/MarkWeak.c +++ b/rts/sm/MarkWeak.c @@ -234,16 +234,22 @@ static bool tidyWeakList(generation *gen) last_w = &gen->old_weak_ptr_list; for (w = gen->old_weak_ptr_list; w != NULL; w = next_w) { + info = w->header.info; + /* N.B. This function is executed only during the serial part of GC + * so consequently there is no potential for data races and therefore + * no need for memory barriers. + */ + /* There might be a DEAD_WEAK on the list if finalizeWeak# was * called on a live weak pointer object. Just remove it. */ - if (w->header.info == &stg_DEAD_WEAK_info) { + if (info == &stg_DEAD_WEAK_info) { next_w = w->link; *last_w = next_w; continue; } - info = get_itbl((StgClosure *)w); + info = INFO_PTR_TO_STRUCT(info); switch (info->type) { case WEAK: diff --git a/rts/sm/Sanity.c b/rts/sm/Sanity.c index 8082b7e6d08978edd10a351de40b1d7f4d934a58..ff76f747c98780a4067bba54a1d20738c673d3d2 100644 --- a/rts/sm/Sanity.c +++ b/rts/sm/Sanity.c @@ -233,6 +233,7 @@ checkClosure( const StgClosure* p ) p = UNTAG_CONST_CLOSURE(p); info = p->header.info; + load_load_barrier(); if (IS_FORWARDING_PTR(info)) { barf("checkClosure: found EVACUATED closure %d", info->type); @@ -243,6 +244,7 @@ checkClosure( const StgClosure* p ) #endif info = INFO_PTR_TO_STRUCT(info); + load_load_barrier(); switch (info->type) { @@ -564,6 +566,7 @@ checkTSO(StgTSO *tso) next = tso->_link; info = (const StgInfoTable*) tso->_link->header.info; + load_load_barrier(); ASSERT(next == END_TSO_QUEUE || info == &stg_MVAR_TSO_QUEUE_info || diff --git a/rts/sm/Scav.c b/rts/sm/Scav.c index 73a790e9eda2ca5e28735d250f5b30d0646d132e..c486cd96c5867e79926dfbb2702732cdd8918b44 100644 --- a/rts/sm/Scav.c +++ b/rts/sm/Scav.c @@ -431,7 +431,7 @@ scavenge_block (bdescr *bd) // time around the loop. 
while (p < bd->free || (bd == ws->todo_bd && p < ws->todo_free)) { - ASSERT(bd->link == NULL); + ASSERT(bd->link == NULL); ASSERT(LOOKS_LIKE_CLOSURE_PTR(p)); info = get_itbl((StgClosure *)p); @@ -1580,6 +1580,7 @@ scavenge_mutable_list(bdescr *bd, generation *gen) ASSERT(LOOKS_LIKE_CLOSURE_PTR(p)); #if defined(DEBUG) + const StgInfoTable *pinfo; switch (get_itbl((StgClosure *)p)->type) { case MUT_VAR_CLEAN: // can happen due to concurrent writeMutVars @@ -1599,9 +1600,10 @@ scavenge_mutable_list(bdescr *bd, generation *gen) case TREC_CHUNK: mutlist_TREC_CHUNK++; break; case MUT_PRIM: - if (((StgClosure*)p)->header.info == &stg_TVAR_WATCH_QUEUE_info) + pinfo = ((StgClosure*)p)->header.info; + if (pinfo == &stg_TVAR_WATCH_QUEUE_info) mutlist_TVAR_WATCH_QUEUE++; - else if (((StgClosure*)p)->header.info == &stg_TREC_HEADER_info) + else if (pinfo == &stg_TREC_HEADER_info) mutlist_TREC_HEADER++; else mutlist_OTHERS++; diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c index 2e03b776956a636a029b7c845b302bff28433dfc..3f91905f3c504ce72982366e9a2e73e18361567d 100644 --- a/rts/sm/Storage.c +++ b/rts/sm/Storage.c @@ -407,8 +407,10 @@ lockCAF (StgRegTable *reg, StgIndStatic *caf) // Allocate the blackhole indirection closure bh = (StgInd *)allocate(cap, sizeofW(*bh)); - SET_HDR(bh, &stg_CAF_BLACKHOLE_info, caf->header.prof.ccs); bh->indirectee = (StgClosure *)cap->r.rCurrentTSO; + SET_HDR(bh, &stg_CAF_BLACKHOLE_info, caf->header.prof.ccs); + // Ensure that above writes are visible before we introduce reference as CAF indirectee. + write_barrier(); caf->indirectee = (StgClosure *)bh; write_barrier(); @@ -1081,6 +1083,8 @@ void dirty_MUT_VAR(StgRegTable *reg, StgClosure *p) { Capability *cap = regTableToCapability(reg); + // No barrier required here as no other heap object fields are read. See + // note [Heap memory barriers] in SMP.h. if (p->header.info == &stg_MUT_VAR_CLEAN_info) { p->header.info = &stg_MUT_VAR_DIRTY_info; recordClosureMutated(cap,p); @@ -1090,6 +1094,8 @@ dirty_MUT_VAR(StgRegTable *reg, StgClosure *p) void dirty_TVAR(Capability *cap, StgTVar *p) { + // No barrier required here as no other heap object fields are read. See + // note [Heap memory barriers] in SMP.h. if (p->header.info == &stg_TVAR_CLEAN_info) { p->header.info = &stg_TVAR_DIRTY_info; recordClosureMutated(cap,(StgClosure*)p);
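Taken together, the writer-side pattern that this patch enforces wherever an existing closure is updated in place (see Note [Heap memory barriers] and rts/Updates.h) can be summarised with one more stand-alone C sketch; struct thunk, BLACKHOLE_info and update_with_indirection are illustrative stand-ins, and the release fence models write_barrier():

    #include <stdatomic.h>

    /* Stand-in for an updateable closure (the RTS uses StgInd/StgThunk). */
    struct thunk {
        const void *info;
        void       *indirectee;
    };

    const void *BLACKHOLE_info;   /* stand-in for the blackhole info table */

    void update_with_indirection(struct thunk *t, void *value)
    {
        t->indirectee = value;                       /* payload first      */
        atomic_thread_fence(memory_order_release);   /* write_barrier()    */
        t->info = BLACKHOLE_info;                    /* info pointer last  */
    }

Paired with the acquire-side sketch earlier (info pointer, read barrier, then fields), this is exactly the ordering that prim_write_barrier/prim_read_barrier in C-- and the explicit write_barrier()/load_load_barrier() calls added throughout this patch provide.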