From bd5a1f9156ee4e405e52a1ec4789dada5e98336d Mon Sep 17 00:00:00 2001 From: Cheng Shao <terrorjack@type.dance> Date: Mon, 12 Feb 2024 16:06:20 +0000 Subject: [PATCH] compiler: add SEQ_CST fence support In addition to existing Acquire/Release fences, this commit adds SEQ_CST fence support to GHC, allowing Cmm code to explicitly emit a fence that enforces total memory ordering. The following logic is added: - The MO_SeqCstFence callish MachOp - The %prim fence_seq_cst() Cmm syntax and the SEQ_CST_FENCE macro in Cmm.h - MO_SeqCstFence lowering logic in every single GHC codegen backend --- compiler/GHC/Cmm/MachOp.hs | 1 + compiler/GHC/Cmm/Parser.y | 2 ++ compiler/GHC/CmmToAsm/AArch64/CodeGen.hs | 1 + compiler/GHC/CmmToAsm/PPC/CodeGen.hs | 3 +++ compiler/GHC/CmmToAsm/Wasm/FromCmm.hs | 1 + compiler/GHC/CmmToAsm/X86/CodeGen.hs | 2 +- compiler/GHC/CmmToC.hs | 3 +++ compiler/GHC/CmmToLlvm/CodeGen.hs | 5 +++++ rts/include/Cmm.h | 1 + 9 files changed, 18 insertions(+), 1 deletion(-) diff --git a/compiler/GHC/Cmm/MachOp.hs b/compiler/GHC/Cmm/MachOp.hs index 0e7a267afbd9..81abe1a5d7b3 100644 --- a/compiler/GHC/Cmm/MachOp.hs +++ b/compiler/GHC/Cmm/MachOp.hs @@ -727,6 +727,7 @@ data CallishMachOp | MO_AcquireFence | MO_ReleaseFence + | MO_SeqCstFence -- | Atomic read-modify-write. Arguments are @[dest, n]@. | MO_AtomicRMW Width AtomicMachOp diff --git a/compiler/GHC/Cmm/Parser.y b/compiler/GHC/Cmm/Parser.y index 07847af27427..18c34896cd05 100644 --- a/compiler/GHC/Cmm/Parser.y +++ b/compiler/GHC/Cmm/Parser.y @@ -1133,6 +1133,8 @@ callishMachOps platform = listToUFM $ -- with an overlapping token ('acquire') in the lexer. ( "fence_acquire", (MO_AcquireFence,)), ( "fence_release", (MO_ReleaseFence,)), + ( "fence_seq_cst", (MO_SeqCstFence,)), + ( "memcpy", memcpyLikeTweakArgs MO_Memcpy ), ( "memset", memcpyLikeTweakArgs MO_Memset ), ( "memmove", memcpyLikeTweakArgs MO_Memmove ), diff --git a/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs b/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs index b922195404ed..65c1098202dd 100644 --- a/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs +++ b/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs @@ -1730,6 +1730,7 @@ genCCall target dest_regs arg_regs bid = do -- Memory Ordering MO_AcquireFence -> return (unitOL DMBISH, Nothing) MO_ReleaseFence -> return (unitOL DMBISH, Nothing) + MO_SeqCstFence -> return (unitOL DMBISH, Nothing) MO_Touch -> return (nilOL, Nothing) -- Keep variables live (when using interior pointers) -- Prefetch MO_Prefetch_Data _n -> return (nilOL, Nothing) -- Prefetch hint. diff --git a/compiler/GHC/CmmToAsm/PPC/CodeGen.hs b/compiler/GHC/CmmToAsm/PPC/CodeGen.hs index 9082616bead4..3877e33e5520 100644 --- a/compiler/GHC/CmmToAsm/PPC/CodeGen.hs +++ b/compiler/GHC/CmmToAsm/PPC/CodeGen.hs @@ -1132,6 +1132,8 @@ genCCall (PrimTarget MO_AcquireFence) _ _ = return $ unitOL LWSYNC genCCall (PrimTarget MO_ReleaseFence) _ _ = return $ unitOL LWSYNC +genCCall (PrimTarget MO_SeqCstFence) _ _ + = return $ unitOL HWSYNC genCCall (PrimTarget MO_Touch) _ _ = return $ nilOL @@ -2098,6 +2100,7 @@ genCCall' config gcp target dest_regs args MO_U_Mul2 {} -> unsupported MO_AcquireFence -> unsupported MO_ReleaseFence -> unsupported + MO_SeqCstFence -> unsupported MO_Touch -> unsupported MO_Prefetch_Data _ -> unsupported unsupported = panic ("outOfLineCmmOp: " ++ show mop diff --git a/compiler/GHC/CmmToAsm/Wasm/FromCmm.hs b/compiler/GHC/CmmToAsm/Wasm/FromCmm.hs index d1f4d203f26c..92e96ab83bf2 100644 --- a/compiler/GHC/CmmToAsm/Wasm/FromCmm.hs +++ b/compiler/GHC/CmmToAsm/Wasm/FromCmm.hs @@ -1189,6 +1189,7 @@ lower_CallishMachOp lbl MO_F32_Sqrt rs xs = lower_CMO_Un_Homo lbl "sqrtf" rs xs lower_CallishMachOp lbl (MO_UF_Conv w0) rs xs = lower_MO_UF_Conv lbl w0 rs xs lower_CallishMachOp _ MO_AcquireFence _ _ = pure $ WasmStatements WasmNop lower_CallishMachOp _ MO_ReleaseFence _ _ = pure $ WasmStatements WasmNop +lower_CallishMachOp _ MO_SeqCstFence _ _ = pure $ WasmStatements WasmNop lower_CallishMachOp _ MO_Touch _ _ = pure $ WasmStatements WasmNop lower_CallishMachOp _ (MO_Prefetch_Data {}) _ _ = pure $ WasmStatements WasmNop lower_CallishMachOp lbl (MO_Memcpy {}) [] xs = do diff --git a/compiler/GHC/CmmToAsm/X86/CodeGen.hs b/compiler/GHC/CmmToAsm/X86/CodeGen.hs index afbc72f97236..a1744566addc 100644 --- a/compiler/GHC/CmmToAsm/X86/CodeGen.hs +++ b/compiler/GHC/CmmToAsm/X86/CodeGen.hs @@ -2403,6 +2403,7 @@ genSimplePrim bid (MO_Memcmp align) [res] [dst,src,n] = genMemCmp bid a genSimplePrim bid (MO_Memset align) [] [dst,c,n] = genMemSet bid align dst c n genSimplePrim _ MO_AcquireFence [] [] = return nilOL -- barriers compile to no code on x86/x86-64; genSimplePrim _ MO_ReleaseFence [] [] = return nilOL -- we keep it this long in order to prevent earlier optimisations. +genSimplePrim _ MO_SeqCstFence [] [] = return $ unitOL MFENCE genSimplePrim _ MO_Touch [] [_] = return nilOL genSimplePrim _ (MO_Prefetch_Data n) [] [src] = genPrefetchData n src genSimplePrim _ (MO_BSwap width) [dst] [src] = genByteSwap width dst src @@ -4667,4 +4668,3 @@ genPred64 cond dst x y = do , SETCC cond (OpReg dst_r) , MOVZxL II8 (OpReg dst_r) (OpReg dst_r) ] - diff --git a/compiler/GHC/CmmToC.hs b/compiler/GHC/CmmToC.hs index e848dbca0aec..e646ef1e709b 100644 --- a/compiler/GHC/CmmToC.hs +++ b/compiler/GHC/CmmToC.hs @@ -265,6 +265,8 @@ pprStmt platform stmt = text "__atomic_thread_fence(__ATOMIC_RELEASE);" CmmUnsafeForeignCall (PrimTarget MO_AcquireFence) [] [] -> text "__atomic_thread_fence(__ATOMIC_ACQUIRE);" + CmmUnsafeForeignCall (PrimTarget MO_SeqCstFence) [] [] -> + text "__atomic_thread_fence(__ATOMIC_SEQ_CST);" CmmUnsafeForeignCall target@(PrimTarget op) results args -> fn_call @@ -959,6 +961,7 @@ pprCallishMachOp_for_C mop MO_F32_Fabs -> text "fabsf" MO_AcquireFence -> unsupported MO_ReleaseFence -> unsupported + MO_SeqCstFence -> unsupported MO_Memcpy _ -> text "__builtin_memcpy" MO_Memset _ -> text "__builtin_memset" MO_Memmove _ -> text "__builtin_memmove" diff --git a/compiler/GHC/CmmToLlvm/CodeGen.hs b/compiler/GHC/CmmToLlvm/CodeGen.hs index 168437ae9bf6..6d94f2c90506 100644 --- a/compiler/GHC/CmmToLlvm/CodeGen.hs +++ b/compiler/GHC/CmmToLlvm/CodeGen.hs @@ -180,6 +180,8 @@ genCall (PrimTarget MO_AcquireFence) _ _ = runStmtsDecls $ statement $ Fence False SyncAcquire genCall (PrimTarget MO_ReleaseFence) _ _ = runStmtsDecls $ statement $ Fence False SyncRelease +genCall (PrimTarget MO_SeqCstFence) _ _ = runStmtsDecls $ + statement $ Fence False SyncSeqCst genCall (PrimTarget MO_Touch) _ _ = return (nilOL, []) @@ -992,8 +994,11 @@ cmmPrimOpFunctions mop = do -- We support MO_U_Mul2 through ordinary LLVM mul instruction, see the -- appropriate case of genCall. MO_U_Mul2 {} -> unsupported + MO_ReleaseFence -> unsupported MO_AcquireFence -> unsupported + MO_SeqCstFence -> unsupported + MO_Touch -> unsupported MO_UF_Conv _ -> unsupported diff --git a/rts/include/Cmm.h b/rts/include/Cmm.h index 598b5de8a00a..780a6eb8f1fa 100644 --- a/rts/include/Cmm.h +++ b/rts/include/Cmm.h @@ -696,6 +696,7 @@ // See Note [ThreadSanitizer and fences] #define RELEASE_FENCE prim %fence_release(); #define ACQUIRE_FENCE prim %fence_acquire(); +#define SEQ_CST_FENCE prim %fence_seq_cst(); #if TSAN_ENABLED // This is may be efficient than a fence but TSAN can reason about it. -- GitLab