Commit 4ee4ab01 authored by tibbe

Re-add more primops for atomic ops on byte arrays

This is the second attempt to add this functionality. The first
attempt was reverted in 950fcae4 due to a register allocator failure
on x86: given how the register allocator currently works, we don't
have enough registers on x86 to support cmpxchg using complicated
addressing modes. Instead we fall back to a simpler addressing mode
on x86.

Adds the following primops:

 * atomicReadIntArray#
 * atomicWriteIntArray#
 * fetchSubIntArray#
 * fetchOrIntArray#
 * fetchXorIntArray#
 * fetchAndIntArray#

Makes these pre-existing out-of-line primops inline:

 * fetchAddIntArray#
 * casIntArray#
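
As a rough usage sketch (not part of this patch), the new primops are
callable from Haskell via GHC.Exts; the unboxed-tuple plumbing is the
usual pattern:

    {-# LANGUAGE MagicHash, UnboxedTuples #-}
    import GHC.Exts
    import GHC.IO (IO(..))

    -- Atomically add n to the Int element at index i, returning the
    -- element's previous value; implies a full memory barrier.
    fetchAddInt :: MutableByteArray# RealWorld -> Int -> Int -> IO Int
    fetchAddInt mba (I# i#) (I# n#) = IO $ \s ->
      case fetchAddIntArray# mba i# n# s of
        (# s', old# #) -> (# s', I# old# #)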
parent aed1723f
......@@ -19,6 +19,9 @@ module CmmMachOp
-- CallishMachOp
, CallishMachOp(..), callishMachOpHints
, pprCallishMachOp
-- Atomic read-modify-write
, AtomicMachOp(..)
)
where
......@@ -547,8 +550,24 @@ data CallishMachOp
| MO_PopCnt Width
| MO_BSwap Width
-- Atomic read-modify-write.
| MO_AtomicRMW Width AtomicMachOp
| MO_AtomicRead Width
| MO_AtomicWrite Width
| MO_Cmpxchg Width
deriving (Eq, Show)
-- | The operation to perform atomically.
data AtomicMachOp =
AMO_Add
| AMO_Sub
| AMO_And
| AMO_Nand
| AMO_Or
| AMO_Xor
deriving (Eq, Show)
pprCallishMachOp :: CallishMachOp -> SDoc
pprCallishMachOp mo = text (show mo)
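
Since pprCallishMachOp just reuses the derived Show instances, the new
constructors print structurally; for example (derivable from the
declarations above):

    -- ghci> show (MO_AtomicRMW W64 AMO_Add)
    -- "MO_AtomicRMW W64 AMO_Add"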
......
......@@ -650,6 +650,10 @@ data AbsMem
-- perhaps we ought to have a special annotation for calls that can
-- modify heap/stack memory. For now we just use the conservative
-- definition here.
--
-- Some CallishMachOps imply a memory barrier (e.g. MO_AtomicRMW), and
-- therefore we should never float any memory operations across one of
-- these calls.
bothMems :: AbsMem -> AbsMem -> AbsMem
......
......@@ -753,6 +753,10 @@ pprCallishMachOp_for_C mop
MO_Memmove -> ptext (sLit "memmove")
(MO_BSwap w) -> ptext (sLit $ bSwapLabel w)
(MO_PopCnt w) -> ptext (sLit $ popCntLabel w)
(MO_AtomicRMW w amop) -> ptext (sLit $ atomicRMWLabel w amop)
(MO_Cmpxchg w) -> ptext (sLit $ cmpxchgLabel w)
(MO_AtomicRead w) -> ptext (sLit $ atomicReadLabel w)
(MO_AtomicWrite w) -> ptext (sLit $ atomicWriteLabel w)
(MO_UF_Conv w) -> ptext (sLit $ word2FloatLabel w)
MO_S_QuotRem {} -> unsupported
......
......@@ -769,6 +769,25 @@ emitPrimOp _ res PrefetchByteArrayOp0 args = doPrefetchByteArrayOp 0 res
emitPrimOp _ res PrefetchMutableByteArrayOp0 args = doPrefetchByteArrayOp 0 res args
emitPrimOp _ res PrefetchAddrOp0 args = doPrefetchAddrOp 0 res args
-- Atomic read-modify-write
emitPrimOp dflags [res] FetchAddByteArrayOp_Int [mba, ix, n] =
doAtomicRMW res AMO_Add mba ix (bWord dflags) n
emitPrimOp dflags [res] FetchSubByteArrayOp_Int [mba, ix, n] =
doAtomicRMW res AMO_Sub mba ix (bWord dflags) n
emitPrimOp dflags [res] FetchAndByteArrayOp_Int [mba, ix, n] =
doAtomicRMW res AMO_And mba ix (bWord dflags) n
emitPrimOp dflags [res] FetchNandByteArrayOp_Int [mba, ix, n] =
doAtomicRMW res AMO_Nand mba ix (bWord dflags) n
emitPrimOp dflags [res] FetchOrByteArrayOp_Int [mba, ix, n] =
doAtomicRMW res AMO_Or mba ix (bWord dflags) n
emitPrimOp dflags [res] FetchXorByteArrayOp_Int [mba, ix, n] =
doAtomicRMW res AMO_Xor mba ix (bWord dflags) n
emitPrimOp dflags [res] AtomicReadByteArrayOp_Int [mba, ix] =
doAtomicReadByteArray res mba ix (bWord dflags)
emitPrimOp dflags [] AtomicWriteByteArrayOp_Int [mba, ix, val] =
doAtomicWriteByteArray mba ix (bWord dflags) val
emitPrimOp dflags [res] CasByteArrayOp_Int [mba, ix, old, new] =
doCasByteArray res mba ix (bWord dflags) old new
-- The rest just translate straightforwardly
emitPrimOp dflags [res] op [arg]
......@@ -1932,6 +1951,81 @@ doWriteSmallPtrArrayOp addr idx val = do
mkBasicIndexedWrite (smallArrPtrsHdrSize dflags) Nothing addr ty idx val
emit (setInfo addr (CmmLit (CmmLabel mkSMAP_DIRTY_infoLabel)))
------------------------------------------------------------------------------
-- Atomic read-modify-write
-- | Emit an atomic modification to a byte array element. The result
-- reg contains the previous value of the element. Implies a full
-- memory barrier.
doAtomicRMW :: LocalReg -- ^ Result reg
-> AtomicMachOp -- ^ Atomic op (e.g. add)
-> CmmExpr -- ^ MutableByteArray#
-> CmmExpr -- ^ Index
-> CmmType -- ^ Type of element by which we are indexing
-> CmmExpr -- ^ Op argument (e.g. amount to add)
-> FCode ()
doAtomicRMW res amop mba idx idx_ty n = do
dflags <- getDynFlags
let width = typeWidth idx_ty
addr = cmmIndexOffExpr dflags (arrWordsHdrSize dflags)
width mba idx
emitPrimCall
[ res ]
(MO_AtomicRMW width amop)
[ addr, n ]
-- | Emit an atomic read from a byte array that acts as a memory barrier.
doAtomicReadByteArray
:: LocalReg -- ^ Result reg
-> CmmExpr -- ^ MutableByteArray#
-> CmmExpr -- ^ Index
-> CmmType -- ^ Type of element by which we are indexing
-> FCode ()
doAtomicReadByteArray res mba idx idx_ty = do
dflags <- getDynFlags
let width = typeWidth idx_ty
addr = cmmIndexOffExpr dflags (arrWordsHdrSize dflags)
width mba idx
emitPrimCall
[ res ]
(MO_AtomicRead width)
[ addr ]
-- | Emit an atomic write to a byte array that acts as a memory barrier.
doAtomicWriteByteArray
:: CmmExpr -- ^ MutableByteArray#
-> CmmExpr -- ^ Index
-> CmmType -- ^ Type of element by which we are indexing
-> CmmExpr -- ^ Value to write
-> FCode ()
doAtomicWriteByteArray mba idx idx_ty val = do
dflags <- getDynFlags
let width = typeWidth idx_ty
addr = cmmIndexOffExpr dflags (arrWordsHdrSize dflags)
width mba idx
emitPrimCall
[ {- no results -} ]
(MO_AtomicWrite width)
[ addr, val ]
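-- | Emit a compare-and-swap on a byte array element. The result reg
-- receives the element's value before the operation. Implies a full
-- memory barrier.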
doCasByteArray
:: LocalReg -- ^ Result reg
-> CmmExpr -- ^ MutableByteArray#
-> CmmExpr -- ^ Index
-> CmmType -- ^ Type of element by which we are indexing
-> CmmExpr -- ^ Old value
-> CmmExpr -- ^ New value
-> FCode ()
doCasByteArray res mba idx idx_ty old new = do
dflags <- getDynFlags
let width = typeWidth idx_ty
addr = cmmIndexOffExpr dflags (arrWordsHdrSize dflags)
width mba idx
emitPrimCall
[ res ]
(MO_Cmpxchg width)
[ addr, old, new ]
------------------------------------------------------------------------------
-- Helpers for emitting function calls
......
......@@ -65,6 +65,8 @@ data LlvmFunction = LlvmFunction {
type LlvmFunctions = [LlvmFunction]
type SingleThreaded = Bool
-- | LLVM ordering types for synchronization purposes (introduced in
-- LLVM 3.0). See the LLVM documentation for details.
data LlvmSyncOrdering
......@@ -223,6 +225,11 @@ data LlvmExpression
-}
| Load LlvmVar
{- |
Atomic load of the value at location ptr
-}
| ALoad LlvmSyncOrdering SingleThreaded LlvmVar
{- |
Navigate in a structure, selecting elements
* inbound: Is the pointer inbounds? (computed pointer doesn't overflow)
......
......@@ -239,6 +239,7 @@ ppLlvmExpression expr
Insert vec elt idx -> ppInsert vec elt idx
GetElemPtr inb ptr indexes -> ppGetElementPtr inb ptr indexes
Load ptr -> ppLoad ptr
ALoad ord st ptr -> ppALoad ord st ptr
Malloc tp amount -> ppMalloc tp amount
Phi tp precessors -> ppPhi tp precessors
Asm asm c ty v se sk -> ppAsm asm c ty v se sk
......@@ -327,13 +328,18 @@ ppSyncOrdering SyncSeqCst = text "seq_cst"
-- of specifying alignment.
ppLoad :: LlvmVar -> SDoc
ppLoad var
| isVecPtrVar var = text "load" <+> ppr var <>
comma <+> text "align 1"
| otherwise = text "load" <+> ppr var
ppLoad var = text "load" <+> ppr var <> align
where
isVecPtrVar :: LlvmVar -> Bool
isVecPtrVar = isVector . pLower . getVarType
align | isVector . pLower . getVarType $ var = text ", align 1"
| otherwise = empty
ppALoad :: LlvmSyncOrdering -> SingleThreaded -> LlvmVar -> SDoc
ppALoad ord st var = sdocWithDynFlags $ \dflags ->
let alignment = (llvmWidthInBits dflags $ getVarType var) `quot` 8
align = text ", align" <+> ppr alignment
sThreaded | st = text " singlethread"
| otherwise = empty
in text "load atomic" <+> ppr var <> sThreaded <+> ppSyncOrdering ord <> align
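
For a seq_cst word-sized load, this should render along the lines of
the following (a sketch, assuming the pre-3.7 LLVM load syntax this
printer targets):

    load atomic i64* %ptr seq_cst, align 8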
ppStore :: LlvmVar -> LlvmVar -> SDoc
ppStore val dst
......
......@@ -15,6 +15,7 @@ import BlockId
import CodeGen.Platform ( activeStgRegs, callerSaves )
import CLabel
import Cmm
import CPrim
import PprCmm
import CmmUtils
import Hoopl
......@@ -32,6 +33,7 @@ import Unique
import Data.List ( nub )
import Data.Maybe ( catMaybes )
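-- | Whether a load should be emitted as an atomic (seq_cst) load;
-- threaded through genLoad and friends below.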
type Atomic = Bool
type LlvmStatements = OrdList LlvmStatement
-- -----------------------------------------------------------------------------
......@@ -228,6 +230,17 @@ genCall t@(PrimTarget (MO_PopCnt w)) dsts args =
genCall t@(PrimTarget (MO_BSwap w)) dsts args =
genCallSimpleCast w t dsts args
genCall (PrimTarget (MO_AtomicRead _)) [dst] [addr] = do
dstV <- getCmmReg (CmmLocal dst)
(v1, stmts, top) <- genLoad True addr (localRegType dst)
let stmt1 = Store v1 dstV
return (stmts `snocOL` stmt1, top)
-- TODO: implement these properly rather than calling out to RTS functions.
-- genCall t@(PrimTarget (MO_AtomicWrite width)) [] [addr, val] = undefined
-- genCall t@(PrimTarget (MO_AtomicRMW width amop)) [dst] [addr, n] = undefined
-- genCall t@(PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] = undefined
-- Handle memcpy function specifically since llvm's intrinsic version takes
-- some extra parameters.
genCall t@(PrimTarget op) [] args'
......@@ -548,7 +561,6 @@ cmmPrimOpFunctions mop = do
(MO_Prefetch_Data _ )-> fsLit "llvm.prefetch"
MO_S_QuotRem {} -> unsupported
MO_U_QuotRem {} -> unsupported
MO_U_QuotRem2 {} -> unsupported
......@@ -558,6 +570,12 @@ cmmPrimOpFunctions mop = do
MO_Touch -> unsupported
MO_UF_Conv _ -> unsupported
MO_AtomicRead _ -> unsupported
MO_AtomicRMW w amop -> fsLit $ atomicRMWLabel w amop
MO_Cmpxchg w -> fsLit $ cmpxchgLabel w
MO_AtomicWrite w -> fsLit $ atomicWriteLabel w
-- | Tail function calls
genJump :: CmmExpr -> [GlobalReg] -> LlvmM StmtData
......@@ -849,7 +867,7 @@ exprToVarOpt opt e = case e of
-> genLit opt lit
CmmLoad e' ty
-> genLoad e' ty
-> genLoad False e' ty
-- A CmmReg in an expression is the value, so we must load it. If you
-- want the actual register pointer, call getCmmReg directly.
......@@ -1268,41 +1286,41 @@ genMachOp_slow _ _ _ = panic "genMachOp: More then 2 expressions in MachOp!"
-- | Handle CmmLoad expression.
genLoad :: CmmExpr -> CmmType -> LlvmM ExprData
genLoad :: Atomic -> CmmExpr -> CmmType -> LlvmM ExprData
-- First we try to detect a few common cases and produce better code for
-- these than the default case. We are mostly trying to detect Cmm code
-- like I32[Sp + n] and use 'getelementptr' operations instead of the
-- generic case that uses casts and pointer arithmetic.
genLoad e@(CmmReg (CmmGlobal r)) ty
= genLoad_fast e r 0 ty
genLoad atomic e@(CmmReg (CmmGlobal r)) ty
= genLoad_fast atomic e r 0 ty
genLoad e@(CmmRegOff (CmmGlobal r) n) ty
= genLoad_fast e r n ty
genLoad atomic e@(CmmRegOff (CmmGlobal r) n) ty
= genLoad_fast atomic e r n ty
genLoad e@(CmmMachOp (MO_Add _) [
genLoad atomic e@(CmmMachOp (MO_Add _) [
(CmmReg (CmmGlobal r)),
(CmmLit (CmmInt n _))])
ty
= genLoad_fast e r (fromInteger n) ty
= genLoad_fast atomic e r (fromInteger n) ty
genLoad e@(CmmMachOp (MO_Sub _) [
genLoad atomic e@(CmmMachOp (MO_Sub _) [
(CmmReg (CmmGlobal r)),
(CmmLit (CmmInt n _))])
ty
= genLoad_fast e r (negate $ fromInteger n) ty
= genLoad_fast atomic e r (negate $ fromInteger n) ty
-- generic case
genLoad e ty
genLoad atomic e ty
= do other <- getTBAAMeta otherN
genLoad_slow e ty other
genLoad_slow atomic e ty other
-- | Handle CmmLoad expression.
-- This is a special case for loading from a global register pointer
-- offset such as I32[Sp+8].
genLoad_fast :: CmmExpr -> GlobalReg -> Int -> CmmType
-> LlvmM ExprData
genLoad_fast e r n ty = do
genLoad_fast :: Atomic -> CmmExpr -> GlobalReg -> Int -> CmmType
-> LlvmM ExprData
genLoad_fast atomic e r n ty = do
dflags <- getDynFlags
(gv, grt, s1) <- getCmmRegVal (CmmGlobal r)
meta <- getTBAARegMeta r
......@@ -1315,7 +1333,7 @@ genLoad_fast e r n ty = do
case grt == ty' of
-- we're fine
True -> do
(var, s3) <- doExpr ty' (MExpr meta $ Load ptr)
(var, s3) <- doExpr ty' (MExpr meta $ loadInstr ptr)
return (var, s1 `snocOL` s2 `snocOL` s3,
[])
......@@ -1323,32 +1341,34 @@ genLoad_fast e r n ty = do
False -> do
let pty = pLift ty'
(ptr', s3) <- doExpr pty $ Cast LM_Bitcast ptr pty
(var, s4) <- doExpr ty' (MExpr meta $ Load ptr')
(var, s4) <- doExpr ty' (MExpr meta $ loadInstr ptr')
return (var, s1 `snocOL` s2 `snocOL` s3
`snocOL` s4, [])
-- If it's a bit type then we use the slow method since
-- we can't avoid casting anyway.
False -> genLoad_slow e ty meta
False -> genLoad_slow atomic e ty meta
where
loadInstr ptr | atomic = ALoad SyncSeqCst False ptr
| otherwise = Load ptr
-- | Handle Cmm load expression.
-- Generic case. Uses casts and pointer arithmetic if needed.
genLoad_slow :: CmmExpr -> CmmType -> [MetaAnnot] -> LlvmM ExprData
genLoad_slow e ty meta = do
genLoad_slow :: Atomic -> CmmExpr -> CmmType -> [MetaAnnot] -> LlvmM ExprData
genLoad_slow atomic e ty meta = do
(iptr, stmts, tops) <- exprToVar e
dflags <- getDynFlags
case getVarType iptr of
LMPointer _ -> do
(dvar, load) <- doExpr (cmmToLlvmType ty)
(MExpr meta $ Load iptr)
(MExpr meta $ loadInstr iptr)
return (dvar, stmts `snocOL` load, tops)
i@(LMInt _) | i == llvmWord dflags -> do
let pty = LMPointer $ cmmToLlvmType ty
(ptr, cast) <- doExpr pty $ Cast LM_Inttoptr iptr pty
(dvar, load) <- doExpr (cmmToLlvmType ty)
(MExpr meta $ Load ptr)
(MExpr meta $ loadInstr ptr)
return (dvar, stmts `snocOL` cast `snocOL` load, tops)
other -> do dflags <- getDynFlags
......@@ -1357,6 +1377,9 @@ genLoad_slow e ty meta = do
"Size of Ptr: " ++ show (llvmPtrBits dflags) ++
", Size of var: " ++ show (llvmWidthInBits dflags other) ++
", Var: " ++ showSDoc dflags (ppr iptr)))
where
loadInstr ptr | atomic = ALoad SyncSeqCst False ptr
| otherwise = Load ptr
-- | Handle CmmReg expression. This will return a pointer to the stack
......
-- | Generating C symbol names emitted by the compiler.
module CPrim
( popCntLabel
( atomicReadLabel
, atomicWriteLabel
, atomicRMWLabel
, cmpxchgLabel
, popCntLabel
, bSwapLabel
, word2FloatLabel
) where
import CmmType
import CmmMachOp
import Outputable
popCntLabel :: Width -> String
......@@ -31,3 +36,46 @@ word2FloatLabel w = "hs_word2float" ++ pprWidth w
pprWidth W32 = "32"
pprWidth W64 = "64"
pprWidth w = pprPanic "word2FloatLabel: Unsupported word width " (ppr w)
atomicRMWLabel :: Width -> AtomicMachOp -> String
atomicRMWLabel w amop = "hs_atomic_" ++ pprFunName amop ++ pprWidth w
where
pprWidth W8 = "8"
pprWidth W16 = "16"
pprWidth W32 = "32"
pprWidth W64 = "64"
pprWidth w = pprPanic "atomicRMWLabel: Unsupported word width " (ppr w)
pprFunName AMO_Add = "add"
pprFunName AMO_Sub = "sub"
pprFunName AMO_And = "and"
pprFunName AMO_Nand = "nand"
pprFunName AMO_Or = "or"
pprFunName AMO_Xor = "xor"
cmpxchgLabel :: Width -> String
cmpxchgLabel w = "hs_cmpxchg" ++ pprWidth w
where
pprWidth W8 = "8"
pprWidth W16 = "16"
pprWidth W32 = "32"
pprWidth W64 = "64"
pprWidth w = pprPanic "cmpxchgLabel: Unsupported word width " (ppr w)
atomicReadLabel :: Width -> String
atomicReadLabel w = "hs_atomicread" ++ pprWidth w
where
pprWidth W8 = "8"
pprWidth W16 = "16"
pprWidth W32 = "32"
pprWidth W64 = "64"
pprWidth w = pprPanic "atomicReadLabel: Unsupported word width " (ppr w)
atomicWriteLabel :: Width -> String
atomicWriteLabel w = "hs_atomicwrite" ++ pprWidth w
where
pprWidth W8 = "8"
pprWidth W16 = "16"
pprWidth W32 = "32"
pprWidth W64 = "64"
pprWidth w = pprPanic "atomicWriteLabel: Unsupported word width " (ppr w)
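
As a quick sanity check, the symbol names these functions produce
(they must match the C fallbacks, assumed to live in ghc-prim's cbits):

    -- ghci> atomicRMWLabel W32 AMO_Add
    -- "hs_atomic_add32"
    -- ghci> cmpxchgLabel W64
    -- "hs_cmpxchg64"
    -- ghci> atomicReadLabel W32
    -- "hs_atomicread32"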
......@@ -1160,6 +1160,10 @@ genCCall' dflags gcp target dest_regs args0
MO_BSwap w -> (fsLit $ bSwapLabel w, False)
MO_PopCnt w -> (fsLit $ popCntLabel w, False)
MO_AtomicRMW w amop -> (fsLit $ atomicRMWLabel w amop, False)
MO_Cmpxchg w -> (fsLit $ cmpxchgLabel w, False)
MO_AtomicRead w -> (fsLit $ atomicReadLabel w, False)
MO_AtomicWrite w -> (fsLit $ atomicWriteLabel w, False)
MO_S_QuotRem {} -> unsupported
MO_U_QuotRem {} -> unsupported
......
......@@ -654,6 +654,10 @@ outOfLineMachOp_table mop
MO_BSwap w -> fsLit $ bSwapLabel w
MO_PopCnt w -> fsLit $ popCntLabel w
MO_AtomicRMW w amop -> fsLit $ atomicRMWLabel w amop
MO_Cmpxchg w -> fsLit $ cmpxchgLabel w
MO_AtomicRead w -> fsLit $ atomicReadLabel w
MO_AtomicWrite w -> fsLit $ atomicWriteLabel w
MO_S_QuotRem {} -> unsupported
MO_U_QuotRem {} -> unsupported
......
......@@ -1057,6 +1057,18 @@ getAmode' _ expr = do
(reg,code) <- getSomeReg expr
return (Amode (AddrBaseIndex (EABaseReg reg) EAIndexNone (ImmInt 0)) code)
-- | Like 'getAmode', but on 32-bit use simple register addressing
-- (i.e. no index register). This stops us from running out of
-- registers on x86 when using instructions such as cmpxchg, which can
-- use up to three virtual registers and one fixed register.
getSimpleAmode :: DynFlags -> Bool -> CmmExpr -> NatM Amode
getSimpleAmode dflags is32Bit addr
| is32Bit = do
addr_code <- getAnyReg addr
addr_r <- getNewRegNat (intSize (wordWidth dflags))
let amode = AddrBaseIndex (EABaseReg addr_r) EAIndexNone (ImmInt 0)
return $! Amode amode (addr_code addr_r)
| otherwise = getAmode addr
x86_complex_amode :: CmmExpr -> CmmExpr -> Integer -> Integer -> NatM Amode
x86_complex_amode base index shift offset
......@@ -1761,6 +1773,99 @@ genCCall dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args = do
where
lbl = mkCmmCodeLabel primPackageId (fsLit (word2FloatLabel width))
genCCall dflags is32Bit (PrimTarget (MO_AtomicRMW width amop)) [dst] [addr, n] = do
Amode amode addr_code <-
if amop `elem` [AMO_Add, AMO_Sub]
then getAmode addr
else getSimpleAmode dflags is32Bit addr -- See genCCall for MO_Cmpxchg
arg <- getNewRegNat size
arg_code <- getAnyReg n
use_sse2 <- sse2Enabled
let platform = targetPlatform dflags
dst_r = getRegisterReg platform use_sse2 (CmmLocal dst)
code <- op_code dst_r arg amode
return $ addr_code `appOL` arg_code arg `appOL` code
where
-- Code for the operation
op_code :: Reg -- Destination reg
-> Reg -- Register containing argument
-> AddrMode -- Address of location to mutate
-> NatM (OrdList Instr)
op_code dst_r arg amode = case amop of
-- In the common case where dst_r is a virtual register the
-- final move should go away, because it's the last use of arg
-- and the first use of dst_r.
AMO_Add -> return $ toOL [ LOCK
, XADD size (OpReg arg) (OpAddr amode)
, MOV size (OpReg arg) (OpReg dst_r)
]
AMO_Sub -> return $ toOL [ NEGI size (OpReg arg)
, LOCK
, XADD size (OpReg arg) (OpAddr amode)
, MOV size (OpReg arg) (OpReg dst_r)
]
AMO_And -> cmpxchg_code (\ src dst -> unitOL $ AND size src dst)
AMO_Nand -> cmpxchg_code (\ src dst -> toOL [ AND size src dst
, NOT size dst
])
AMO_Or -> cmpxchg_code (\ src dst -> unitOL $ OR size src dst)
AMO_Xor -> cmpxchg_code (\ src dst -> unitOL $ XOR size src dst)
where
-- Simulate operation that lacks a dedicated instruction using
-- cmpxchg.
cmpxchg_code :: (Operand -> Operand -> OrdList Instr)
-> NatM (OrdList Instr)
cmpxchg_code instrs = do
lbl <- getBlockIdNat
tmp <- getNewRegNat size
return $ toOL
[ MOV size (OpAddr amode) (OpReg eax)
, JXX ALWAYS lbl
, NEWBLOCK lbl
-- Keep old value so we can return it:
, MOV size (OpReg eax) (OpReg dst_r)
, MOV size (OpReg eax) (OpReg tmp)
]
`appOL` instrs (OpReg arg) (OpReg tmp) `appOL` toOL
[ LOCK
, CMPXCHG size (OpReg tmp) (OpAddr amode)
, JXX NE lbl
]
size = intSize width
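
The cmpxchg_code fallback above is the classic compare-and-swap retry
loop. The same pattern, expressed at the Haskell level with the primops
this patch wires up (a sketch, not code from this patch):

    {-# LANGUAGE MagicHash, UnboxedTuples #-}
    import GHC.Exts
    import GHC.IO (IO(..))

    -- Atomically apply f to the Int at index i, returning the old value,
    -- by retrying casIntArray# until no other thread intervenes.
    atomicModifyInt :: MutableByteArray# RealWorld -> Int# -> (Int# -> Int#)
                    -> IO Int
    atomicModifyInt mba i f = IO loop
      where
        loop s0 =
          case atomicReadIntArray# mba i s0 of
            (# s1, old #) ->
              case casIntArray# mba i old (f old) s1 of
                (# s2, seen #)
                  | isTrue# (seen ==# old) -> (# s2, I# old #)
                  | otherwise              -> loop s2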
genCCall dflags _ (PrimTarget (MO_AtomicRead width)) [dst] [addr] = do
load_code <- intLoadCode (MOV (intSize width)) addr
let platform = targetPlatform dflags
use_sse2 <- sse2Enabled
return (load_code (getRegisterReg platform use_sse2 (CmmLocal dst)))
genCCall _ _ (PrimTarget (MO_AtomicWrite width)) [] [addr, val] = do
assignMem_IntCode (intSize width) addr val
genCCall dflags is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] = do
-- On x86 we don't have enough registers to use cmpxchg with a
-- complicated addressing mode, so on that architecture we
-- pre-compute the address first.
Amode amode addr_code <- getSimpleAmode dflags is32Bit addr
newval <- getNewRegNat size
newval_code <- getAnyReg new
oldval <- getNewRegNat size
oldval_code <- getAnyReg old
use_sse2 <- sse2Enabled
let platform = targetPlatform dflags
dst_r = getRegisterReg platform use_sse2 (CmmLocal dst)
code = toOL
[ MOV size (OpReg oldval) (OpReg eax)
, LOCK
, CMPXCHG size (OpReg newval) (OpAddr amode)
, MOV size (OpReg eax) (OpReg dst_r)
]
return $ addr_code `appOL` newval_code newval `appOL` oldval_code oldval
`appOL` code
where
size = intSize width
genCCall _ is32Bit target dest_regs args
| is32Bit = genCCall32 target dest_regs args
| otherwise = genCCall64 target dest_regs args
......@@ -2385,6 +2490,11 @@ outOfLineCmmOp mop res args
MO_PopCnt _ -> fsLit "popcnt"
MO_BSwap _ -> fsLit "bswap"
MO_AtomicRMW _ _ -> fsLit "atomicrmw"
MO_AtomicRead _ -> fsLit "atomicread"
MO_AtomicWrite _ -> fsLit "atomicwrite"
MO_Cmpxchg _ -> fsLit "cmpxchg"
MO_UF_Conv _ -> unsupported
MO_S_QuotRem {} -> unsupported
......
......@@ -327,6 +327,10 @@ data Instr
| PREFETCH PrefetchVariant Size Operand -- prefetch Variant, addr size, address to prefetch
-- variant can be NTA, Lvl0, Lvl1, or Lvl2
| LOCK -- lock prefix
| XADD Size Operand Operand -- src (r), dst (r/m)
| CMPXCHG Size Operand Operand -- src (r), dst (r/m), eax implicit
data PrefetchVariant = NTA | Lvl0 | Lvl1 | Lvl2
......@@ -337,6 +341,8 @@ data Operand
-- | Returns which registers are read and written as a (read, written)
-- pair.
x86_regUsageOfInstr :: Platform -> Instr -> RegUsage
x86_regUsageOfInstr platform instr
= case instr of
......@@ -428,10 +434,21 @@ x86_regUsageOfInstr platform instr
-- note: might be a better way to do this
PREFETCH _ _ src -> mkRU (use_R src []) []
LOCK -> noUsage
XADD _ src dst -> usageMM src dst
CMPXCHG _ src dst -> usageRMM src dst (OpReg eax)
_other -> panic "regUsage: unrecognised instr"
where
-- # Definitions
--
-- Written: If the operand is a register, it's written. If it's an
-- address, registers mentioned in the address are read.
--
-- Modified: If the operand is a register, it's both read and
-- written. If it's an address, registers mentioned in the address
-- are read.
-- 2 operand form; first operand Read; second Written
usageRW :: Operand -> Operand -> RegUsage
usageRW op (OpReg reg) = mkRU (use_R op []) [reg]
......@@ -444,6 +461,18 @@ x86_regUsageOfInstr platform instr
usageRM op (OpAddr ea) = mkRUR (use_R op $! use_EA ea [])
usageRM _ _ = panic "X86.RegInfo.usageRM: no match"
-- 2 operand form; first operand Modified; second Modified
usageMM :: Operand -> Operand -> RegUsage
usageMM (OpReg src) (OpReg dst) = mkRU [src, dst] [src, dst]
usageMM (OpReg src) (OpAddr ea) = mkRU (use_EA ea [src]) [src]
usageMM _ _ = panic "X86.RegInfo.usageMM: no match"
-- 3 operand form; first operand Read; second Modified; third Modified
usageRMM :: Operand -> Operand -> Operand -> RegUsage
usageRMM (OpReg src) (OpReg dst) (OpReg reg) = mkRU [src, dst, reg] [dst, reg]
usageRMM (OpReg src) (OpAddr ea) (OpReg reg) = mkRU (use_EA ea [src, reg]) [reg]
usageRMM _ _ _ = panic "X86.RegInfo.usageRMM: no match"
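
Concretely, for the LOCK CMPXCHG emitted by the cmpxchg fallback, the
third (implicit) operand is eax, so by the equations above:

    -- usageRMM (OpReg tmp) (OpAddr amode) (OpReg eax)
    --   = mkRU (use_EA amode [tmp, eax]) [eax]
    -- i.e. tmp, eax and the address registers are read; only eax is
    -- written, since the memory destination is not a register.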
-- 1 operand form; operand Modified
usageM :: Operand -> RegUsage
usageM (OpReg reg) = mkRU [reg] [reg]
......@@ -476,6 +505,7 @@ x86_regUsageOfInstr platform instr
where src' = filter (interesting platform) src
dst' = filter (interesting platform) dst
-- | Is this register interesting for the register allocator?
interesting :: Platform -> Reg -> Bool
interesting _ (RegVirtual _) = True
interesting platform (RegReal (RealRegSingle i)) = isFastTrue (freeReg platform i)
......@@ -483,6 +513,8 @@ interesting _ (RegReal (RealRegPair{})) = panic "X86.interesting: no re
-- | Applies the supplied function to all registers in instructions.
-- Typically used to change virtual registers to real registers.
x86_patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr
x86_patchRegsOfInstr instr env
= case instr of
......@@ -571,6 +603,10 @@ x86_patchRegsOfInstr instr env
PREFETCH lvl size src -> PREFETCH lvl size (patchOp src)
LOCK -> instr
XADD sz src dst -> patch2 (XADD sz) src dst
CMPXCHG sz src dst -> patch2 (CMPXCHG sz) src dst
_other -> panic "patchRegs: unrecognised instr"
where
......
......@@ -886,6 +886,14 @@ pprInstr GFREE