Commit f8557696 authored by John Ky's avatar John Ky Committed by Ben Gamari

Add new mbmi and mbmi2 compiler flags

This adds support for the bit deposit and extraction operations provided
by the BMI and BMI2 instruction set extensions on modern amd64 machines.

Implement x86 code generator for pdep and pext.  Properly initialise
bmiVersion field.

pdep and pext test cases

Fix pattern match for pdep and pext instructions

Fix build of pdep and pext code for 32-bit architectures

Test Plan: Validate

Reviewers: austin, simonmar, bgamari, angerman

Reviewed By: bgamari

Subscribers: trommler, carter, angerman, thomie, rwbarton, newhoggy

GHC Trac Issues: #14206

Differential Revision: https://phabricator.haskell.org/D4236
parent 5e8ea6a6
......@@ -587,6 +587,8 @@ data CallishMachOp
| MO_Memcmp Int
| MO_PopCnt Width
| MO_Pdep Width
| MO_Pext Width
| MO_Clz Width
| MO_Ctz Width
......
......@@ -1006,6 +1006,16 @@ callishMachOps = listToUFM $
( "popcnt32", (,) $ MO_PopCnt W32 ),
( "popcnt64", (,) $ MO_PopCnt W64 ),
( "pdep8", (,) $ MO_Pdep W8 ),
( "pdep16", (,) $ MO_Pdep W16 ),
( "pdep32", (,) $ MO_Pdep W32 ),
( "pdep64", (,) $ MO_Pdep W64 ),
( "pext8", (,) $ MO_Pext W8 ),
( "pext16", (,) $ MO_Pext W16 ),
( "pext32", (,) $ MO_Pext W32 ),
( "pext64", (,) $ MO_Pext W64 ),
( "cmpxchg8", (,) $ MO_Cmpxchg W8 ),
( "cmpxchg16", (,) $ MO_Cmpxchg W16 ),
( "cmpxchg32", (,) $ MO_Cmpxchg W32 ),
......
......@@ -789,6 +789,8 @@ pprCallishMachOp_for_C mop
MO_Memcmp _ -> text "memcmp"
(MO_BSwap w) -> ptext (sLit $ bSwapLabel w)
(MO_PopCnt w) -> ptext (sLit $ popCntLabel w)
(MO_Pext w) -> ptext (sLit $ pextLabel w)
(MO_Pdep w) -> ptext (sLit $ pdepLabel w)
(MO_Clz w) -> ptext (sLit $ clzLabel w)
(MO_Ctz w) -> ptext (sLit $ ctzLabel w)
(MO_AtomicRMW w amop) -> ptext (sLit $ atomicRMWLabel w amop)
......
......@@ -584,6 +584,20 @@ emitPrimOp _ [res] PopCnt32Op [w] = emitPopCntCall res w W32
emitPrimOp _ [res] PopCnt64Op [w] = emitPopCntCall res w W64
emitPrimOp dflags [res] PopCntOp [w] = emitPopCntCall res w (wordWidth dflags)
-- Parallel bit deposit
emitPrimOp _ [res] Pdep8Op [src, mask] = emitPdepCall res src mask W8
emitPrimOp _ [res] Pdep16Op [src, mask] = emitPdepCall res src mask W16
emitPrimOp _ [res] Pdep32Op [src, mask] = emitPdepCall res src mask W32
emitPrimOp _ [res] Pdep64Op [src, mask] = emitPdepCall res src mask W64
emitPrimOp dflags [res] PdepOp [src, mask] = emitPdepCall res src mask (wordWidth dflags)
-- Parallel bit extract
emitPrimOp _ [res] Pext8Op [src, mask] = emitPextCall res src mask W8
emitPrimOp _ [res] Pext16Op [src, mask] = emitPextCall res src mask W16
emitPrimOp _ [res] Pext32Op [src, mask] = emitPextCall res src mask W32
emitPrimOp _ [res] Pext64Op [src, mask] = emitPextCall res src mask W64
emitPrimOp dflags [res] PextOp [src, mask] = emitPextCall res src mask (wordWidth dflags)
-- count leading zeros
emitPrimOp _ [res] Clz8Op [w] = emitClzCall res w W8
emitPrimOp _ [res] Clz16Op [w] = emitClzCall res w W16
......@@ -2266,6 +2280,20 @@ emitPopCntCall res x width = do
(MO_PopCnt width)
[ x ]
emitPdepCall :: LocalReg -> CmmExpr -> CmmExpr -> Width -> FCode ()
emitPdepCall res x y width = do
emitPrimCall
[ res ]
(MO_Pdep width)
[ x, y ]
emitPextCall :: LocalReg -> CmmExpr -> CmmExpr -> Width -> FCode ()
emitPextCall res x y width = do
emitPrimCall
[ res ]
(MO_Pext width)
[ x, y ]
emitClzCall :: LocalReg -> CmmExpr -> Width -> FCode ()
emitClzCall res x width = do
emitPrimCall
......
......@@ -867,4 +867,3 @@ mkAbsentErrorApp res_ty err_msg
= mkApps (Var aBSENT_ERROR_ID) [ Type res_ty, err_string ]
where
err_string = Lit (mkMachString err_msg)
......@@ -46,6 +46,8 @@ import Data.Maybe ( catMaybes )
type Atomic = Bool
type LlvmStatements = OrdList LlvmStatement
data Signage = Signed | Unsigned deriving (Eq, Show)
-- -----------------------------------------------------------------------------
-- | Top-level of the LLVM proc Code generator
--
......@@ -207,7 +209,7 @@ genCall t@(PrimTarget (MO_Prefetch_Data localityInt)) [] args
let args_hints' = zip args arg_hints
argVars <- arg_varsW args_hints' ([], nilOL, [])
fptr <- liftExprData $ getFunPtr funTy t
argVars' <- castVarsW $ zip argVars argTy
argVars' <- castVarsW Signed $ zip argVars argTy
doTrashStmts
let argSuffix = [mkIntLit i32 0, mkIntLit i32 localityInt, mkIntLit i32 1]
......@@ -218,6 +220,11 @@ genCall t@(PrimTarget (MO_Prefetch_Data localityInt)) [] args
-- and return types
genCall t@(PrimTarget (MO_PopCnt w)) dsts args =
genCallSimpleCast w t dsts args
genCall t@(PrimTarget (MO_Pdep w)) dsts args =
genCallSimpleCast2 w t dsts args
genCall t@(PrimTarget (MO_Pext w)) dsts args =
genCallSimpleCast2 w t dsts args
genCall t@(PrimTarget (MO_Clz w)) dsts args =
genCallSimpleCast w t dsts args
genCall t@(PrimTarget (MO_Ctz w)) dsts args =
......@@ -285,7 +292,7 @@ genCall t@(PrimTarget op) [] args
let args_hints = zip args arg_hints
argVars <- arg_varsW args_hints ([], nilOL, [])
fptr <- getFunPtrW funTy t
argVars' <- castVarsW $ zip argVars argTy
argVars' <- castVarsW Signed $ zip argVars argTy
doTrashStmts
let alignVal = mkIntLit i32 align
......@@ -518,7 +525,7 @@ genCallExtract target@(PrimTarget op) w (argA, argB) (llvmTypeA, llvmTypeB) = do
-- Process the arguments.
let args_hints = zip [argA, argB] (snd $ foreignTargetHints target)
(argsV1, args1, top1) <- arg_vars args_hints ([], nilOL, [])
(argsV2, args2) <- castVars $ zip argsV1 argTy
(argsV2, args2) <- castVars Signed $ zip argsV1 argTy
-- Get the function and make the call.
fname <- cmmPrimOpFunctions op
......@@ -558,9 +565,9 @@ genCallSimpleCast w t@(PrimTarget op) [dst] args = do
let (_, arg_hints) = foreignTargetHints t
let args_hints = zip args arg_hints
(argsV, stmts2, top2) <- arg_vars args_hints ([], nilOL, [])
(argsV', stmts4) <- castVars $ zip argsV [width]
(argsV', stmts4) <- castVars Signed $ zip argsV [width]
(retV, s1) <- doExpr width $ Call StdCall fptr argsV' []
([retV'], stmts5) <- castVars [(retV,dstTy)]
([retV'], stmts5) <- castVars (cmmPrimOpRetValSignage op) [(retV,dstTy)]
let s2 = Store retV' dstV
let stmts = stmts2 `appOL` stmts4 `snocOL`
......@@ -569,6 +576,37 @@ genCallSimpleCast w t@(PrimTarget op) [dst] args = do
genCallSimpleCast _ _ dsts _ =
panic ("genCallSimpleCast: " ++ show (length dsts) ++ " dsts")
-- Handle simple function call that only need simple type casting, of the form:
-- truncate arg >>= \a -> call(a) >>= zext
--
-- since GHC only really has i32 and i64 types and things like Word8 are backed
-- by an i32 and just present a logical i8 range. So we must handle conversions
-- from i32 to i8 explicitly as LLVM is strict about types.
genCallSimpleCast2 :: Width -> ForeignTarget -> [CmmFormal] -> [CmmActual]
-> LlvmM StmtData
genCallSimpleCast2 w t@(PrimTarget op) [dst] args = do
let width = widthToLlvmInt w
dstTy = cmmToLlvmType $ localRegType dst
fname <- cmmPrimOpFunctions op
(fptr, _, top3) <- getInstrinct fname width (const width <$> args)
dstV <- getCmmReg (CmmLocal dst)
let (_, arg_hints) = foreignTargetHints t
let args_hints = zip args arg_hints
(argsV, stmts2, top2) <- arg_vars args_hints ([], nilOL, [])
(argsV', stmts4) <- castVars Signed $ zip argsV (const width <$> argsV)
(retV, s1) <- doExpr width $ Call StdCall fptr argsV' []
([retV'], stmts5) <- castVars (cmmPrimOpRetValSignage op) [(retV,dstTy)]
let s2 = Store retV' dstV
let stmts = stmts2 `appOL` stmts4 `snocOL`
s1 `appOL` stmts5 `snocOL` s2
return (stmts, top2 ++ top3)
genCallSimpleCast2 _ _ dsts _ =
panic ("genCallSimpleCast2: " ++ show (length dsts) ++ " dsts")
-- | Create a function pointer from a target.
getFunPtrW :: (LMString -> LlvmType) -> ForeignTarget
-> WriterT LlvmAccum LlvmM LlvmVar
......@@ -638,31 +676,32 @@ arg_vars ((e, _):rest) (vars, stmts, tops)
-- | Cast a collection of LLVM variables to specific types.
castVarsW :: [(LlvmVar, LlvmType)]
castVarsW :: Signage
-> [(LlvmVar, LlvmType)]
-> WriterT LlvmAccum LlvmM [LlvmVar]
castVarsW vars = do
(vars, stmts) <- lift $ castVars vars
castVarsW signage vars = do
(vars, stmts) <- lift $ castVars signage vars
tell $ LlvmAccum stmts mempty
return vars
-- | Cast a collection of LLVM variables to specific types.
castVars :: [(LlvmVar, LlvmType)]
castVars :: Signage -> [(LlvmVar, LlvmType)]
-> LlvmM ([LlvmVar], LlvmStatements)
castVars vars = do
done <- mapM (uncurry castVar) vars
castVars signage vars = do
done <- mapM (uncurry (castVar signage)) vars
let (vars', stmts) = unzip done
return (vars', toOL stmts)
-- | Cast an LLVM variable to a specific type, panicing if it can't be done.
castVar :: LlvmVar -> LlvmType -> LlvmM (LlvmVar, LlvmStatement)
castVar v t | getVarType v == t
castVar :: Signage -> LlvmVar -> LlvmType -> LlvmM (LlvmVar, LlvmStatement)
castVar signage v t | getVarType v == t
= return (v, Nop)
| otherwise
= do dflags <- getDynFlags
let op = case (getVarType v, t) of
(LMInt n, LMInt m)
-> if n < m then LM_Sext else LM_Trunc
-> if n < m then extend else LM_Trunc
(vt, _) | isFloat vt && isFloat t
-> if llvmWidthInBits dflags vt < llvmWidthInBits dflags t
then LM_Fpext else LM_Fptrunc
......@@ -676,7 +715,16 @@ castVar v t | getVarType v == t
(vt, _) -> panic $ "castVars: Can't cast this type ("
++ showSDoc dflags (ppr vt) ++ ") to (" ++ showSDoc dflags (ppr t) ++ ")"
doExpr t $ Cast op v t
where extend = case signage of
Signed -> LM_Sext
Unsigned -> LM_Zext
cmmPrimOpRetValSignage :: CallishMachOp -> Signage
cmmPrimOpRetValSignage mop = case mop of
MO_Pdep _ -> Unsigned
MO_Pext _ -> Unsigned
_ -> Signed
-- | Decide what C function to use to implement a CallishMachOp
cmmPrimOpFunctions :: CallishMachOp -> LlvmM LMString
......@@ -735,6 +783,15 @@ cmmPrimOpFunctions mop = do
(MO_Clz w) -> fsLit $ "llvm.ctlz." ++ showSDoc dflags (ppr $ widthToLlvmInt w)
(MO_Ctz w) -> fsLit $ "llvm.cttz." ++ showSDoc dflags (ppr $ widthToLlvmInt w)
(MO_Pdep w) -> let w' = showSDoc dflags (ppr $ widthInBits w)
in if isBmi2Enabled dflags
then fsLit $ "llvm.x86.bmi.pdep." ++ w'
else fsLit $ "hs_pdep" ++ w'
(MO_Pext w) -> let w' = showSDoc dflags (ppr $ widthInBits w)
in if isBmi2Enabled dflags
then fsLit $ "llvm.x86.bmi.pext." ++ w'
else fsLit $ "hs_pext" ++ w'
(MO_Prefetch_Data _ )-> fsLit "llvm.prefetch"
MO_AddIntC w -> fsLit $ "llvm.sadd.with.overflow."
......@@ -1212,7 +1269,7 @@ genMachOp _ op [x] = case op of
negateVec ty v2 negOp = do
(vx, stmts1, top) <- exprToVar x
([vx'], stmts2) <- castVars [(vx, ty)]
([vx'], stmts2) <- castVars Signed [(vx, ty)]
(v1, s1) <- doExpr ty $ LlvmOp negOp v2 vx'
return (v1, stmts1 `appOL` stmts2 `snocOL` s1, top)
......@@ -1275,7 +1332,7 @@ genMachOp_slow :: EOption -> MachOp -> [CmmExpr] -> LlvmM ExprData
genMachOp_slow _ (MO_V_Extract l w) [val, idx] = runExprData $ do
vval <- exprToVarW val
vidx <- exprToVarW idx
[vval'] <- castVarsW [(vval, LMVector l ty)]
[vval'] <- castVarsW Signed [(vval, LMVector l ty)]
doExprW ty $ Extract vval' vidx
where
ty = widthToLlvmInt w
......@@ -1283,7 +1340,7 @@ genMachOp_slow _ (MO_V_Extract l w) [val, idx] = runExprData $ do
genMachOp_slow _ (MO_VF_Extract l w) [val, idx] = runExprData $ do
vval <- exprToVarW val
vidx <- exprToVarW idx
[vval'] <- castVarsW [(vval, LMVector l ty)]
[vval'] <- castVarsW Signed [(vval, LMVector l ty)]
doExprW ty $ Extract vval' vidx
where
ty = widthToLlvmFloat w
......@@ -1293,7 +1350,7 @@ genMachOp_slow _ (MO_V_Insert l w) [val, elt, idx] = runExprData $ do
vval <- exprToVarW val
velt <- exprToVarW elt
vidx <- exprToVarW idx
[vval'] <- castVarsW [(vval, ty)]
[vval'] <- castVarsW Signed [(vval, ty)]
doExprW ty $ Insert vval' velt vidx
where
ty = LMVector l (widthToLlvmInt w)
......@@ -1302,7 +1359,7 @@ genMachOp_slow _ (MO_VF_Insert l w) [val, elt, idx] = runExprData $ do
vval <- exprToVarW val
velt <- exprToVarW elt
vidx <- exprToVarW idx
[vval'] <- castVarsW [(vval, ty)]
[vval'] <- castVarsW Signed [(vval, ty)]
doExprW ty $ Insert vval' velt vidx
where
ty = LMVector l (widthToLlvmFloat w)
......@@ -1414,7 +1471,7 @@ genMachOp_slow opt op [x, y] = case op of
binCastLlvmOp ty binOp = runExprData $ do
vx <- exprToVarW x
vy <- exprToVarW y
[vx', vy'] <- castVarsW [(vx, ty), (vy, ty)]
[vx', vy'] <- castVarsW Signed [(vx, ty), (vy, ty)]
doExprW ty $ binOp vx' vy'
-- | Need to use EOption here as Cmm expects word size results from
......
......@@ -848,6 +848,8 @@ llvmOptions dflags =
++ ["+avx512cd"| isAvx512cdEnabled dflags ]
++ ["+avx512er"| isAvx512erEnabled dflags ]
++ ["+avx512pf"| isAvx512pfEnabled dflags ]
++ ["+bmi" | isBmiEnabled dflags ]
++ ["+bmi2" | isBmi2Enabled dflags ]
-- -----------------------------------------------------------------------------
-- | Each phase in the pipeline returns the next phase to execute, and the
......
......@@ -150,6 +150,8 @@ module DynFlags (
isSseEnabled,
isSse2Enabled,
isSse4_2Enabled,
isBmiEnabled,
isBmi2Enabled,
isAvxEnabled,
isAvx2Enabled,
isAvx512cdEnabled,
......@@ -1005,6 +1007,7 @@ data DynFlags = DynFlags {
-- | Machine dependent flags (-m<blah> stuff)
sseVersion :: Maybe SseVersion,
bmiVersion :: Maybe BmiVersion,
avx :: Bool,
avx2 :: Bool,
avx512cd :: Bool, -- Enable AVX-512 Conflict Detection Instructions.
......@@ -1806,6 +1809,7 @@ defaultDynFlags mySettings myLlvmTargets =
interactivePrint = Nothing,
nextWrapperNum = panic "defaultDynFlags: No nextWrapperNum",
sseVersion = Nothing,
bmiVersion = Nothing,
avx = False,
avx2 = False,
avx512cd = False,
......@@ -3201,6 +3205,10 @@ dynamic_flags_deps = [
d { sseVersion = Just SSE4 }))
, make_ord_flag defGhcFlag "msse4.2" (noArg (\d ->
d { sseVersion = Just SSE42 }))
, make_ord_flag defGhcFlag "mbmi" (noArg (\d ->
d { bmiVersion = Just BMI1 }))
, make_ord_flag defGhcFlag "mbmi2" (noArg (\d ->
d { bmiVersion = Just BMI2 }))
, make_ord_flag defGhcFlag "mavx" (noArg (\d -> d { avx = True }))
, make_ord_flag defGhcFlag "mavx2" (noArg (\d -> d { avx2 = True }))
, make_ord_flag defGhcFlag "mavx512cd" (noArg (\d ->
......@@ -5446,6 +5454,25 @@ isAvx512fEnabled dflags = avx512f dflags
isAvx512pfEnabled :: DynFlags -> Bool
isAvx512pfEnabled dflags = avx512pf dflags
-- -----------------------------------------------------------------------------
-- BMI2
data BmiVersion = BMI1
| BMI2
deriving (Eq, Ord)
isBmiEnabled :: DynFlags -> Bool
isBmiEnabled dflags = case platformArch (targetPlatform dflags) of
ArchX86_64 -> bmiVersion dflags >= Just BMI1
ArchX86 -> bmiVersion dflags >= Just BMI1
_ -> False
isBmi2Enabled :: DynFlags -> Bool
isBmi2Enabled dflags = case platformArch (targetPlatform dflags) of
ArchX86_64 -> bmiVersion dflags >= Just BMI2
ArchX86 -> bmiVersion dflags >= Just BMI2
_ -> False
-- -----------------------------------------------------------------------------
-- Linker/compiler information
......
......@@ -5,6 +5,8 @@ module CPrim
, atomicRMWLabel
, cmpxchgLabel
, popCntLabel
, pdepLabel
, pextLabel
, bSwapLabel
, clzLabel
, ctzLabel
......@@ -26,6 +28,24 @@ popCntLabel w = "hs_popcnt" ++ pprWidth w
pprWidth W64 = "64"
pprWidth w = pprPanic "popCntLabel: Unsupported word width " (ppr w)
pdepLabel :: Width -> String
pdepLabel w = "hs_pdep" ++ pprWidth w
where
pprWidth W8 = "8"
pprWidth W16 = "16"
pprWidth W32 = "32"
pprWidth W64 = "64"
pprWidth w = pprPanic "pdepLabel: Unsupported word width " (ppr w)
pextLabel :: Width -> String
pextLabel w = "hs_pext" ++ pprWidth w
where
pprWidth W8 = "8"
pprWidth W16 = "16"
pprWidth W32 = "32"
pprWidth W64 = "64"
pprWidth w = pprPanic "pextLabel: Unsupported word width " (ppr w)
bSwapLabel :: Width -> String
bSwapLabel w = "hs_bswap" ++ pprWidth w
where
......
......@@ -2004,6 +2004,8 @@ genCCall' dflags gcp target dest_regs args
MO_BSwap w -> (fsLit $ bSwapLabel w, False)
MO_PopCnt w -> (fsLit $ popCntLabel w, False)
MO_Pdep w -> (fsLit $ pdepLabel w, False)
MO_Pext w -> (fsLit $ pextLabel w, False)
MO_Clz _ -> unsupported
MO_Ctz _ -> unsupported
MO_AtomicRMW {} -> unsupported
......
......@@ -654,6 +654,8 @@ outOfLineMachOp_table mop
MO_BSwap w -> fsLit $ bSwapLabel w
MO_PopCnt w -> fsLit $ popCntLabel w
MO_Pdep w -> fsLit $ pdepLabel w
MO_Pext w -> fsLit $ pextLabel w
MO_Clz w -> fsLit $ clzLabel w
MO_Ctz w -> fsLit $ ctzLabel w
MO_AtomicRMW w amop -> fsLit $ atomicRMWLabel w amop
......
......@@ -1872,6 +1872,72 @@ genCCall dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst]
format = intFormat width
lbl = mkCmmCodeLabel primUnitId (fsLit (popCntLabel width))
genCCall dflags is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst]
args@[src, mask] = do
let platform = targetPlatform dflags
if isBmi2Enabled dflags
then do code_src <- getAnyReg src
code_mask <- getAnyReg mask
src_r <- getNewRegNat format
mask_r <- getNewRegNat format
let dst_r = getRegisterReg platform False (CmmLocal dst)
return $ code_src src_r `appOL` code_mask mask_r `appOL`
(if width == W8 then
-- The PDEP instruction doesn't take a r/m8
unitOL (MOVZxL II8 (OpReg src_r ) (OpReg src_r )) `appOL`
unitOL (MOVZxL II8 (OpReg mask_r) (OpReg mask_r)) `appOL`
unitOL (PDEP II16 (OpReg mask_r) (OpReg src_r ) dst_r)
else
unitOL (PDEP format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL`
(if width == W8 || width == W16 then
-- We used a 16-bit destination register above,
-- so zero-extend
unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
else nilOL)
else do
targetExpr <- cmmMakeDynamicReference dflags
CallReference lbl
let target = ForeignTarget targetExpr (ForeignConvention CCallConv
[NoHint] [NoHint]
CmmMayReturn)
genCCall dflags is32Bit target dest_regs args
where
format = intFormat width
lbl = mkCmmCodeLabel primUnitId (fsLit (pdepLabel width))
genCCall dflags is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst]
args@[src, mask] = do
let platform = targetPlatform dflags
if isBmi2Enabled dflags
then do code_src <- getAnyReg src
code_mask <- getAnyReg mask
src_r <- getNewRegNat format
mask_r <- getNewRegNat format
let dst_r = getRegisterReg platform False (CmmLocal dst)
return $ code_src src_r `appOL` code_mask mask_r `appOL`
(if width == W8 then
-- The PEXT instruction doesn't take a r/m8
unitOL (MOVZxL II8 (OpReg src_r ) (OpReg src_r )) `appOL`
unitOL (MOVZxL II8 (OpReg mask_r) (OpReg mask_r)) `appOL`
unitOL (PEXT II16 (OpReg mask_r) (OpReg src_r) dst_r)
else
unitOL (PEXT format (OpReg mask_r) (OpReg src_r) dst_r)) `appOL`
(if width == W8 || width == W16 then
-- We used a 16-bit destination register above,
-- so zero-extend
unitOL (MOVZxL II16 (OpReg dst_r) (OpReg dst_r))
else nilOL)
else do
targetExpr <- cmmMakeDynamicReference dflags
CallReference lbl
let target = ForeignTarget targetExpr (ForeignConvention CCallConv
[NoHint] [NoHint]
CmmMayReturn)
genCCall dflags is32Bit target dest_regs args
where
format = intFormat width
lbl = mkCmmCodeLabel primUnitId (fsLit (pextLabel width))
genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src]
| is32Bit && width == W64 = do
-- Fallback to `hs_clz64` on i386
......@@ -2689,6 +2755,9 @@ outOfLineCmmOp mop res args
MO_Clz w -> fsLit $ clzLabel w
MO_Ctz _ -> unsupported
MO_Pdep w -> fsLit $ pdepLabel w
MO_Pext w -> fsLit $ pextLabel w
MO_AtomicRMW _ _ -> fsLit "atomicrmw"
MO_AtomicRead _ -> fsLit "atomicread"
MO_AtomicWrite _ -> fsLit "atomicwrite"
......
......@@ -345,6 +345,10 @@ data Instr
| BSF Format Operand Reg -- bit scan forward
| BSR Format Operand Reg -- bit scan reverse
-- bit manipulation instructions
| PDEP Format Operand Operand Reg -- [BMI2] deposit bits to the specified mask
| PEXT Format Operand Operand Reg -- [BMI2] extract bits from the specified mask
-- prefetch
| PREFETCH PrefetchVariant Format Operand -- prefetch Variant, addr size, address to prefetch
-- variant can be NTA, Lvl0, Lvl1, or Lvl2
......@@ -464,6 +468,9 @@ x86_regUsageOfInstr platform instr
BSF _ src dst -> mkRU (use_R src []) [dst]
BSR _ src dst -> mkRU (use_R src []) [dst]
PDEP _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]
PEXT _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst]
-- note: might be a better way to do this
PREFETCH _ _ src -> mkRU (use_R src []) []
LOCK i -> x86_regUsageOfInstr platform i
......@@ -640,6 +647,8 @@ x86_patchRegsOfInstr instr env
CLTD _ -> instr
POPCNT fmt src dst -> POPCNT fmt (patchOp src) (env dst)
PDEP fmt src mask dst -> PDEP fmt (patchOp src) (patchOp mask) (env dst)
PEXT fmt src mask dst -> PEXT fmt (patchOp src) (patchOp mask) (env dst)
BSF fmt src dst -> BSF fmt (patchOp src) (env dst)
BSR fmt src dst -> BSR fmt (patchOp src) (env dst)
......
......@@ -648,6 +648,9 @@ pprInstr (POPCNT format src dst) = pprOpOp (sLit "popcnt") format src (OpReg dst
pprInstr (BSF format src dst) = pprOpOp (sLit "bsf") format src (OpReg dst)
pprInstr (BSR format src dst) = pprOpOp (sLit "bsr") format src (OpReg dst)
pprInstr (PDEP format src mask dst) = pprFormatOpOpReg (sLit "pdep") format src mask dst
pprInstr (PEXT format src mask dst) = pprFormatOpOpReg (sLit "pext") format src mask dst
pprInstr (PREFETCH NTA format src ) = pprFormatOp_ (sLit "prefetchnta") format src
pprInstr (PREFETCH Lvl0 format src) = pprFormatOp_ (sLit "prefetcht0") format src
pprInstr (PREFETCH Lvl1 format src) = pprFormatOp_ (sLit "prefetcht1") format src
......@@ -1262,6 +1265,16 @@ pprFormatRegRegReg name format reg1 reg2 reg3
pprReg format reg3
]
pprFormatOpOpReg :: LitString -> Format -> Operand -> Operand -> Reg -> SDoc
pprFormatOpOpReg name format op1 op2 reg3
= hcat [
pprMnemonic name format,
pprOperand format op1,
comma,
pprOperand format op2,
comma,
pprReg format reg3
]
pprFormatAddrReg :: LitString -> Format -> AddrMode -> Reg -> SDoc
pprFormatAddrReg name format op dst
......
......@@ -403,6 +403,28 @@ primop PopCnt64Op "popCnt64#" GenPrimOp WORD64 -> Word#
primop PopCntOp "popCnt#" Monadic Word# -> Word#
{Count the number of set bits in a word.}
primop Pdep8Op "pdep8#" Dyadic Word# -> Word# -> Word#
{Deposit bits to lower 8 bits of a word at locations specified by a mask.}
primop Pdep16Op "pdep16#" Dyadic Word# -> Word# -> Word#
{Deposit bits to lower 16 bits of a word at locations specified by a mask.}
primop Pdep32Op "pdep32#" Dyadic Word# -> Word# -> Word#
{Deposit bits to lower 32 bits of a word at locations specified by a mask.}
primop Pdep64Op "pdep64#" GenPrimOp WORD64 -> WORD64 -> WORD64
{Deposit bits to a word at locations specified by a mask.}
primop PdepOp "pdep#" Dyadic Word# -> Word# -> Word#
{Deposit bits to a word at locations specified by a mask.}
primop Pext8Op "pext8#" Dyadic Word# -> Word# -> Word#
{Extract bits from lower 8 bits of a word at locations specified by a mask.}
primop Pext16Op "pext16#" Dyadic Word# -> Word# -> Word#
{Extract bits from lower 16 bits of a word at locations specified by a mask.}
primop Pext32Op "pext32#" Dyadic Word# -> Word# -> Word#
{Extract bits from lower 32 bits of a word at locations specified by a mask.}
primop Pext64Op "pext64#" GenPrimOp WORD64 -> WORD64 -> WORD64
{Extract bits from a word at locations specified by a mask.}
primop PextOp "pext#" Dyadic Word# -> Word# -> Word#
{Extract bits from a word at locations specified by a mask.}
primop Clz8Op "clz8#" Monadic Word# -> Word#
{Count leading zeros in the lower 8 bits of a word.}
primop Clz16Op "clz16#" Monadic Word# -> Word#
......
#include "Rts.h"
#include "MachDeps.h"
extern StgWord64 hs_pdep64(StgWord64 src, StgWord64 mask);
StgWord64
hs_pdep64(StgWord64 src, StgWord64 mask)
{
uint64_t result = 0;
while (1) {
// Mask out all but the lowest bit
const uint64_t lowest = (-mask & mask);