Commit c1d9416f authored by Dmitry Ivanov's avatar Dmitry Ivanov Committed by Ben Gamari

Compile count{Leading,Trailing}Zeros to corresponding x86_64 instructions under -mbmi2

This works similarly to existing implementation for popCount.
Trac ticket: #16086.
parent cc2261d4
......@@ -2045,25 +2045,37 @@ genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] b
| otherwise = do
code_src <- getAnyReg src
src_r <- getNewRegNat format
tmp_r <- getNewRegNat format
let dst_r = getRegisterReg platform False (CmmLocal dst)
-- The following insn sequence makes sure 'clz 0' has a defined value.
-- starting with Haswell, one could use the LZCNT insn instead.
return $ code_src src_r `appOL` toOL
([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
[ BSR format (OpReg src_r) tmp_r
, MOV II32 (OpImm (ImmInt (2*bw-1))) (OpReg dst_r)
, CMOV NE format (OpReg tmp_r) dst_r
, XOR format (OpImm (ImmInt (bw-1))) (OpReg dst_r)
]) -- NB: We don't need to zero-extend the result for the
-- W8/W16 cases because the 'MOV' insn already
-- took care of implicitly clearing the upper bits
if isBmi2Enabled dflags
then do
src_r <- getNewRegNat (intFormat width)
return $ appOL (code_src src_r) $ case width of
W8 -> toOL
[ MOVZxL II8 (OpReg src_r) (OpReg src_r) -- zero-extend to 32 bit
, LZCNT II32 (OpReg src_r) dst_r -- lzcnt with extra 24 zeros
, SUB II32 (OpImm (ImmInt 24)) (OpReg dst_r) -- compensate for extra zeros
]
W16 -> toOL
[ LZCNT II16 (OpReg src_r) dst_r
, MOVZxL II16 (OpReg dst_r) (OpReg dst_r) -- zero-extend from 16 bit
]
_ -> unitOL (LZCNT (intFormat width) (OpReg src_r) dst_r)
else do
let format = if width == W8 then II16 else intFormat width
src_r <- getNewRegNat format
tmp_r <- getNewRegNat format
return $ code_src src_r `appOL` toOL
([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
[ BSR format (OpReg src_r) tmp_r
, MOV II32 (OpImm (ImmInt (2*bw-1))) (OpReg dst_r)
, CMOV NE format (OpReg tmp_r) dst_r
, XOR format (OpImm (ImmInt (bw-1))) (OpReg dst_r)
]) -- NB: We don't need to zero-extend the result for the
-- W8/W16 cases because the 'MOV' insn already
-- took care of implicitly clearing the upper bits
where
bw = widthInBits width
platform = targetPlatform dflags
format = if width == W8 then II16 else intFormat width
lbl = mkCmmCodeLabel primUnitId (fsLit (clzLabel width))
genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid
......@@ -2073,6 +2085,7 @@ genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid
dst_r = getRegisterReg platform False (CmmLocal dst)
lbl1 <- getBlockIdNat
lbl2 <- getBlockIdNat
let format = if width == W8 then II16 else intFormat width
tmp_r <- getNewRegNat format
-- New CFG Edges:
......@@ -2109,24 +2122,38 @@ genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid
| otherwise = do
code_src <- getAnyReg src
src_r <- getNewRegNat format
tmp_r <- getNewRegNat format
let dst_r = getRegisterReg platform False (CmmLocal dst)
-- The following insn sequence makes sure 'ctz 0' has a defined value.
-- starting with Haswell, one could use the TZCNT insn instead.
return $ code_src src_r `appOL` toOL
([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
[ BSF format (OpReg src_r) tmp_r
, MOV II32 (OpImm (ImmInt bw)) (OpReg dst_r)
, CMOV NE format (OpReg tmp_r) dst_r
]) -- NB: We don't need to zero-extend the result for the
-- W8/W16 cases because the 'MOV' insn already
-- took care of implicitly clearing the upper bits
if isBmi2Enabled dflags
then do
src_r <- getNewRegNat (intFormat width)
return $ appOL (code_src src_r) $ case width of
W8 -> toOL
[ OR II32 (OpImm (ImmInt 0xFFFFFF00)) (OpReg src_r)
, TZCNT II32 (OpReg src_r) dst_r
]
W16 -> toOL
[ TZCNT II16 (OpReg src_r) dst_r
, MOVZxL II16 (OpReg dst_r) (OpReg dst_r)
]
_ -> unitOL $ TZCNT (intFormat width) (OpReg src_r) dst_r
else do
-- The following insn sequence makes sure 'ctz 0' has a defined value.
-- starting with Haswell, one could use the TZCNT insn instead.
let format = if width == W8 then II16 else intFormat width
src_r <- getNewRegNat format
tmp_r <- getNewRegNat format
return $ code_src src_r `appOL` toOL
([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
[ BSF format (OpReg src_r) tmp_r
, MOV II32 (OpImm (ImmInt bw)) (OpReg dst_r)
, CMOV NE format (OpReg tmp_r) dst_r
]) -- NB: We don't need to zero-extend the result for the
-- W8/W16 cases because the 'MOV' insn already
-- took care of implicitly clearing the upper bits
where
bw = widthInBits width
platform = targetPlatform dflags
format = if width == W8 then II16 else intFormat width
genCCall dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args bid = do
targetExpr <- cmmMakeDynamicReference dflags
......
......@@ -342,6 +342,8 @@ data Instr
-- bit counting instructions
| POPCNT Format Operand Reg -- [SSE4.2] count number of bits set to 1
| LZCNT Format Operand Reg -- [BMI2] count number of leading zeros
| TZCNT Format Operand Reg -- [BMI2] count number of trailing zeros
| BSF Format Operand Reg -- bit scan forward
| BSR Format Operand Reg -- bit scan reverse
......@@ -471,6 +473,8 @@ x86_regUsageOfInstr platform instr
DELTA _ -> noUsage
POPCNT _ src dst -> mkRU (use_R src []) [dst]
LZCNT _ src dst -> mkRU (use_R src []) [dst]
TZCNT _ src dst -> mkRU (use_R src []) [dst]
BSF _ src dst -> mkRU (use_R src []) [dst]
BSR _ src dst -> mkRU (use_R src []) [dst]
......@@ -653,6 +657,8 @@ x86_patchRegsOfInstr instr env
CLTD _ -> instr
POPCNT fmt src dst -> POPCNT fmt (patchOp src) (env dst)
LZCNT fmt src dst -> LZCNT fmt (patchOp src) (env dst)
TZCNT fmt src dst -> TZCNT fmt (patchOp src) (env dst)
PDEP fmt src mask dst -> PDEP fmt (patchOp src) (patchOp mask) (env dst)
PEXT fmt src mask dst -> PEXT fmt (patchOp src) (patchOp mask) (env dst)
BSF fmt src dst -> BSF fmt (patchOp src) (env dst)
......
......@@ -693,6 +693,8 @@ pprInstr (XOR FF64 src dst) = pprOpOp (sLit "xorpd") FF64 src dst
pprInstr (XOR format src dst) = pprFormatOpOp (sLit "xor") format src dst
pprInstr (POPCNT format src dst) = pprOpOp (sLit "popcnt") format src (OpReg dst)
pprInstr (LZCNT format src dst) = pprOpOp (sLit "lzcnt") format src (OpReg dst)
pprInstr (TZCNT format src dst) = pprOpOp (sLit "tzcnt") format src (OpReg dst)
pprInstr (BSF format src dst) = pprOpOp (sLit "bsf") format src (OpReg dst)
pprInstr (BSR format src dst) = pprOpOp (sLit "bsr") format src (OpReg dst)
......
......@@ -1088,6 +1088,16 @@ Some flags only make sense for particular target platforms.
SSE4.2 if your processor supports it but detects this automatically
so no flag is required.
.. ghc-flag:: -mbmi2
:shortdesc: (x86 only) Use BMI2 for bit manipulation operations
:type: dynamic
:category: platform-options
(x86 only, added in GHC 7.4.1) Use the BMI2 instruction set to
implement some bit operations when using the
:ref:`native code generator <native-code-gen>`. The resulting compiled
code will only run on processors that support BMI2 (Intel Haswell and newer, AMD Excavator, Zen and newer).
Miscellaneous flags
-------------------
......
......@@ -438,6 +438,9 @@ instance Bits Int where
{-# INLINE shift #-}
{-# INLINE bit #-}
{-# INLINE testBit #-}
-- We want popCnt# to be inlined in user code so that `ghc -msse4.2`
-- can compile it down to a popcnt instruction without an extra function call
{-# INLINE popCount #-}
zeroBits = 0
......@@ -478,13 +481,16 @@ instance Bits Int where
instance FiniteBits Int where
finiteBitSize _ = WORD_SIZE_IN_BITS
countLeadingZeros (I# x#) = I# (word2Int# (clz# (int2Word# x#)))
{-# INLINE countLeadingZeros #-}
countTrailingZeros (I# x#) = I# (word2Int# (ctz# (int2Word# x#)))
{-# INLINE countTrailingZeros #-}
-- | @since 2.01
instance Bits Word where
{-# INLINE shift #-}
{-# INLINE bit #-}
{-# INLINE testBit #-}
{-# INLINE popCount #-}
(W# x#) .&. (W# y#) = W# (x# `and#` y#)
(W# x#) .|. (W# y#) = W# (x# `or#` y#)
......@@ -519,7 +525,9 @@ instance Bits Word where
instance FiniteBits Word where
finiteBitSize _ = WORD_SIZE_IN_BITS
countLeadingZeros (W# x#) = I# (word2Int# (clz# x#))
{-# INLINE countLeadingZeros #-}
countTrailingZeros (W# x#) = I# (word2Int# (ctz# x#))
{-# INLINE countTrailingZeros #-}
-- | @since 2.01
instance Bits Integer where
......
......@@ -177,6 +177,7 @@ instance Bits Int8 where
{-# INLINE shift #-}
{-# INLINE bit #-}
{-# INLINE testBit #-}
{-# INLINE popCount #-}
(I8# x#) .&. (I8# y#) = I8# (word2Int# (int2Word# x# `and#` int2Word# y#))
(I8# x#) .|. (I8# y#) = I8# (word2Int# (int2Word# x# `or#` int2Word# y#))
......@@ -211,6 +212,8 @@ instance Bits Int8 where
-- | @since 4.6.0.0
instance FiniteBits Int8 where
{-# INLINE countLeadingZeros #-}
{-# INLINE countTrailingZeros #-}
finiteBitSize _ = 8
countLeadingZeros (I8# x#) = I# (word2Int# (clz8# (int2Word# x#)))
countTrailingZeros (I8# x#) = I# (word2Int# (ctz8# (int2Word# x#)))
......@@ -381,6 +384,7 @@ instance Bits Int16 where
{-# INLINE shift #-}
{-# INLINE bit #-}
{-# INLINE testBit #-}
{-# INLINE popCount #-}
(I16# x#) .&. (I16# y#) = I16# (word2Int# (int2Word# x# `and#` int2Word# y#))
(I16# x#) .|. (I16# y#) = I16# (word2Int# (int2Word# x# `or#` int2Word# y#))
......@@ -415,6 +419,8 @@ instance Bits Int16 where
-- | @since 4.6.0.0
instance FiniteBits Int16 where
{-# INLINE countLeadingZeros #-}
{-# INLINE countTrailingZeros #-}
finiteBitSize _ = 16
countLeadingZeros (I16# x#) = I# (word2Int# (clz16# (int2Word# x#)))
countTrailingZeros (I16# x#) = I# (word2Int# (ctz16# (int2Word# x#)))
......@@ -587,6 +593,7 @@ instance Bits Int32 where
{-# INLINE shift #-}
{-# INLINE bit #-}
{-# INLINE testBit #-}
{-# INLINE popCount #-}
(I32# x#) .&. (I32# y#) = I32# (word2Int# (int2Word# x# `and#` int2Word# y#))
(I32# x#) .|. (I32# y#) = I32# (word2Int# (int2Word# x# `or#` int2Word# y#))
......@@ -622,6 +629,8 @@ instance Bits Int32 where
-- | @since 4.6.0.0
instance FiniteBits Int32 where
{-# INLINE countLeadingZeros #-}
{-# INLINE countTrailingZeros #-}
finiteBitSize _ = 32
countLeadingZeros (I32# x#) = I# (word2Int# (clz32# (int2Word# x#)))
countTrailingZeros (I32# x#) = I# (word2Int# (ctz32# (int2Word# x#)))
......@@ -825,6 +834,7 @@ instance Bits Int64 where
{-# INLINE shift #-}
{-# INLINE bit #-}
{-# INLINE testBit #-}
{-# INLINE popCount #-}
(I64# x#) .&. (I64# y#) = I64# (word64ToInt64# (int64ToWord64# x# `and64#` int64ToWord64# y#))
(I64# x#) .|. (I64# y#) = I64# (word64ToInt64# (int64ToWord64# x# `or64#` int64ToWord64# y#))
......@@ -1002,6 +1012,7 @@ instance Bits Int64 where
{-# INLINE shift #-}
{-# INLINE bit #-}
{-# INLINE testBit #-}
{-# INLINE popCount #-}
(I64# x#) .&. (I64# y#) = I64# (word2Int# (int2Word# x# `and#` int2Word# y#))
(I64# x#) .|. (I64# y#) = I64# (word2Int# (int2Word# x# `or#` int2Word# y#))
......@@ -1078,6 +1089,8 @@ uncheckedIShiftRA64# = uncheckedIShiftRA#
-- | @since 4.6.0.0
instance FiniteBits Int64 where
{-# INLINE countLeadingZeros #-}
{-# INLINE countTrailingZeros #-}
finiteBitSize _ = 64
#if WORD_SIZE_IN_BITS < 64
countLeadingZeros (I64# x#) = I# (word2Int# (clz64# (int64ToWord64# x#)))
......
......@@ -168,6 +168,7 @@ instance Bits Word8 where
{-# INLINE shift #-}
{-# INLINE bit #-}
{-# INLINE testBit #-}
{-# INLINE popCount #-}
(W8# x#) .&. (W8# y#) = W8# (x# `and#` y#)
(W8# x#) .|. (W8# y#) = W8# (x# `or#` y#)
......@@ -201,6 +202,8 @@ instance Bits Word8 where
-- | @since 4.6.0.0
instance FiniteBits Word8 where
{-# INLINE countLeadingZeros #-}
{-# INLINE countTrailingZeros #-}
finiteBitSize _ = 8
countLeadingZeros (W8# x#) = I# (word2Int# (clz8# x#))
countTrailingZeros (W8# x#) = I# (word2Int# (ctz8# x#))
......@@ -356,6 +359,7 @@ instance Bits Word16 where
{-# INLINE shift #-}
{-# INLINE bit #-}
{-# INLINE testBit #-}
{-# INLINE popCount #-}
(W16# x#) .&. (W16# y#) = W16# (x# `and#` y#)
(W16# x#) .|. (W16# y#) = W16# (x# `or#` y#)
......@@ -389,6 +393,8 @@ instance Bits Word16 where
-- | @since 4.6.0.0
instance FiniteBits Word16 where
{-# INLINE countLeadingZeros #-}
{-# INLINE countTrailingZeros #-}
finiteBitSize _ = 16
countLeadingZeros (W16# x#) = I# (word2Int# (clz16# x#))
countTrailingZeros (W16# x#) = I# (word2Int# (ctz16# x#))
......@@ -590,6 +596,7 @@ instance Bits Word32 where
{-# INLINE shift #-}
{-# INLINE bit #-}
{-# INLINE testBit #-}
{-# INLINE popCount #-}
(W32# x#) .&. (W32# y#) = W32# (x# `and#` y#)
(W32# x#) .|. (W32# y#) = W32# (x# `or#` y#)
......@@ -623,6 +630,8 @@ instance Bits Word32 where
-- | @since 4.6.0.0
instance FiniteBits Word32 where
{-# INLINE countLeadingZeros #-}
{-# INLINE countTrailingZeros #-}
finiteBitSize _ = 32
countLeadingZeros (W32# x#) = I# (word2Int# (clz32# x#))
countTrailingZeros (W32# x#) = I# (word2Int# (ctz32# x#))
......@@ -762,6 +771,7 @@ instance Bits Word64 where
{-# INLINE shift #-}
{-# INLINE bit #-}
{-# INLINE testBit #-}
{-# INLINE popCount #-}
(W64# x#) .&. (W64# y#) = W64# (x# `and64#` y#)
(W64# x#) .|. (W64# y#) = W64# (x# `or64#` y#)
......@@ -914,6 +924,7 @@ instance Bits Word64 where
{-# INLINE shift #-}
{-# INLINE bit #-}
{-# INLINE testBit #-}
{-# INLINE popCount #-}
(W64# x#) .&. (W64# y#) = W64# (x# `and#` y#)
(W64# x#) .|. (W64# y#) = W64# (x# `or#` y#)
......@@ -959,6 +970,8 @@ uncheckedShiftRL64# = uncheckedShiftRL#
-- | @since 4.6.0.0
instance FiniteBits Word64 where
{-# INLINE countLeadingZeros #-}
{-# INLINE countTrailingZeros #-}
finiteBitSize _ = 64
countLeadingZeros (W64# x#) = I# (word2Int# (clz64# x#))
countTrailingZeros (W64# x#) = I# (word2Int# (ctz64# x#))
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment