Commit e0c1767d authored by Herbert Valerio Riedel's avatar Herbert Valerio Riedel 🕺

Implement new CLZ and CTZ primops (re #9340)

This implements the new primops

  clz#, clz32#, clz64#,
  ctz#, ctz32#, ctz64#

which provide efficient implementations of the popular
count-leading-zero and count-trailing-zero respectively
(see testcase for a pure Haskell reference implementation).

On x86, NCG as well as LLVM generates code based on the BSF/BSR
instructions (which need extra logic to make the 0-case well-defined).

Test Plan: validate and succesful tests on i686 and amd64

Reviewers: rwbarton, simonmar, ezyang, austin

Subscribers: simonmar, relrod, ezyang, carter

Differential Revision: https://phabricator.haskell.org/D144

GHC Trac Issues: #9340
parent 6b5ea617
......@@ -549,6 +549,9 @@ data CallishMachOp
| MO_Memmove
| MO_PopCnt Width
| MO_Clz Width
| MO_Ctz Width
| MO_BSwap Width
-- Atomic read-modify-write.
......
......@@ -753,6 +753,8 @@ pprCallishMachOp_for_C mop
MO_Memmove -> ptext (sLit "memmove")
(MO_BSwap w) -> ptext (sLit $ bSwapLabel w)
(MO_PopCnt w) -> ptext (sLit $ popCntLabel w)
(MO_Clz w) -> ptext (sLit $ clzLabel w)
(MO_Ctz w) -> ptext (sLit $ ctzLabel w)
(MO_AtomicRMW w amop) -> ptext (sLit $ atomicRMWLabel w amop)
(MO_Cmpxchg w) -> ptext (sLit $ cmpxchgLabel w)
(MO_AtomicRead w) -> ptext (sLit $ atomicReadLabel w)
......
......@@ -563,6 +563,20 @@ emitPrimOp _ [res] PopCnt32Op [w] = emitPopCntCall res w W32
emitPrimOp _ [res] PopCnt64Op [w] = emitPopCntCall res w W64
emitPrimOp dflags [res] PopCntOp [w] = emitPopCntCall res w (wordWidth dflags)
-- count leading zeros
emitPrimOp _ [res] Clz8Op [w] = emitClzCall res w W8
emitPrimOp _ [res] Clz16Op [w] = emitClzCall res w W16
emitPrimOp _ [res] Clz32Op [w] = emitClzCall res w W32
emitPrimOp _ [res] Clz64Op [w] = emitClzCall res w W64
emitPrimOp dflags [res] ClzOp [w] = emitClzCall res w (wordWidth dflags)
-- count trailing zeros
emitPrimOp _ [res] Ctz8Op [w] = emitCtzCall res w W8
emitPrimOp _ [res] Ctz16Op [w] = emitCtzCall res w W16
emitPrimOp _ [res] Ctz32Op [w] = emitCtzCall res w W32
emitPrimOp _ [res] Ctz64Op [w] = emitCtzCall res w W64
emitPrimOp dflags [res] CtzOp [w] = emitCtzCall res w (wordWidth dflags)
-- Unsigned int to floating point conversions
emitPrimOp _ [res] Word2FloatOp [w] = emitPrimCall [res]
(MO_UF_Conv W32) [w]
......@@ -2096,3 +2110,17 @@ emitPopCntCall res x width = do
[ res ]
(MO_PopCnt width)
[ x ]
emitClzCall :: LocalReg -> CmmExpr -> Width -> FCode ()
emitClzCall res x width = do
emitPrimCall
[ res ]
(MO_Clz width)
[ x ]
emitCtzCall :: LocalReg -> CmmExpr -> Width -> FCode ()
emitCtzCall res x width = do
emitPrimCall
[ res ]
(MO_Ctz width)
[ x ]
......@@ -224,9 +224,14 @@ genCall t@(PrimTarget (MO_Prefetch_Data localityInt)) [] args
return (stmts, top1 ++ top2)
| otherwise = panic $ "prefetch locality level integer must be between 0 and 3, given: " ++ (show localityInt)
-- Handle PopCnt and BSwap that need to only convert arg and return types
-- Handle PopCnt, Clz, Ctz, and BSwap that need to only convert arg
-- and return types
genCall t@(PrimTarget (MO_PopCnt w)) dsts args =
genCallSimpleCast w t dsts args
genCall t@(PrimTarget (MO_Clz w)) dsts args =
genCallSimpleCast w t dsts args
genCall t@(PrimTarget (MO_Ctz w)) dsts args =
genCallSimpleCast w t dsts args
genCall t@(PrimTarget (MO_BSwap w)) dsts args =
genCallSimpleCast w t dsts args
......@@ -558,6 +563,8 @@ cmmPrimOpFunctions mop = do
(MO_PopCnt w) -> fsLit $ "llvm.ctpop." ++ showSDoc dflags (ppr $ widthToLlvmInt w)
(MO_BSwap w) -> fsLit $ "llvm.bswap." ++ showSDoc dflags (ppr $ widthToLlvmInt w)
(MO_Clz w) -> fsLit $ "llvm.ctlz." ++ showSDoc dflags (ppr $ widthToLlvmInt w)
(MO_Ctz w) -> fsLit $ "llvm.cttz." ++ showSDoc dflags (ppr $ widthToLlvmInt w)
(MO_Prefetch_Data _ )-> fsLit "llvm.prefetch"
......
......@@ -6,6 +6,8 @@ module CPrim
, cmpxchgLabel
, popCntLabel
, bSwapLabel
, clzLabel
, ctzLabel
, word2FloatLabel
) where
......@@ -30,6 +32,24 @@ bSwapLabel w = "hs_bswap" ++ pprWidth w
pprWidth W64 = "64"
pprWidth w = pprPanic "bSwapLabel: Unsupported word width " (ppr w)
clzLabel :: Width -> String
clzLabel w = "hs_clz" ++ pprWidth w
where
pprWidth W8 = "8"
pprWidth W16 = "16"
pprWidth W32 = "32"
pprWidth W64 = "64"
pprWidth w = pprPanic "clzLabel: Unsupported word width " (ppr w)
ctzLabel :: Width -> String
ctzLabel w = "hs_ctz" ++ pprWidth w
where
pprWidth W8 = "8"
pprWidth W16 = "16"
pprWidth W32 = "32"
pprWidth W64 = "64"
pprWidth w = pprPanic "ctzLabel: Unsupported word width " (ppr w)
word2FloatLabel :: Width -> String
word2FloatLabel w = "hs_word2float" ++ pprWidth w
where
......
......@@ -1151,6 +1151,8 @@ genCCall' dflags gcp target dest_regs args0
MO_BSwap w -> (fsLit $ bSwapLabel w, False)
MO_PopCnt w -> (fsLit $ popCntLabel w, False)
MO_Clz w -> (fsLit $ clzLabel w, False)
MO_Ctz w -> (fsLit $ ctzLabel w, False)
MO_AtomicRMW w amop -> (fsLit $ atomicRMWLabel w amop, False)
MO_Cmpxchg w -> (fsLit $ cmpxchgLabel w, False)
MO_AtomicRead w -> (fsLit $ atomicReadLabel w, False)
......
......@@ -654,6 +654,8 @@ outOfLineMachOp_table mop
MO_BSwap w -> fsLit $ bSwapLabel w
MO_PopCnt w -> fsLit $ popCntLabel w
MO_Clz w -> fsLit $ clzLabel w
MO_Ctz w -> fsLit $ ctzLabel w
MO_AtomicRMW w amop -> fsLit $ atomicRMWLabel w amop
MO_Cmpxchg w -> fsLit $ cmpxchgLabel w
MO_AtomicRead w -> fsLit $ atomicReadLabel w
......
......@@ -1767,6 +1767,69 @@ genCCall dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst]
size = intSize width
lbl = mkCmmCodeLabel primPackageKey (fsLit (popCntLabel width))
genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src]
| is32Bit && width == W64 = do
-- Fallback to `hs_clz64` on i386
targetExpr <- cmmMakeDynamicReference dflags CallReference lbl
let target = ForeignTarget targetExpr (ForeignConvention CCallConv
[NoHint] [NoHint]
CmmMayReturn)
genCCall dflags is32Bit target dest_regs args
| otherwise = do
code_src <- getAnyReg src
src_r <- getNewRegNat size
tmp_r <- getNewRegNat size
let dst_r = getRegisterReg platform False (CmmLocal dst)
-- The following insn sequence makes sure 'clz 0' has a defined value.
-- starting with Haswell, one could use the LZCNT insn instead.
return $ code_src src_r `appOL` toOL
([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
[ BSR size (OpReg src_r) tmp_r
, MOV II32 (OpImm (ImmInt (2*bw-1))) (OpReg dst_r)
, CMOV NE size (OpReg tmp_r) dst_r
, XOR size (OpImm (ImmInt (bw-1))) (OpReg dst_r)
]) -- NB: We don't need to zero-extend the result for the
-- W8/W16 cases because the 'MOV' insn already
-- took care of implicitly clearing the upper bits
where
bw = widthInBits width
platform = targetPlatform dflags
size = if width == W8 then II16 else intSize width
lbl = mkCmmCodeLabel primPackageKey (fsLit (clzLabel width))
genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) dest_regs@[dst] args@[src]
| is32Bit, width == W64 = do
-- Fallback to `hs_ctz64` on i386
targetExpr <- cmmMakeDynamicReference dflags CallReference lbl
let target = ForeignTarget targetExpr (ForeignConvention CCallConv
[NoHint] [NoHint]
CmmMayReturn)
genCCall dflags is32Bit target dest_regs args
| otherwise = do
code_src <- getAnyReg src
src_r <- getNewRegNat size
tmp_r <- getNewRegNat size
let dst_r = getRegisterReg platform False (CmmLocal dst)
-- The following insn sequence makes sure 'ctz 0' has a defined value.
-- starting with Haswell, one could use the TZCNT insn instead.
return $ code_src src_r `appOL` toOL
([ MOVZxL II8 (OpReg src_r) (OpReg src_r) | width == W8 ] ++
[ BSF size (OpReg src_r) tmp_r
, MOV II32 (OpImm (ImmInt bw)) (OpReg dst_r)
, CMOV NE size (OpReg tmp_r) dst_r
]) -- NB: We don't need to zero-extend the result for the
-- W8/W16 cases because the 'MOV' insn already
-- took care of implicitly clearing the upper bits
where
bw = widthInBits width
platform = targetPlatform dflags
size = if width == W8 then II16 else intSize width
lbl = mkCmmCodeLabel primPackageKey (fsLit (ctzLabel width))
genCCall dflags is32Bit (PrimTarget (MO_UF_Conv width)) dest_regs args = do
targetExpr <- cmmMakeDynamicReference dflags
CallReference lbl
......@@ -2403,6 +2466,8 @@ outOfLineCmmOp mop res args
MO_PopCnt _ -> fsLit "popcnt"
MO_BSwap _ -> fsLit "bswap"
MO_Clz w -> fsLit $ clzLabel w
MO_Ctz w -> fsLit $ ctzLabel w
MO_AtomicRMW _ _ -> fsLit "atomicrmw"
MO_AtomicRead _ -> fsLit "atomicread"
......
......@@ -386,6 +386,28 @@ primop PopCnt64Op "popCnt64#" GenPrimOp WORD64 -> Word#
primop PopCntOp "popCnt#" Monadic Word# -> Word#
{Count the number of set bits in a word.}
primop Clz8Op "clz8#" Monadic Word# -> Word#
{Count leading zeros in the lower 8 bits of a word.}
primop Clz16Op "clz16#" Monadic Word# -> Word#
{Count leading zeros in the lower 16 bits of a word.}
primop Clz32Op "clz32#" Monadic Word# -> Word#
{Count leading zeros in the lower 32 bits of a word.}
primop Clz64Op "clz64#" GenPrimOp WORD64 -> Word#
{Count leading zeros in a 64-bit word.}
primop ClzOp "clz#" Monadic Word# -> Word#
{Count leading zeros in a word.}
primop Ctz8Op "ctz8#" Monadic Word# -> Word#
{Count trailing zeros in the lower 8 bits of a word.}
primop Ctz16Op "ctz16#" Monadic Word# -> Word#
{Count trailing zeros in the lower 16 bits of a word.}
primop Ctz32Op "ctz32#" Monadic Word# -> Word#
{Count trailing zeros in the lower 32 bits of a word.}
primop Ctz64Op "ctz64#" GenPrimOp WORD64 -> Word#
{Count trailing zeros in a 64-bit word.}
primop CtzOp "ctz#" Monadic Word# -> Word#
{Count trailing zeros in a word.}
primop BSwap16Op "byteSwap16#" Monadic Word# -> Word#
{Swap bytes in the lower 16 bits of a word. The higher bytes are undefined. }
primop BSwap32Op "byteSwap32#" Monadic Word# -> Word#
......
......@@ -32,4 +32,16 @@ StgWord hs_popcnt(StgWord x);
StgFloat hs_word2float32(StgWord x);
StgDouble hs_word2float64(StgWord x);
/* libraries/ghc-prim/cbits/clz.c */
StgWord hs_clz8(StgWord x);
StgWord hs_clz16(StgWord x);
StgWord hs_clz32(StgWord x);
StgWord hs_clz64(StgWord64 x);
/* libraries/ghc-prim/cbits/ctz.c */
StgWord hs_ctz8(StgWord x);
StgWord hs_ctz16(StgWord x);
StgWord hs_ctz32(StgWord x);
StgWord hs_ctz64(StgWord64 x);
#endif /* PRIM_H */
#include "MachDeps.h"
#include "Rts.h"
#include <stdint.h>
// Fall-back implementations for count-leading-zeros primop
//
// __builtin_clz*() is supported by GCC and Clang
#if SIZEOF_UNSIGNED_INT == 4
StgWord
hs_clz8(StgWord x)
{
return (uint8_t)x ? __builtin_clz((uint8_t)x)-24 : 8;
}
StgWord
hs_clz16(StgWord x)
{
return (uint16_t)x ? __builtin_clz((uint16_t)x)-16 : 16;
}
StgWord
hs_clz32(StgWord x)
{
return (uint32_t)x ? __builtin_clz((uint32_t)x) : 32;
}
#else
# error no suitable __builtin_clz() found
#endif
StgWord
hs_clz64(StgWord64 x)
{
#if SIZEOF_UNSIGNED_LONG == 8
return x ? __builtin_clzl(x) : 64;
#elif SIZEOF_UNSIGNED_LONG_LONG == 8
return x ? __builtin_clzll(x) : 64;
#else
# error no suitable __builtin_clz() found
#endif
}
#include "MachDeps.h"
#include "Rts.h"
#include <stdint.h>
// Fall-back implementations for count-trailing-zeros primop
//
// __builtin_ctz*() is supported by GCC and Clang
#if SIZEOF_UNSIGNED_INT == 4
StgWord
hs_ctz8(StgWord x)
{
return (uint8_t)x ? __builtin_ctz(x) : 8;
}
StgWord
hs_ctz16(StgWord x)
{
return (uint16_t)x ? __builtin_ctz(x) : 16;
}
StgWord
hs_ctz32(StgWord x)
{
return (uint32_t)x ? __builtin_ctz(x) : 32;
}
#else
# error no suitable __builtin_ctz() found
#endif
StgWord
hs_ctz64(StgWord64 x)
{
#if SIZEOF_UNSIGNED_LONG == 8
return x ? __builtin_ctzl(x) : 64;
#elif SIZEOF_UNSIGNED_LONG_LONG == 8
return x ? __builtin_ctzll(x) : 64;
#else
# error no suitable __builtin_ctz() found
#endif
}
......@@ -54,6 +54,8 @@ Library
c-sources:
cbits/atomic.c
cbits/bswap.c
cbits/clz.c
cbits/ctz.c
cbits/debug.c
cbits/longlong.c
cbits/popcnt.c
......
......@@ -181,6 +181,7 @@ mk/ghcconfig_*_inplace_bin_ghc-stage2.exe.mk
/tests/codeGen/should_run/T8256
/tests/codeGen/should_run/T9001
/tests/codeGen/should_run/T9013
/tests/codeGen/should_run/T9340
/tests/codeGen/should_run/Word2Float64
/tests/codeGen/should_run/cgrun001
/tests/codeGen/should_run/cgrun002
......
{-# LANGUAGE MagicHash #-}
import Control.Monad
import Data.Bits
import GHC.Exts
import GHC.Word
import Numeric (showHex)
-- Reference Implementation
-- count trailing zeros
ctzRI :: FiniteBits a => a -> Word
ctzRI x = fromIntegral $ go 0
where
go i | i >= w = i
| testBit x i = i
| otherwise = go (i+1)
w = finiteBitSize x
-- count leading zeros
clzRI :: FiniteBits a => a -> Word
clzRI x = fromIntegral $ (w-1) - go (w-1)
where
go i | i < 0 = i -- no bit set
| testBit x i = i
| otherwise = go (i-1)
w = finiteBitSize x
clzRI32, ctzRI32 :: Word -> Word
clzRI32 x = clzRI (fromIntegral x :: Word32)
ctzRI32 x = ctzRI (fromIntegral x :: Word32)
clzRI16, ctzRI16 :: Word -> Word
clzRI16 x = clzRI (fromIntegral x :: Word16)
ctzRI16 x = ctzRI (fromIntegral x :: Word16)
clzRI8, ctzRI8 :: Word -> Word
clzRI8 x = clzRI (fromIntegral x :: Word8)
ctzRI8 x = ctzRI (fromIntegral x :: Word8)
-- Implementation Under Test
ctzIUT, clzIUT :: Word -> Word
ctzIUT (W# x#) = W# (ctz# x#)
clzIUT (W# x#) = W# (clz# x#)
ctzIUT8, clzIUT8 :: Word -> Word
ctzIUT8 (W# x#) = W# (ctz8# x#)
clzIUT8 (W# x#) = W# (clz8# x#)
ctzIUT16, clzIUT16 :: Word -> Word
ctzIUT16 (W# x#) = W# (ctz16# x#)
clzIUT16 (W# x#) = W# (clz16# x#)
ctzIUT32, clzIUT32 :: Word -> Word
ctzIUT32 (W# x#) = W# (ctz32# x#)
clzIUT32 (W# x#) = W# (clz32# x#)
ctzIUT64, clzIUT64 :: Word64 -> Word
ctzIUT64 (W64# x#) = W# (ctz64# x#)
clzIUT64 (W64# x#) = W# (clz64# x#)
main :: IO ()
main = do
forM_ testpats $ \w64 -> do
let w = fromIntegral w64 :: Word
check "clz" clzRI clzIUT w
check "clz8" clzRI8 clzIUT8 w
check "clz16" clzRI16 clzIUT16 w
check "clz32" clzRI32 clzIUT32 w
check "clz64" clzRI clzIUT64 w64
check "ctz" ctzRI ctzIUT w
check "ctz8" ctzRI8 ctzIUT8 w
check "ctz16" ctzRI16 ctzIUT16 w
check "ctz32" ctzRI32 ctzIUT32 w
check "ctz64" ctzRI ctzIUT64 w64
putStrLn $ concat ["tested ", show (length testpats), " patterns"]
where
-- try to construct some interesting patterns
testpats :: [Word64]
testpats = [ bit i - 1 | i <- [0..63] ] ++
[ complement (bit i - 1) | i <- [0..63] ] ++
[ bit i .|. bit j | i <- [0..63], j <- [0..i] ]
check s fri fiut v = unless (vri == viut) $ do
putStrLn $ concat [ "FAILED ", s, " for x=0x", showHex v ""
, " (RI=", show vri, " IUT=", show viut, ")"
]
where
vri = fri v
viut = fiut v
......@@ -122,3 +122,4 @@ test('SizeOfSmallArray', normal, compile_and_run, [''])
test('T9001', normal, compile_and_run, [''])
test('T9013', omit_ways(['ghci']), # ghci doesn't support unboxed tuples
compile_and_run, [''])
test('T9340', normal, compile_and_run, [''])
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment