From 71010381f4270966de334193ab2bfc67f8524212 Mon Sep 17 00:00:00 2001 From: Alex Mason <git@axman6.com> Date: Fri, 26 Apr 2024 16:55:36 +1000 Subject: [PATCH] Add AArch64 CLZ, CTZ, RBIT primop implementations. Adds support for emitting the clz and rbit instructions, which are used by GHC.Prim.clz*#, GHC.Prim.ctz*# and GHC.Prim.bitReverse*#. --- compiler/GHC/CmmToAsm/AArch64/CodeGen.hs | 134 +++++++++++++++++- compiler/GHC/CmmToAsm/AArch64/Instr.hs | 14 +- compiler/GHC/CmmToAsm/AArch64/Ppr.hs | 2 + testsuite/tests/codeGen/should_run/CtzClz0.hs | 27 ++++ testsuite/tests/codeGen/should_run/all.T | 1 + 5 files changed, 173 insertions(+), 5 deletions(-) create mode 100644 testsuite/tests/codeGen/should_run/CtzClz0.hs diff --git a/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs b/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs index 9b642ea3cff..94cb802e700 100644 --- a/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs +++ b/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs @@ -1757,6 +1757,137 @@ genCCall target dest_regs arg_regs bid = do truncateReg W64 w lo , Nothing) | otherwise -> unsupported (MO_U_Mul2 w) + PrimTarget (MO_Clz w) + | w == W64 || w == W32 + , [src] <- arg_regs + , [dst] <- dest_regs + -> do + (reg_a, _format_x, code_x) <- getSomeReg src + let dst_reg = getRegisterReg platform (CmmLocal dst) + return ( + code_x `snocOL` + CLZ (OpReg w dst_reg) (OpReg w reg_a) + , Nothing) + | w == W16 + , [src] <- arg_regs + , [dst] <- dest_regs + -> do + (reg_a, _format_x, code_x) <- getSomeReg src + let dst' = getRegisterReg platform (CmmLocal dst) + r n = OpReg W32 n + imm n = OpImm (ImmInt n) + {- dst = clz(x << 16 | 0x0000_8000) -} + return ( + code_x `appOL` toOL + [ LSL (r dst') (r reg_a) (imm 16) + , ORR (r dst') (r dst') (imm 0x00008000) + , CLZ (r dst') (r dst') + ] + , Nothing) + | w == W8 + , [src] <- arg_regs + , [dst] <- dest_regs + -> do + (reg_a, _format_x, code_x) <- getSomeReg src + let dst' = getRegisterReg platform (CmmLocal dst) + r n = OpReg W32 n + imm n = OpImm (ImmInt n) + {- dst = clz(x << 24 | 0x0080_0000) -} + return ( + code_x `appOL` toOL + [ LSL (r dst') (r reg_a) (imm 24) + , ORR (r dst') (r dst') (imm 0x00800000) + , CLZ (r dst') (r dst') + ] + , Nothing) + | otherwise -> unsupported (MO_Clz w) + PrimTarget (MO_Ctz w) + | w == W64 || w == W32 + , [src] <- arg_regs + , [dst] <- dest_regs + -> do + (reg_a, _format_x, code_x) <- getSomeReg src + let dst_reg = getRegisterReg platform (CmmLocal dst) + return ( + code_x `snocOL` + RBIT (OpReg w dst_reg) (OpReg w reg_a) `snocOL` + CLZ (OpReg w dst_reg) (OpReg w dst_reg) + , Nothing) + | w == W16 + , [src] <- arg_regs + , [dst] <- dest_regs + -> do + (reg_a, _format_x, code_x) <- getSomeReg src + let dst' = getRegisterReg platform (CmmLocal dst) + r n = OpReg W32 n + imm n = OpImm (ImmInt n) + {- dst = clz(reverseBits(x) | 0x0000_8000) -} + return ( + code_x `appOL` toOL + [ RBIT (r dst') (r reg_a) + , ORR (r dst') (r dst') (imm 0x00008000) + , CLZ (r dst') (r dst') + ] + , Nothing) + | w == W8 + , [src] <- arg_regs + , [dst] <- dest_regs + -> do + (reg_a, _format_x, code_x) <- getSomeReg src + let dst' = getRegisterReg platform (CmmLocal dst) + r n = OpReg W32 n + imm n = OpImm (ImmInt n) + {- dst = clz(reverseBits(x) | 0x0080_0000) -} + return ( + code_x `appOL` toOL + [ RBIT (r dst') (r reg_a) + , ORR (r dst') (r dst') (imm 0x00800000) + , CLZ (r dst') (r dst') + ] + , Nothing) + | otherwise -> unsupported (MO_Ctz w) + PrimTarget (MO_BRev w) + | w == W64 || w == W32 + , [src] <- arg_regs + , [dst] <- dest_regs + -> do + (reg_a, _format_x, code_x) <- getSomeReg src + let dst_reg = getRegisterReg platform (CmmLocal dst) + return ( + code_x `snocOL` + RBIT (OpReg w dst_reg) (OpReg w reg_a) + , Nothing) + | w == W16 + , [src] <- arg_regs + , [dst] <- dest_regs + -> do + (reg_a, _format_x, code_x) <- getSomeReg src + let dst' = getRegisterReg platform (CmmLocal dst) + r n = OpReg W32 n + imm n = OpImm (ImmInt n) + {- dst = reverseBits32(x << 16) -} + return ( + code_x `appOL` toOL + [ LSL (r dst') (r reg_a) (imm 16) + , RBIT (r dst') (r dst') + ] + , Nothing) + | w == W8 + , [src] <- arg_regs + , [dst] <- dest_regs + -> do + (reg_a, _format_x, code_x) <- getSomeReg src + let dst' = getRegisterReg platform (CmmLocal dst) + r n = OpReg W32 n + imm n = OpImm (ImmInt n) + {- dst = reverseBits32(x << 24) -} + return ( + code_x `appOL` toOL + [ LSL (r dst') (r reg_a) (imm 24) + , RBIT (r dst') (r dst') + ] + , Nothing) + | otherwise -> unsupported (MO_BRev w) -- or a possibly side-effecting machine operation @@ -1883,10 +2014,7 @@ genCCall target dest_regs arg_regs bid = do MO_PopCnt w -> mkCCall (popCntLabel w) MO_Pdep w -> mkCCall (pdepLabel w) MO_Pext w -> mkCCall (pextLabel w) - MO_Clz w -> mkCCall (clzLabel w) - MO_Ctz w -> mkCCall (ctzLabel w) MO_BSwap w -> mkCCall (bSwapLabel w) - MO_BRev w -> mkCCall (bRevLabel w) -- -- Atomic read-modify-write. MO_AtomicRead w ord diff --git a/compiler/GHC/CmmToAsm/AArch64/Instr.hs b/compiler/GHC/CmmToAsm/AArch64/Instr.hs index 8836ab7d9f7..01443e02bad 100644 --- a/compiler/GHC/CmmToAsm/AArch64/Instr.hs +++ b/compiler/GHC/CmmToAsm/AArch64/Instr.hs @@ -100,6 +100,8 @@ regUsageOfInstr platform instr = case instr of UXTB dst src -> usage (regOp src, regOp dst) SXTH dst src -> usage (regOp src, regOp dst) UXTH dst src -> usage (regOp src, regOp dst) + CLZ dst src -> usage (regOp src, regOp dst) + RBIT dst src -> usage (regOp src, regOp dst) -- 3. Logical and Move Instructions ------------------------------------------ AND dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) ASR dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst) @@ -140,7 +142,8 @@ regUsageOfInstr platform instr = case instr of FMA _ dst src1 src2 src3 -> usage (regOp src1 ++ regOp src2 ++ regOp src3, regOp dst) - _ -> panic $ "regUsageOfInstr: " ++ instrCon instr + LOCATION{} -> panic $ "regUsageOfInstr: " ++ instrCon instr + NEWBLOCK{} -> panic $ "regUsageOfInstr: " ++ instrCon instr where -- filtering the usage is necessary, otherwise the register @@ -234,6 +237,8 @@ patchRegsOfInstr instr env = case instr of UXTB o1 o2 -> UXTB (patchOp o1) (patchOp o2) SXTH o1 o2 -> SXTH (patchOp o1) (patchOp o2) UXTH o1 o2 -> UXTH (patchOp o1) (patchOp o2) + CLZ o1 o2 -> CLZ (patchOp o1) (patchOp o2) + RBIT o1 o2 -> RBIT (patchOp o1) (patchOp o2) -- 3. Logical and Move Instructions ---------------------------------------- AND o1 o2 o3 -> AND (patchOp o1) (patchOp o2) (patchOp o3) @@ -276,7 +281,8 @@ patchRegsOfInstr instr env = case instr of FMA s o1 o2 o3 o4 -> FMA s (patchOp o1) (patchOp o2) (patchOp o3) (patchOp o4) - _ -> panic $ "patchRegsOfInstr: " ++ instrCon instr + NEWBLOCK{} -> panic $ "patchRegsOfInstr: " ++ instrCon instr + LOCATION{} -> panic $ "patchRegsOfInstr: " ++ instrCon instr where patchOp :: Operand -> Operand patchOp (OpReg w r) = OpReg w (env r) @@ -591,6 +597,8 @@ data Instr -- Signed/Unsigned bitfield extract | SBFX Operand Operand Operand Operand -- rd = rn[i,j] | UBFX Operand Operand Operand Operand -- rd = rn[i,j] + | CLZ Operand Operand -- rd = countLeadingZeros(rn) + | RBIT Operand Operand -- rd = reverseBits(rn) -- 3. Logical and Move Instructions ---------------------------------------- | AND Operand Operand Operand -- rd = rn & op2 @@ -676,6 +684,8 @@ instrCon i = UBFM{} -> "UBFM" SBFX{} -> "SBFX" UBFX{} -> "UBFX" + CLZ{} -> "CLZ" + RBIT{} -> "RBIT" AND{} -> "AND" ASR{} -> "ASR" EOR{} -> "EOR" diff --git a/compiler/GHC/CmmToAsm/AArch64/Ppr.hs b/compiler/GHC/CmmToAsm/AArch64/Ppr.hs index 8c5d08167bf..7fb0a5351cb 100644 --- a/compiler/GHC/CmmToAsm/AArch64/Ppr.hs +++ b/compiler/GHC/CmmToAsm/AArch64/Ppr.hs @@ -396,6 +396,8 @@ pprInstr platform instr = case instr of -- 2. Bit Manipulation Instructions ------------------------------------------ SBFM o1 o2 o3 o4 -> op4 (text "\tsbfm") o1 o2 o3 o4 UBFM o1 o2 o3 o4 -> op4 (text "\tubfm") o1 o2 o3 o4 + CLZ o1 o2 -> op2 (text "\tclz") o1 o2 + RBIT o1 o2 -> op2 (text "\trbit") o1 o2 -- signed and unsigned bitfield extract SBFX o1 o2 o3 o4 -> op4 (text "\tsbfx") o1 o2 o3 o4 UBFX o1 o2 o3 o4 -> op4 (text "\tubfx") o1 o2 o3 o4 diff --git a/testsuite/tests/codeGen/should_run/CtzClz0.hs b/testsuite/tests/codeGen/should_run/CtzClz0.hs new file mode 100644 index 00000000000..0caad01b1da --- /dev/null +++ b/testsuite/tests/codeGen/should_run/CtzClz0.hs @@ -0,0 +1,27 @@ +{-# LANGUAGE CPP #-} +{-# LANGUAGE MagicHash #-} + +module Main where + +import GHC.Exts +import Control.Monad + +#include <MachDeps.h> + +{-# OPAQUE x #-} -- needed to avoid triggering constant folding +x :: Word +x = 0 + +main :: IO () +main = do + let !(W# w) = x + + guard (W# (ctz# w) == WORD_SIZE_IN_BITS) + guard (W# (ctz8# w) == 8) + guard (W# (ctz16# w) == 16) + guard (W# (ctz32# w) == 32) + + guard (W# (clz# w) == WORD_SIZE_IN_BITS) + guard (W# (clz8# w) == 8) + guard (W# (clz16# w) == 16) + guard (W# (clz32# w) == 32) diff --git a/testsuite/tests/codeGen/should_run/all.T b/testsuite/tests/codeGen/should_run/all.T index 40ddadecde1..264c1877f80 100644 --- a/testsuite/tests/codeGen/should_run/all.T +++ b/testsuite/tests/codeGen/should_run/all.T @@ -246,3 +246,4 @@ test('T24295a', normal, compile_and_run, ['-O -floopification']) test('T24295b', normal, compile_and_run, ['-O -floopification -fpedantic-bottoms']) test('T24664a', normal, compile_and_run, ['-O']) test('T24664b', normal, compile_and_run, ['-O']) +test('CtzClz0', normal, compile_and_run, ['']) -- GitLab