Commit af4cea7f authored by Artem Pyanykh, committed by Marge Bot

codegen: fix memset unroll for small bytearrays, add 64-bit sets

Fixes #16052

When the offset in `setByteArray#` is statically known, we can provide
better alignment guarantees than just 1 byte.

Also, memset can now do 64-bit wide sets.

The current memset intrinsic is still not optimal, however, and can be
improved for the case where we know that we are dealing with

(baseAddress at known alignment) + offset

For instance, on 64-bit,

`setByteArray# s 1# 23# 0#`

given that the bytearray is 8-byte aligned, could be unrolled into
`movb, movw, movl, movq, movq`; but currently it is
`movb` x23, since an alignment of 1 is all we can embed into the MO_Memset op.
parent 36d38047
@@ -2073,10 +2073,18 @@ doCopyAddrToByteArrayOp src_p dst dst_off bytes = do
 -- character.
 doSetByteArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
                  -> FCode ()
-doSetByteArrayOp ba off len c
-    = do dflags <- getDynFlags
-         p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags ba (arrWordsHdrSize dflags)) off
-         emitMemsetCall p c len 1
+doSetByteArrayOp ba off len c = do
+    dflags <- getDynFlags
+
+    let maxAlign = wORD_SIZE dflags
+        align = minimum [maxAlign, possibleAlign]
+
+    p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags ba (arrWordsHdrSize dflags)) off
+    emitMemsetCall p c len align
+  where
+    possibleAlign = case off of
+      CmmLit (CmmInt intOff _) -> fromIntegral $ byteAlignment (fromIntegral intOff)
+      _                        -> 1

 -- ----------------------------------------------------------------------------
 -- Allocating arrays
......
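For illustration, here is what the new code computes for a few offsets on 64-bit (where `wORD_SIZE dflags` is 8), using the `byteAlignment` helper added to Util further below. This is just a worked table, not part of the patch:

-- off = CmmLit (CmmInt 0 _)   =>  possibleAlign = 8  =>  align = 8
-- off = CmmLit (CmmInt 12 _)  =>  possibleAlign = 4  =>  align = 4
-- off = CmmLit (CmmInt 10 _)  =>  possibleAlign = 2  =>  align = 2
-- off = CmmLit (CmmInt 1 _)   =>  possibleAlign = 1  =>  align = 1
-- off not a literal           =>  possibleAlign = 1  =>  align = 1  (the old behaviour)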
@@ -1843,22 +1843,32 @@ genCCall dflags is32Bit (PrimTarget (MO_Memcpy align)) _
         dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone
                    (ImmInteger (n - i))

-genCCall dflags _ (PrimTarget (MO_Memset align)) _
+genCCall dflags is32Bit (PrimTarget (MO_Memset align)) _
          [dst,
           CmmLit (CmmInt c _),
           CmmLit (CmmInt n _)]
          _
-    | fromInteger insns <= maxInlineMemsetInsns dflags && align .&. 3 == 0 = do
+    | fromInteger insns <= maxInlineMemsetInsns dflags = do
         code_dst <- getAnyReg dst
         dst_r <- getNewRegNat format
-        return $ code_dst dst_r `appOL` go dst_r (fromInteger n)
+        if format == II64 && n >= 8 then do
+          code_imm8byte <- getAnyReg (CmmLit (CmmInt c8 W64))
+          imm8byte_r <- getNewRegNat II64
+          return $ code_dst dst_r `appOL`
+                   code_imm8byte imm8byte_r `appOL`
+                   go8 dst_r imm8byte_r (fromInteger n)
+        else
+          return $ code_dst dst_r `appOL`
+                   go4 dst_r (fromInteger n)
   where
-    (format, val) = case align .&. 3 of
-        2 -> (II16, c2)
-        0 -> (II32, c4)
-        _ -> (II8, c)
+    format = case byteAlignment (fromIntegral align) of
+        8 -> if is32Bit then II32 else II64
+        4 -> II32
+        2 -> II16
+        _ -> II8

     c2 = c `shiftL` 8 .|. c
     c4 = c2 `shiftL` 16 .|. c2
+    c8 = c4 `shiftL` 32 .|. c4

     -- The number of instructions we will generate (approx). We need one
     -- instruction per move.
@@ -1868,25 +1878,45 @@ genCCall dflags _ (PrimTarget (MO_Memset align)) _
     sizeBytes :: Integer
     sizeBytes = fromIntegral (formatInBytes format)

-    go :: Reg -> Integer -> OrdList Instr
-    go dst i
-        -- TODO: Add movabs instruction and support 64-bit sets.
-        | i >= sizeBytes = -- This might be smaller than the below sizes
-            unitOL (MOV format (OpImm (ImmInteger val)) (OpAddr dst_addr)) `appOL`
-            go dst (i - sizeBytes)
-        | i >= 4 = -- Will never happen on 32-bit
-            unitOL (MOV II32 (OpImm (ImmInteger c4)) (OpAddr dst_addr)) `appOL`
-            go dst (i - 4)
-        | i >= 2 =
-            unitOL (MOV II16 (OpImm (ImmInteger c2)) (OpAddr dst_addr)) `appOL`
-            go dst (i - 2)
-        | i >= 1 =
-            unitOL (MOV II8 (OpImm (ImmInteger c)) (OpAddr dst_addr)) `appOL`
-            go dst (i - 1)
-        | otherwise = nilOL
-      where
-        dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone
-                   (ImmInteger (n - i))
+    -- Depending on size returns the widest MOV instruction and its
+    -- width.
+    gen4 :: AddrMode -> Integer -> (InstrBlock, Integer)
+    gen4 addr size
+        | size >= 4 =
+            (unitOL (MOV II32 (OpImm (ImmInteger c4)) (OpAddr addr)), 4)
+        | size >= 2 =
+            (unitOL (MOV II16 (OpImm (ImmInteger c2)) (OpAddr addr)), 2)
+        | size >= 1 =
+            (unitOL (MOV II8 (OpImm (ImmInteger c)) (OpAddr addr)), 1)
+        | otherwise = (nilOL, 0)
+
+    -- Generates a 64-bit wide MOV instruction from REG to MEM.
+    gen8 :: AddrMode -> Reg -> InstrBlock
+    gen8 addr reg8byte =
+        unitOL (MOV format (OpReg reg8byte) (OpAddr addr))
+
+    -- Unrolls memset when the widest MOV is <= 4 bytes.
+    go4 :: Reg -> Integer -> InstrBlock
+    go4 dst left =
+        if left <= 0 then nilOL
+        else curMov `appOL` go4 dst (left - curWidth)
+      where
+        possibleWidth = minimum [left, sizeBytes]
+        dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left))
+        (curMov, curWidth) = gen4 dst_addr possibleWidth
+
+    -- Unrolls memset when the widest MOV is 8 bytes (thus another Reg
+    -- argument). Falls back to go4 when all 8 byte moves are
+    -- exhausted.
+    go8 :: Reg -> Reg -> Integer -> InstrBlock
+    go8 dst reg8byte left =
+        if possibleWidth >= 8 then
+            let curMov = gen8 dst_addr reg8byte
+            in curMov `appOL` go8 dst reg8byte (left - 8)
+        else go4 dst left
+      where
+        possibleWidth = minimum [left, sizeBytes]
+        dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left))

 genCCall _ _ (PrimTarget MO_WriteBarrier) _ _ _ = return nilOL
         -- write barrier compiles to no code on x86/x86-64;
......
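For a concrete feel of the new path, here is a sketch in plain Haskell (not GHC internals; `replicateByte` and `unrollWidths` are names invented for this example): the fill byte is replicated into the c2/c4/c8 constants, and go8/go4 then emit the widest store that still fits. For c = 1 and n = 23 at 8-byte alignment this reproduces the movq, movq, movl, movw, movb sequence checked by the new test below.

import Data.Bits (shiftL, (.|.))

-- Replicate the low byte of c across 2, 4, and 8 bytes, as c2/c4/c8 do.
replicateByte :: Integer -> Integer
replicateByte c = c8
  where
    c2 = c  `shiftL` 8  .|. c    -- 0x0101
    c4 = c2 `shiftL` 16 .|. c2   -- 0x01010101
    c8 = c4 `shiftL` 32 .|. c4   -- 0x0101010101010101

-- Store widths chosen by go8/go4: the widest store first, then narrower
-- stores for the tail.
unrollWidths :: Integer -> Integer -> [Integer]
unrollWidths maxWidth left
  | left <= 0 = []
  | otherwise = w : unrollWidths maxWidth (left - w)
  where
    w = head [s | s <- [8, 4, 2, 1], s <= maxWidth, s <= left]

-- replicateByte 1   == 72340172838076673 (the movq immediate below)
-- unrollWidths 8 23 == [8,8,4,2,1]       -> movq, movq, movl, movw, movb
-- unrollWidths 4 23 == [4,4,4,4,4,2,1]   (32-bit / 4-byte-aligned case)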
@@ -87,6 +87,7 @@ module Util (
         -- * Integers
         exactLog2,
+        byteAlignment,

         -- * Floating point
         readRational,
@@ -1149,6 +1150,15 @@ exactLog2 x
     pow2 x | x == 1    = 0
            | otherwise = 1 + pow2 (x `shiftR` 1)

+-- x is aligned at N bytes means the remainder from x / N is zero.
+-- Currently we are only interested in N <= 8, but this can be expanded
+-- to N <= 16 or N <= 32 if used within an SSE or AVX context.
+byteAlignment :: Integer -> Integer
+byteAlignment x = case x .&. 7 of
+    0 -> 8
+    4 -> 4
+    2 -> 2
+    _ -> 1
+
 {-
 -- -----------------------------------------------------------------------------
......
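A few sample evaluations, for reference (not part of the patch). Note the helper is deliberately conservative: any residue mod 8 other than 0, 4, or 2 reports alignment 1, so for example byteAlignment 6 is 1 even though 6 is divisible by 2:

-- byteAlignment 0  == 8   -- 0  .&. 7 == 0
-- byteAlignment 12 == 4   -- 12 .&. 7 == 4
-- byteAlignment 10 == 2   -- 10 .&. 7 == 2
-- byteAlignment 6  == 1   -- 6  .&. 7 == 6 falls through to the default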
@@ -1131,9 +1131,9 @@ def do_compile(name, way, should_fail, top_mod, extra_mods, extra_hc_opts, **kwa
     # no problems found, this test passed
     return passed()

-def compile_cmp_asm( name, way, extra_hc_opts ):
+def compile_cmp_asm( name, way, ext, extra_hc_opts ):
     print('Compile only, extra args = ', extra_hc_opts)
-    result = simple_build(name + '.cmm', way, '-keep-s-files -O ' + extra_hc_opts, 0, '', 0, 0)
+    result = simple_build(name + '.' + ext, way, '-keep-s-files -O ' + extra_hc_opts, 0, '', 0, 0)
     if badResult(result):
         return result
@@ -1153,6 +1153,24 @@ def compile_cmp_asm( name, way, ext, extra_hc_opts ):
     # no problems found, this test passed
     return passed()

+def compile_grep_asm( name, way, ext, is_substring, extra_hc_opts ):
+    print('Compile only, extra args = ', extra_hc_opts)
+    result = simple_build(name + '.' + ext, way, '-keep-s-files -O ' + extra_hc_opts, 0, '', 0, 0)
+    if badResult(result):
+        return result
+
+    expected_pat_file = find_expected_file(name, 'asm')
+    actual_asm_file = add_suffix(name, 's')
+
+    if not grep_output(join_normalisers(normalise_errmsg),
+                       expected_pat_file, actual_asm_file,
+                       is_substring):
+        return failBecause('asm mismatch')
+
+    # no problems found, this test passed
+    return passed()
+
 # -----------------------------------------------------------------------------
 # Compile-and-run tests
@@ -1735,6 +1753,43 @@ def compare_outputs(way, kind, normaliser, expected_file, actual_file, diff_file
     else:
         return False

+# Checks that each line from pattern_file is present in actual_file as
+# a substring or regex pattern depending on is_substring.
+def grep_output(normaliser, pattern_file, actual_file, is_substring=True):
+    expected_path = in_srcdir(pattern_file)
+    actual_path = in_testdir(actual_file)
+
+    expected_patterns = read_no_crs(expected_path).strip().split('\n')
+    actual_raw = read_no_crs(actual_path)
+    actual_str = normaliser(actual_raw)
+
+    success = True
+    failed_patterns = []
+
+    def regex_match(pat, actual):
+        return re.search(pat, actual) is not None
+
+    def substring_match(pat, actual):
+        return pat in actual
+
+    def is_match(pat, actual):
+        if is_substring:
+            return substring_match(pat, actual)
+        else:
+            return regex_match(pat, actual)
+
+    for pat in expected_patterns:
+        if not is_match(pat, actual_str):
+            success = False
+            failed_patterns.append(pat)
+
+    if not success:
+        print('Actual output does not contain the following patterns:')
+        for pat in failed_patterns:
+            print(pat)
+
+    return success
+
 # Note [Output comparison]
 #
 # We do two types of output comparison:
......
@@ -3,7 +3,8 @@ is_amd64_codegen = [
     when(unregisterised(), skip),
 ]

-test('memcpy', is_amd64_codegen, compile_cmp_asm, [''])
-test('memcpy-unroll', is_amd64_codegen, compile_cmp_asm, [''])
-test('memcpy-unroll-conprop', is_amd64_codegen, compile_cmp_asm, [''])
-test('memset-unroll', is_amd64_codegen, compile_cmp_asm, [''])
+test('memcpy', is_amd64_codegen, compile_cmp_asm, ['cmm', ''])
+test('memcpy-unroll', is_amd64_codegen, compile_cmp_asm, ['cmm', ''])
+test('memcpy-unroll-conprop', is_amd64_codegen, compile_cmp_asm, ['cmm', ''])
+test('memset-unroll', is_amd64_codegen, compile_cmp_asm, ['cmm', ''])
+test('bytearray-memset-unroll', is_amd64_codegen, compile_grep_asm, ['hs', True, ''])
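What follows appears to be the expected-pattern file for the new test (per compile_grep_asm above, it is located via find_expected_file(name, 'asm'), i.e. presumably bytearray-memset-unroll.asm). Each line must occur as a substring of the generated assembly: a 23-byte memset of byte value 1 at a word-aligned base unrolls into two movq of the replicated constant 72340172838076673 (0x0101010101010101), then movl, movw, movb for the remaining 4 + 2 + 1 bytes at offsets 16, 20 and 22: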
movq $72340172838076673,%rcx
movq %rcx,0(%rbx)
movq %rcx,8(%rbx)
movl $16843009,16(%rbx)
movw $257,20(%rbx)
movb $1,22(%rbx)
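And the test source (presumably bytearray-memset-unroll.hs, matching the ['hs', True, ''] arguments above):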
{-# language MagicHash #-}
{-# language UnboxedTuples #-}
module FillArray
( fill
) where
import GHC.Exts
import GHC.IO
data ByteArray = ByteArray ByteArray#
fill :: IO ByteArray
fill = IO $ \s0 -> case newByteArray# 24# s0 of
  (# s1, m #) -> case setByteArray# m 0# 23# 1# s1 of
    s2 -> case unsafeFreezeByteArray# m s2 of
      (# s3, r #) -> (# s3, ByteArray r #)
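Here fill allocates a 24-byte MutableByteArray# and sets its first 23 bytes to 1. Since the offset 0# is statically known, doSetByteArrayOp can claim word alignment for the payload, and on 64-bit the backend emits exactly the movq/movq/movl/movw/movb sequence in the pattern file above.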