NCG spills to stack when performing trivial bit arithmetic
Summary
This is inspired by the results in https://github.com/andrewthad/haskell-ip/pull/61. The problem is that NCG spills onto the stack. I don't know if the best place to fix this is in cmm or in the NCG.
Steps to reproduce
Here is a minimal version of the code:
{-# language BangPatterns #-}
{-# language MagicHash #-}
{-# language TypeApplications #-}
{-# OPTIONS_GHC -O2 -fforce-recomp #-}
module TestIpReserved
( reserved#
) where
import GHC.Exts
import GHC.Word
import Data.Bits
-- On 32-bit platforms, returns 1 on success. On 64-bit platforms,
-- returns 2 on success. On all platforms, returns 0 if the address
-- is not reserved.
reserved# :: Word# -> Word#
{-# noinline reserved# #-}
reserved# w# =
let w = W# w#
a = fromOctets 0 0 0 0
b = fromOctets 100 64 0 0
c = fromOctets 127 0 0 0
d = fromOctets 169 254 0 0
e = fromOctets 192 0 0 0
f = fromOctets 192 0 2 0
g = fromOctets 192 88 99 0
h = fromOctets 198 18 0 0
i = fromOctets 198 51 100 0
j = fromOctets 203 0 113 0
k = fromOctets 224 0 0 0
l = fromOctets 240 0 0 0
!(W# r) = flip unsafeShiftR 5 $ fromIntegral @Int @Word $
countTrailingZeros ((fromIntegral @Word32 @Word mask4 .&. w) `xor` fromIntegral @Word32 @Word k)
.|. countTrailingZeros ((fromIntegral @Word32 @Word mask4 .&. w) `xor` fromIntegral @Word32 @Word l)
.|. countTrailingZeros ((fromIntegral @Word32 @Word mask8 .&. w) `xor` fromIntegral @Word32 @Word p24)
.|. countTrailingZeros ((fromIntegral @Word32 @Word mask8 .&. w) `xor` fromIntegral @Word32 @Word a)
.|. countTrailingZeros ((fromIntegral @Word32 @Word mask8 .&. w) `xor` fromIntegral @Word32 @Word c)
.|. countTrailingZeros ((fromIntegral @Word32 @Word mask10 .&. w) `xor` fromIntegral @Word32 @Word b)
.|. countTrailingZeros ((fromIntegral @Word32 @Word mask12 .&. w) `xor` fromIntegral @Word32 @Word p20)
.|. countTrailingZeros ((fromIntegral @Word32 @Word mask15 .&. w) `xor` fromIntegral @Word32 @Word h)
.|. countTrailingZeros ((fromIntegral @Word32 @Word mask16 .&. w) `xor` fromIntegral @Word32 @Word d)
.|. countTrailingZeros ((fromIntegral @Word32 @Word mask16 .&. w) `xor` fromIntegral @Word32 @Word p16)
.|. countTrailingZeros ((fromIntegral @Word32 @Word mask24 .&. w) `xor` fromIntegral @Word32 @Word e)
.|. countTrailingZeros ((fromIntegral @Word32 @Word mask24 .&. w) `xor` fromIntegral @Word32 @Word f)
.|. countTrailingZeros ((fromIntegral @Word32 @Word mask24 .&. w) `xor` fromIntegral @Word32 @Word g)
.|. countTrailingZeros ((fromIntegral @Word32 @Word mask24 .&. w) `xor` fromIntegral @Word32 @Word i)
.|. countTrailingZeros ((fromIntegral @Word32 @Word mask24 .&. w) `xor` fromIntegral @Word32 @Word j)
in r
mask8,mask4,mask12,mask16,mask10,mask24,mask15 :: Word32
mask4 = 0xF0000000
mask8 = 0xFF000000
mask10 = 0xFFC00000
mask12 = 0xFFF00000
mask15 = 0xFFFE0000
mask16 = 0xFFFF0000
mask24 = 0xFFFFFF00
p24 :: Word32
p24 = fromOctets 10 0 0 0
p20 :: Word32
p20 = fromOctets 172 16 0 0
p16 :: Word32
p16 = fromOctets 192 168 0 0
fromOctets :: Word -> Word -> Word -> Word -> Word32
fromOctets a b c d = fromIntegral
( shiftL a 24
.|. shiftL b 16
.|. shiftL c 8
.|. d
)
The cmm we get from this looks just like I would expect it to:
[TestIpReserved.reserved#_entry() // [R2]
{ info_tbls: [(c3jm,
label: TestIpReserved.reserved#_info
rep: HeapRep static { Fun {arity: 1 fun_type: ArgSpec 4} }
srt: Nothing)]
stack_info: arg_space: 8 updfr_space: Just 8
}
{offset
c3jm: // global
(_c3jr::I64) = call MO_Ctz W64(R2 & 4294967040 ^ 3405803776);
(_c3jE::I64) = call MO_Ctz W64(R2 & 4294967040 ^ 3325256704);
(_c3jR::I64) = call MO_Ctz W64(R2 & 4294967040 ^ 3227017984);
(_c3k4::I64) = call MO_Ctz W64(R2 & 4294967040 ^ 3221225984);
(_c3kh::I64) = call MO_Ctz W64(R2 & 4294967040 ^ 3221225472);
(_c3ku::I64) = call MO_Ctz W64(R2 & 4294901760 ^ 3232235520);
(_c3kH::I64) = call MO_Ctz W64(R2 & 4294901760 ^ 2851995648);
(_c3kU::I64) = call MO_Ctz W64(R2 & 4294836224 ^ 3323068416);
(_c3l7::I64) = call MO_Ctz W64(R2 & 4293918720 ^ 2886729728);
(_c3lk::I64) = call MO_Ctz W64(R2 & 4290772992 ^ 1681915904);
(_c3lx::I64) = call MO_Ctz W64(R2 & 4278190080 ^ 2130706432);
(_c3lH::I64) = call MO_Ctz W64(R2 & 4278190080);
(_c3lU::I64) = call MO_Ctz W64(R2 & 4278190080 ^ 167772160);
(_c3m7::I64) = call MO_Ctz W64(R2 & 4026531840 ^ 4026531840);
(_c3mk::I64) = call MO_Ctz W64(R2 & 4026531840 ^ 3758096384);
R1 = _c3mk::I64 | _c3m7::I64 | _c3lU::I64 | _c3lH::I64 | _c3lx::I64 | _c3lk::I64 | _c3l7::I64 | _c3kU::I64 | _c3kH::I64 | _c3ku::I64 | _c3kh::I64 | _c3k4::I64 | _c3jR::I64 | _c3jE::I64 | _c3jr::I64 >> 5;
call (P64[Sp])(R1) args: 8, res: 0, upd: 8;
}
},
section ""data" . TestIpReserved.reserved#_closure" {
TestIpReserved.reserved#_closure:
const TestIpReserved.reserved#_info;
}]
However, the assembly is horrible. It uses a bunch of registers and then spills onto the stack. This is truncated after the stack spilling become apparent:
.globl TestIpReserved.reserved#_info
.type TestIpReserved.reserved#_info, @function
TestIpReserved.reserved#_info:
_c3jm:
movl $3405803776,%eax
movl $4294967040,%ebx
movq %r14,%rcx
andq %rbx,%rcx
xorq %rax,%rcx
bsf %rcx,%rax
movl $64,%ebx
cmovne %rax,%rbx
movl $3325256704,%eax
movl $4294967040,%ecx
movq %r14,%rdx
andq %rcx,%rdx
xorq %rax,%rdx
bsf %rdx,%rax
movl $64,%ecx
cmovne %rax,%rcx
movl $3227017984,%eax
movl $4294967040,%edx
movq %r14,%rsi
andq %rdx,%rsi
xorq %rax,%rsi
bsf %rsi,%rax
...
movl $64,%r10d
cmovne %rax,%r10
movl $2886729728,%eax
movl $4293918720,%r11d
movq %rbx,64(%rsp)
movq %r14,%rbx
andq %r11,%rbx
xorq %rax,%rbx
bsf %rbx,%rax
movl $64,%ebx
cmovne %rax,%rbx
movl $4290772992,%eax
movq %r14,%r11
andq %rax,%r11
xorq $1681915904,%r11
bsf %r11,%rax
movl $64,%r11d
cmovne %rax,%r11
movl $4278190080,%eax
movq %rcx,72(%rsp)
movq %r14,%rcx
andq %rax,%rcx
xorq $2130706432,%rcx
bsf %rcx,%rax
movl $64,%ecx
cmovne %rax,%rcx
movl $4278190080,%eax
movq %rdx,80(%rsp)
movq %r14,%rdx
andq %rax,%rdx
bsf %rdx,%rax
Expected Behavior
I don't expect stack spilling to happen.
Environment
- GHC version used: 8.8.1 and 8.10.1rc1
Optional:
- Operating System: Ubuntu 18.04
- System Architecture: x86_64