NCG spills to stack when performing trivial bit arithmetic

Summary

This is inspired by the results in https://github.com/andrewthad/haskell-ip/pull/61. The problem is that NCG spills onto the stack. I don't know if the best place to fix this is in cmm or in the NCG.

Steps to reproduce

Here is a minimal version of the code:

{-# language BangPatterns #-}
{-# language MagicHash #-}
{-# language TypeApplications #-}

{-# OPTIONS_GHC -O2 -fforce-recomp #-}

module TestIpReserved
  ( reserved#
  ) where

import GHC.Exts
import GHC.Word
import Data.Bits

-- On 32-bit platforms, returns 1 on success. On 64-bit platforms,
-- returns 2 on success. On all platforms, returns 0 if the address
-- is not reserved.
reserved# :: Word# -> Word#
{-# noinline reserved# #-}
reserved# w# =
  let w = W# w#
      a = fromOctets 0 0 0 0
      b = fromOctets 100 64 0 0
      c = fromOctets 127 0 0 0
      d = fromOctets 169 254 0 0
      e = fromOctets 192 0 0 0
      f = fromOctets 192 0 2 0
      g = fromOctets 192 88 99 0
      h = fromOctets 198 18 0 0
      i = fromOctets 198 51 100 0
      j = fromOctets 203 0 113 0
      k = fromOctets 224 0 0 0
      l = fromOctets 240 0 0 0
      !(W# r) = flip unsafeShiftR 5 $ fromIntegral @Int @Word $
            countTrailingZeros ((fromIntegral @Word32 @Word mask4  .&. w) `xor` fromIntegral @Word32 @Word k)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask4  .&. w) `xor` fromIntegral @Word32 @Word l)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask8  .&. w) `xor` fromIntegral @Word32 @Word p24)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask8  .&. w) `xor` fromIntegral @Word32 @Word a)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask8  .&. w) `xor` fromIntegral @Word32 @Word c)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask10 .&. w) `xor` fromIntegral @Word32 @Word b)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask12 .&. w) `xor` fromIntegral @Word32 @Word p20)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask15 .&. w) `xor` fromIntegral @Word32 @Word h)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask16 .&. w) `xor` fromIntegral @Word32 @Word d)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask16 .&. w) `xor` fromIntegral @Word32 @Word p16)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask24 .&. w) `xor` fromIntegral @Word32 @Word e)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask24 .&. w) `xor` fromIntegral @Word32 @Word f)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask24 .&. w) `xor` fromIntegral @Word32 @Word g)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask24 .&. w) `xor` fromIntegral @Word32 @Word i)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask24 .&. w) `xor` fromIntegral @Word32 @Word j)
   in r

mask8,mask4,mask12,mask16,mask10,mask24,mask15 :: Word32
mask4  = 0xF0000000
mask8  = 0xFF000000
mask10 = 0xFFC00000
mask12 = 0xFFF00000
mask15 = 0xFFFE0000
mask16 = 0xFFFF0000
mask24 = 0xFFFFFF00

p24 :: Word32
p24 = fromOctets 10 0 0 0

p20 :: Word32
p20 = fromOctets 172 16 0 0

p16 :: Word32
p16 = fromOctets 192 168 0 0

fromOctets :: Word -> Word -> Word -> Word -> Word32
fromOctets a b c d = fromIntegral
    ( shiftL a 24
  .|. shiftL b 16
  .|. shiftL c 8
  .|. d
    )

The cmm we get from this looks just like I would expect it to:

[TestIpReserved.reserved#_entry() //  [R2]
         { info_tbls: [(c3jm,
                        label: TestIpReserved.reserved#_info
                        rep: HeapRep static { Fun {arity: 1 fun_type: ArgSpec 4} }
                        srt: Nothing)]
           stack_info: arg_space: 8 updfr_space: Just 8
         }
     {offset
       c3jm: // global
           (_c3jr::I64) = call MO_Ctz W64(R2 & 4294967040 ^ 3405803776);
           (_c3jE::I64) = call MO_Ctz W64(R2 & 4294967040 ^ 3325256704);
           (_c3jR::I64) = call MO_Ctz W64(R2 & 4294967040 ^ 3227017984);
           (_c3k4::I64) = call MO_Ctz W64(R2 & 4294967040 ^ 3221225984);
           (_c3kh::I64) = call MO_Ctz W64(R2 & 4294967040 ^ 3221225472);
           (_c3ku::I64) = call MO_Ctz W64(R2 & 4294901760 ^ 3232235520);
           (_c3kH::I64) = call MO_Ctz W64(R2 & 4294901760 ^ 2851995648);
           (_c3kU::I64) = call MO_Ctz W64(R2 & 4294836224 ^ 3323068416);
           (_c3l7::I64) = call MO_Ctz W64(R2 & 4293918720 ^ 2886729728);
           (_c3lk::I64) = call MO_Ctz W64(R2 & 4290772992 ^ 1681915904);
           (_c3lx::I64) = call MO_Ctz W64(R2 & 4278190080 ^ 2130706432);
           (_c3lH::I64) = call MO_Ctz W64(R2 & 4278190080);
           (_c3lU::I64) = call MO_Ctz W64(R2 & 4278190080 ^ 167772160);
           (_c3m7::I64) = call MO_Ctz W64(R2 & 4026531840 ^ 4026531840);
           (_c3mk::I64) = call MO_Ctz W64(R2 & 4026531840 ^ 3758096384);
           R1 = _c3mk::I64 | _c3m7::I64 | _c3lU::I64 | _c3lH::I64 | _c3lx::I64 | _c3lk::I64 | _c3l7::I64 | _c3kU::I64 | _c3kH::I64 | _c3ku::I64 | _c3kh::I64 | _c3k4::I64 | _c3jR::I64 | _c3jE::I64 | _c3jr::I64 >> 5;
           call (P64[Sp])(R1) args: 8, res: 0, upd: 8;
     }
 },
 section ""data" . TestIpReserved.reserved#_closure" {
     TestIpReserved.reserved#_closure:
         const TestIpReserved.reserved#_info;
 }]

However, the assembly is horrible. It uses a bunch of registers and then spills onto the stack. This is truncated after the stack spilling become apparent:

.globl TestIpReserved.reserved#_info
.type TestIpReserved.reserved#_info, @function
TestIpReserved.reserved#_info:
_c3jm:
        movl $3405803776,%eax
        movl $4294967040,%ebx
        movq %r14,%rcx
        andq %rbx,%rcx
        xorq %rax,%rcx
        bsf %rcx,%rax
        movl $64,%ebx
        cmovne %rax,%rbx
        movl $3325256704,%eax
        movl $4294967040,%ecx
        movq %r14,%rdx
        andq %rcx,%rdx
        xorq %rax,%rdx
        bsf %rdx,%rax
        movl $64,%ecx
        cmovne %rax,%rcx
        movl $3227017984,%eax
        movl $4294967040,%edx
        movq %r14,%rsi
        andq %rdx,%rsi
        xorq %rax,%rsi
        bsf %rsi,%rax
        ...
        movl $64,%r10d
        cmovne %rax,%r10
        movl $2886729728,%eax
        movl $4293918720,%r11d
        movq %rbx,64(%rsp)
        movq %r14,%rbx
        andq %r11,%rbx
        xorq %rax,%rbx
        bsf %rbx,%rax
        movl $64,%ebx
        cmovne %rax,%rbx
        movl $4290772992,%eax
        movq %r14,%r11
        andq %rax,%r11
        xorq $1681915904,%r11
        bsf %r11,%rax
        movl $64,%r11d
        cmovne %rax,%r11
        movl $4278190080,%eax
        movq %rcx,72(%rsp)
        movq %r14,%rcx
        andq %rax,%rcx
        xorq $2130706432,%rcx
        bsf %rcx,%rax
        movl $64,%ecx
        cmovne %rax,%rcx
        movl $4278190080,%eax
        movq %rdx,80(%rsp)
        movq %r14,%rdx
        andq %rax,%rdx
        bsf %rdx,%rax

Expected Behavior

I don't expect stack spilling to happen.

Environment

GHC version used: 8.8.1 and 8.10.1rc1

Optional:

Operating System: Ubuntu 18.04
System Architecture: x86_64

Edited Jan 08, 2020 by Ben Gamari

To upload designs, you'll need to enable LFS and have an admin enable hashed storage. More information

Admin message

NCG spills to stack when performing trivial bit arithmetic

Summary

Steps to reproduce

Expected Behavior

Environment