Skip to content

NCG spills to stack when performing trivial bit arithmetic

Summary

This is inspired by the results in https://github.com/andrewthad/haskell-ip/pull/61. The problem is that NCG spills onto the stack. I don't know if the best place to fix this is in cmm or in the NCG.

Steps to reproduce

Here is a minimal version of the code:

{-# language BangPatterns #-}
{-# language MagicHash #-}
{-# language TypeApplications #-}

{-# OPTIONS_GHC -O2 -fforce-recomp #-}

module TestIpReserved
  ( reserved#
  ) where

import GHC.Exts
import GHC.Word
import Data.Bits

-- On 32-bit platforms, returns 1 on success. On 64-bit platforms,
-- returns 2 on success. On all platforms, returns 0 if the address
-- is not reserved.
reserved# :: Word# -> Word#
{-# noinline reserved# #-}
reserved# w# =
  let w = W# w#
      a = fromOctets 0 0 0 0
      b = fromOctets 100 64 0 0
      c = fromOctets 127 0 0 0
      d = fromOctets 169 254 0 0
      e = fromOctets 192 0 0 0
      f = fromOctets 192 0 2 0
      g = fromOctets 192 88 99 0
      h = fromOctets 198 18 0 0
      i = fromOctets 198 51 100 0
      j = fromOctets 203 0 113 0
      k = fromOctets 224 0 0 0
      l = fromOctets 240 0 0 0
      !(W# r) = flip unsafeShiftR 5 $ fromIntegral @Int @Word $
            countTrailingZeros ((fromIntegral @Word32 @Word mask4  .&. w) `xor` fromIntegral @Word32 @Word k)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask4  .&. w) `xor` fromIntegral @Word32 @Word l)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask8  .&. w) `xor` fromIntegral @Word32 @Word p24)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask8  .&. w) `xor` fromIntegral @Word32 @Word a)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask8  .&. w) `xor` fromIntegral @Word32 @Word c)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask10 .&. w) `xor` fromIntegral @Word32 @Word b)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask12 .&. w) `xor` fromIntegral @Word32 @Word p20)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask15 .&. w) `xor` fromIntegral @Word32 @Word h)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask16 .&. w) `xor` fromIntegral @Word32 @Word d)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask16 .&. w) `xor` fromIntegral @Word32 @Word p16)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask24 .&. w) `xor` fromIntegral @Word32 @Word e)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask24 .&. w) `xor` fromIntegral @Word32 @Word f)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask24 .&. w) `xor` fromIntegral @Word32 @Word g)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask24 .&. w) `xor` fromIntegral @Word32 @Word i)
        .|. countTrailingZeros ((fromIntegral @Word32 @Word mask24 .&. w) `xor` fromIntegral @Word32 @Word j)
   in r

mask8,mask4,mask12,mask16,mask10,mask24,mask15 :: Word32
mask4  = 0xF0000000
mask8  = 0xFF000000
mask10 = 0xFFC00000
mask12 = 0xFFF00000
mask15 = 0xFFFE0000
mask16 = 0xFFFF0000
mask24 = 0xFFFFFF00

p24 :: Word32
p24 = fromOctets 10 0 0 0

p20 :: Word32
p20 = fromOctets 172 16 0 0

p16 :: Word32
p16 = fromOctets 192 168 0 0

fromOctets :: Word -> Word -> Word -> Word -> Word32
fromOctets a b c d = fromIntegral
    ( shiftL a 24
  .|. shiftL b 16
  .|. shiftL c 8
  .|. d
    )

The cmm we get from this looks just like I would expect it to:

[TestIpReserved.reserved#_entry() //  [R2]
         { info_tbls: [(c3jm,
                        label: TestIpReserved.reserved#_info
                        rep: HeapRep static { Fun {arity: 1 fun_type: ArgSpec 4} }
                        srt: Nothing)]
           stack_info: arg_space: 8 updfr_space: Just 8
         }
     {offset
       c3jm: // global
           (_c3jr::I64) = call MO_Ctz W64(R2 & 4294967040 ^ 3405803776);
           (_c3jE::I64) = call MO_Ctz W64(R2 & 4294967040 ^ 3325256704);
           (_c3jR::I64) = call MO_Ctz W64(R2 & 4294967040 ^ 3227017984);
           (_c3k4::I64) = call MO_Ctz W64(R2 & 4294967040 ^ 3221225984);
           (_c3kh::I64) = call MO_Ctz W64(R2 & 4294967040 ^ 3221225472);
           (_c3ku::I64) = call MO_Ctz W64(R2 & 4294901760 ^ 3232235520);
           (_c3kH::I64) = call MO_Ctz W64(R2 & 4294901760 ^ 2851995648);
           (_c3kU::I64) = call MO_Ctz W64(R2 & 4294836224 ^ 3323068416);
           (_c3l7::I64) = call MO_Ctz W64(R2 & 4293918720 ^ 2886729728);
           (_c3lk::I64) = call MO_Ctz W64(R2 & 4290772992 ^ 1681915904);
           (_c3lx::I64) = call MO_Ctz W64(R2 & 4278190080 ^ 2130706432);
           (_c3lH::I64) = call MO_Ctz W64(R2 & 4278190080);
           (_c3lU::I64) = call MO_Ctz W64(R2 & 4278190080 ^ 167772160);
           (_c3m7::I64) = call MO_Ctz W64(R2 & 4026531840 ^ 4026531840);
           (_c3mk::I64) = call MO_Ctz W64(R2 & 4026531840 ^ 3758096384);
           R1 = _c3mk::I64 | _c3m7::I64 | _c3lU::I64 | _c3lH::I64 | _c3lx::I64 | _c3lk::I64 | _c3l7::I64 | _c3kU::I64 | _c3kH::I64 | _c3ku::I64 | _c3kh::I64 | _c3k4::I64 | _c3jR::I64 | _c3jE::I64 | _c3jr::I64 >> 5;
           call (P64[Sp])(R1) args: 8, res: 0, upd: 8;
     }
 },
 section ""data" . TestIpReserved.reserved#_closure" {
     TestIpReserved.reserved#_closure:
         const TestIpReserved.reserved#_info;
 }]

However, the assembly is horrible. It uses a bunch of registers and then spills onto the stack. This is truncated after the stack spilling become apparent:

.globl TestIpReserved.reserved#_info
.type TestIpReserved.reserved#_info, @function
TestIpReserved.reserved#_info:
_c3jm:
        movl $3405803776,%eax
        movl $4294967040,%ebx
        movq %r14,%rcx
        andq %rbx,%rcx
        xorq %rax,%rcx
        bsf %rcx,%rax
        movl $64,%ebx
        cmovne %rax,%rbx
        movl $3325256704,%eax
        movl $4294967040,%ecx
        movq %r14,%rdx
        andq %rcx,%rdx
        xorq %rax,%rdx
        bsf %rdx,%rax
        movl $64,%ecx
        cmovne %rax,%rcx
        movl $3227017984,%eax
        movl $4294967040,%edx
        movq %r14,%rsi
        andq %rdx,%rsi
        xorq %rax,%rsi
        bsf %rsi,%rax
        ...
        movl $64,%r10d
        cmovne %rax,%r10
        movl $2886729728,%eax
        movl $4293918720,%r11d
        movq %rbx,64(%rsp)
        movq %r14,%rbx
        andq %r11,%rbx
        xorq %rax,%rbx
        bsf %rbx,%rax
        movl $64,%ebx
        cmovne %rax,%rbx
        movl $4290772992,%eax
        movq %r14,%r11
        andq %rax,%r11
        xorq $1681915904,%r11
        bsf %r11,%rax
        movl $64,%r11d
        cmovne %rax,%r11
        movl $4278190080,%eax
        movq %rcx,72(%rsp)
        movq %r14,%rcx
        andq %rax,%rcx
        xorq $2130706432,%rcx
        bsf %rcx,%rax
        movl $64,%ecx
        cmovne %rax,%rcx
        movl $4278190080,%eax
        movq %rdx,80(%rsp)
        movq %r14,%rdx
        andq %rax,%rdx
        bsf %rdx,%rax

Expected Behavior

I don't expect stack spilling to happen.

Environment

  • GHC version used: 8.8.1 and 8.10.1rc1

Optional:

  • Operating System: Ubuntu 18.04
  • System Architecture: x86_64
Edited by Ben Gamari
To upload designs, you'll need to enable LFS and have an admin enable hashed storage. More information