    Implement optimized NCG `MO_Ctz W64` op for i386 (#9340) · 612f3d12
    Herbert Valerio Riedel authored
    This is an optimization to the CTZ primops introduced for #9340
    Previously we called out to `hs_ctz64`, but we can actually generate
    better hand-tuned code while avoiding the FFI ccall.
    With this patch, the code
      {-# LANGUAGE MagicHash #-}
      module TestClz0 where
      import GHC.Prim
      ctz64 :: Word64# -> Word#
      ctz64 x = ctz64# x
    results in the following assembler generated by NCG on i386:
          movl (%ebp),%eax
          movl 4(%ebp),%ecx
          movl %ecx,%edx
          orl %eax,%edx
          movl $64,%edx
          je _nAO
          bsf %ecx,%ecx
          addl $32,%ecx
          bsf %eax,%eax
          cmovne %eax,%ecx
          movl %ecx,%edx
          movl %edx,%esi
          addl $8,%ebp
          jmp *(%ebp)
    For comparison, here's what LLVM 3.4 currently generates:
      000000fc <TestClzz_ctzz64_info>:
        fc:   0f bc 45 04             bsf    0x4(%ebp),%eax
       100:   b9 20 00 00 00          mov    $0x20,%ecx
       105:   0f 45 c8                cmovne %eax,%ecx
       108:   83 c1 20                add    $0x20,%ecx
       10b:   8b 45 00                mov    0x0(%ebp),%eax
       10e:   8b 55 08                mov    0x8(%ebp),%edx
       111:   0f bc f0                bsf    %eax,%esi
       114:   85 c0                   test   %eax,%eax
       116:   0f 44 f1                cmove  %ecx,%esi
       119:   83 c5 08                add    $0x8,%ebp
       11c:   ff e2                   jmp    *%edx
