From 08026bcc8d68d5c7851879f7d6943b85dce0cbdf Mon Sep 17 00:00:00 2001
From: Benjamin Maurer <maurer.benjamin@gmail.com>
Date: Fri, 4 Nov 2022 09:47:18 +0100
Subject: [PATCH] Generate assembly on x86 for word2float.

We used to emit C function call for MO_UF_Conv primitive.
Now emits direct assembly instead.
---
 compiler/GHC/CmmToAsm/X86/CodeGen.hs | 87 +++++++++++++++++++++++++++-
 1 file changed, 84 insertions(+), 3 deletions(-)

diff --git a/compiler/GHC/CmmToAsm/X86/CodeGen.hs b/compiler/GHC/CmmToAsm/X86/CodeGen.hs
index d6ef821c9f7b..7a31187fd2e2 100644
--- a/compiler/GHC/CmmToAsm/X86/CodeGen.hs
+++ b/compiler/GHC/CmmToAsm/X86/CodeGen.hs
@@ -3923,9 +3923,90 @@ genClz bid width dst src = do
                        -- took care of implicitly clearing the upper bits
 
 genWordToFloat :: BlockId -> Width -> CmmFormal -> CmmActual -> NatM InstrBlock
-genWordToFloat bid width dst src =
-  -- TODO: generate assembly instead
-  genPrimCCall bid (word2FloatLabel width) [dst] [src]
+genWordToFloat bid width dst src = do
+  is32Bit <- is32BitPlatform
+  platform <- getPlatform
+
+  let srcFormat = intFormat $ cmmExprWidth platform src
+  let dst_r = getLocalRegReg dst
+  let conv = case width of
+              W64 -> CVTSI2SD
+              W32 -> CVTSI2SS
+              _ -> pprPanic "genWordToFloat: unsupported width" (ppr width)
+  let dstFormat = floatFormat width
+
+  code_src  <- getAnyReg src
+  src_r     <- getNewRegNat srcFormat
+
+  if is32Bit
+    then case (srcFormat, width) of
+      (II32, W64) -> do
+        -- CVTSI2S* works on signed ints, so large 32-bit uints look negative.
+        -- So we flip the MSB by adding an immediate and add the same nominal
+        -- value to the resulting float.
+        Amode amode amode_code <- memConstant (mkAlignment $ widthInBytes width) (CmmInt 0x41E0000000000000 width)
+        return $ (code_src src_r) `appOL` amode_code `appOL` toOL
+          [ ADD srcFormat (OpImm $ ImmInteger 0x80000000) (OpReg src_r) -- -2147483648
+          , XOR dstFormat (OpReg dst_r) (OpReg dst_r)
+          , conv srcFormat (OpReg src_r) dst_r
+          , ADD dstFormat (OpAddr amode) (OpReg dst_r) -- +2147483648.0
+          ]
+      (II32, W32) -> do
+        -- For unsigned 32-Bit int to 32-bit float, we convert low and
+        -- high 16 bits separately.
+        xmmTmp  <- getNewRegNat dstFormat
+        intTmp  <- getNewRegNat srcFormat
+        Amode amode amode_code <- memConstant (mkAlignment $ widthInBytes width) (CmmInt 0x47800000 width)
+        return $ (code_src src_r) `appOL` amode_code `appOL` toOL
+          [ XOR dstFormat (OpReg dst_r) (OpReg dst_r)
+          , MOVZxL II16 (OpReg src_r) (OpReg intTmp)
+          , SHR srcFormat (OpImm $ ImmInt 16) (OpReg src_r)
+          , conv srcFormat (OpReg src_r) dst_r
+          , MUL dstFormat (OpAddr amode) (OpReg dst_r)
+          , XOR dstFormat (OpReg xmmTmp) (OpReg xmmTmp)
+          , conv srcFormat (OpReg intTmp) xmmTmp
+          , ADD dstFormat (OpReg xmmTmp) (OpReg dst_r)
+          ]
+      _           -> panic ("genWordToFloat: unsupported source operand format: " ++ show srcFormat)
+    else do
+      tmp       <- getNewRegNat srcFormat
+
+      lblLarge  <- getBlockIdNat
+      lblSmall  <- getBlockIdNat
+      lblAfter  <- getBlockIdNat
+
+      addImmediateSuccessorNat bid lblSmall
+      addImmediateSuccessorNat bid lblLarge
+      addImmediateSuccessorNat lblSmall lblAfter
+      addImmediateSuccessorNat lblLarge lblAfter
+      updateCfgNat ( addWeightEdge bid lblSmall 100
+                  . addWeightEdge bid lblLarge 50 )
+
+      -- For small values, that don't look like negative numbers,
+      -- we can simply convert directly.
+      -- Otherwise we divide by 2, convert, then double again
+      -- (with special handling for uneven numbers).
+      return $ appOL (code_src src_r)
+        $ toOL
+        [ TEST srcFormat (OpReg src_r) (OpReg src_r)
+        , JXX NEG lblLarge
+        -- Adding this label to allow optimizations to either invert condition or just eliminate
+        , JXX ALWAYS lblSmall
+        , NEWBLOCK lblSmall
+        , XOR dstFormat (OpReg dst_r) (OpReg dst_r)
+        , conv srcFormat (OpReg src_r) dst_r
+        , JXX ALWAYS lblAfter
+        , NEWBLOCK lblLarge
+        , MOV srcFormat (OpReg src_r) (OpReg tmp)
+        , SHR srcFormat (OpImm $ ImmInt 1) (OpReg tmp)
+        , AND srcFormat (OpImm $ ImmInt 1) (OpReg src_r)
+        , OR srcFormat (OpReg src_r) (OpReg tmp)
+        , XOR dstFormat (OpReg dst_r) (OpReg dst_r)
+        , conv srcFormat (OpReg tmp) dst_r
+        , ADD dstFormat (OpReg dst_r) (OpReg dst_r)
+        , JXX ALWAYS lblAfter
+        , NEWBLOCK lblAfter
+        ]
 
 genAtomicRead :: Width -> MemoryOrdering -> LocalReg -> CmmExpr -> NatM InstrBlock
 genAtomicRead width _mord dst addr = do
-- 
GitLab