From 08026bcc8d68d5c7851879f7d6943b85dce0cbdf Mon Sep 17 00:00:00 2001 From: Benjamin Maurer <maurer.benjamin@gmail.com> Date: Fri, 4 Nov 2022 09:47:18 +0100 Subject: [PATCH] Generate assembly on x86 for word2float. We used to emit C function call for MO_UF_Conv primitive. Now emits direct assembly instead. --- compiler/GHC/CmmToAsm/X86/CodeGen.hs | 87 +++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 3 deletions(-) diff --git a/compiler/GHC/CmmToAsm/X86/CodeGen.hs b/compiler/GHC/CmmToAsm/X86/CodeGen.hs index d6ef821c9f7b..7a31187fd2e2 100644 --- a/compiler/GHC/CmmToAsm/X86/CodeGen.hs +++ b/compiler/GHC/CmmToAsm/X86/CodeGen.hs @@ -3923,9 +3923,90 @@ genClz bid width dst src = do -- took care of implicitly clearing the upper bits genWordToFloat :: BlockId -> Width -> CmmFormal -> CmmActual -> NatM InstrBlock -genWordToFloat bid width dst src = - -- TODO: generate assembly instead - genPrimCCall bid (word2FloatLabel width) [dst] [src] +genWordToFloat bid width dst src = do + is32Bit <- is32BitPlatform + platform <- getPlatform + + let srcFormat = intFormat $ cmmExprWidth platform src + let dst_r = getLocalRegReg dst + let conv = case width of + W64 -> CVTSI2SD + W32 -> CVTSI2SS + _ -> pprPanic "genWordToFloat: unsupported width" (ppr width) + let dstFormat = floatFormat width + + code_src <- getAnyReg src + src_r <- getNewRegNat srcFormat + + if is32Bit + then case (srcFormat, width) of + (II32, W64) -> do + -- CVTSI2S* works on signed ints, so large 32-bit uints look negative. + -- So we flip the MSB by adding an immediate and add the same nominal + -- value to the resulting float. + Amode amode amode_code <- memConstant (mkAlignment $ widthInBytes width) (CmmInt 0x41E0000000000000 width) + return $ (code_src src_r) `appOL` amode_code `appOL` toOL + [ ADD srcFormat (OpImm $ ImmInteger 0x80000000) (OpReg src_r) -- -2147483648 + , XOR dstFormat (OpReg dst_r) (OpReg dst_r) + , conv srcFormat (OpReg src_r) dst_r + , ADD dstFormat (OpAddr amode) (OpReg dst_r) -- +2147483648.0 + ] + (II32, W32) -> do + -- For unsigned 32-Bit int to 32-bit float, we convert low and + -- high 16 bits separately. + xmmTmp <- getNewRegNat dstFormat + intTmp <- getNewRegNat srcFormat + Amode amode amode_code <- memConstant (mkAlignment $ widthInBytes width) (CmmInt 0x47800000 width) + return $ (code_src src_r) `appOL` amode_code `appOL` toOL + [ XOR dstFormat (OpReg dst_r) (OpReg dst_r) + , MOVZxL II16 (OpReg src_r) (OpReg intTmp) + , SHR srcFormat (OpImm $ ImmInt 16) (OpReg src_r) + , conv srcFormat (OpReg src_r) dst_r + , MUL dstFormat (OpAddr amode) (OpReg dst_r) + , XOR dstFormat (OpReg xmmTmp) (OpReg xmmTmp) + , conv srcFormat (OpReg intTmp) xmmTmp + , ADD dstFormat (OpReg xmmTmp) (OpReg dst_r) + ] + _ -> panic ("genWordToFloat: unsupported source operand format: " ++ show srcFormat) + else do + tmp <- getNewRegNat srcFormat + + lblLarge <- getBlockIdNat + lblSmall <- getBlockIdNat + lblAfter <- getBlockIdNat + + addImmediateSuccessorNat bid lblSmall + addImmediateSuccessorNat bid lblLarge + addImmediateSuccessorNat lblSmall lblAfter + addImmediateSuccessorNat lblLarge lblAfter + updateCfgNat ( addWeightEdge bid lblSmall 100 + . addWeightEdge bid lblLarge 50 ) + + -- For small values, that don't look like negative numbers, + -- we can simply convert directly. + -- Otherwise we divide by 2, convert, then double again + -- (with special handling for uneven numbers). + return $ appOL (code_src src_r) + $ toOL + [ TEST srcFormat (OpReg src_r) (OpReg src_r) + , JXX NEG lblLarge + -- Adding this label to allow optimizations to either invert condition or just eliminate + , JXX ALWAYS lblSmall + , NEWBLOCK lblSmall + , XOR dstFormat (OpReg dst_r) (OpReg dst_r) + , conv srcFormat (OpReg src_r) dst_r + , JXX ALWAYS lblAfter + , NEWBLOCK lblLarge + , MOV srcFormat (OpReg src_r) (OpReg tmp) + , SHR srcFormat (OpImm $ ImmInt 1) (OpReg tmp) + , AND srcFormat (OpImm $ ImmInt 1) (OpReg src_r) + , OR srcFormat (OpReg src_r) (OpReg tmp) + , XOR dstFormat (OpReg dst_r) (OpReg dst_r) + , conv srcFormat (OpReg tmp) dst_r + , ADD dstFormat (OpReg dst_r) (OpReg dst_r) + , JXX ALWAYS lblAfter + , NEWBLOCK lblAfter + ] genAtomicRead :: Width -> MemoryOrdering -> LocalReg -> CmmExpr -> NatM InstrBlock genAtomicRead width _mord dst addr = do -- GitLab