From fa576eb8a55db8e216eed3e7ea8807121b3496f5 Mon Sep 17 00:00:00 2001 From: Andreas Klebinger <klebinger.andreas@gmx.at> Date: Fri, 10 Nov 2023 02:22:32 +0100 Subject: [PATCH] Fix FMA primops generating broken assembly on x86. `genFMA3Code` assumed that we had to take extra precations to avoid overwriting the result of `getNonClobberedReg`. One of these special cases caused a bug resulting in broken assembly. I believe we don't need to hadle these cases specially at all, which means this MR simply deletes the special cases to fix the bug. Fixes #24160 --- compiler/GHC/CmmToAsm/X86/CodeGen.hs | 23 ------------------- compiler/GHC/CmmToAsm/X86/Instr.hs | 6 +++-- .../primops/should_compile/T24160_FMA.hs | 8 +++++++ testsuite/tests/primops/should_compile/all.T | 5 ++++ 4 files changed, 17 insertions(+), 25 deletions(-) create mode 100644 testsuite/tests/primops/should_compile/T24160_FMA.hs diff --git a/compiler/GHC/CmmToAsm/X86/CodeGen.hs b/compiler/GHC/CmmToAsm/X86/CodeGen.hs index 9b0e0b741ff0..1e28a573329b 100644 --- a/compiler/GHC/CmmToAsm/X86/CodeGen.hs +++ b/compiler/GHC/CmmToAsm/X86/CodeGen.hs @@ -3448,32 +3448,9 @@ genFMA3Code w signs x y z = do (y_reg, y_code) <- getNonClobberedReg y (z_reg, z_code) <- getNonClobberedReg z x_code <- getAnyReg x - y_tmp <- getNewRegNat rep - z_tmp <- getNewRegNat rep let fma213 = FMA3 rep signs FMA213 code dst - | dst == y_reg - , dst == z_reg - = y_code `appOL` - unitOL (MOV rep (OpReg y_reg) (OpReg y_tmp)) `appOL` - z_code `appOL` - unitOL (MOV rep (OpReg z_reg) (OpReg z_tmp)) `appOL` - x_code dst `snocOL` - fma213 (OpReg z_tmp) y_tmp dst - | dst == y_reg - = y_code `appOL` - unitOL (MOV rep (OpReg y_reg) (OpReg z_tmp)) `appOL` - z_code `appOL` - x_code dst `snocOL` - fma213 (OpReg z_reg) y_tmp dst - | dst == z_reg - = y_code `appOL` - z_code `appOL` - unitOL (MOV rep (OpReg z_reg) (OpReg z_tmp)) `appOL` - x_code dst `snocOL` - fma213 (OpReg z_tmp) y_reg dst - | otherwise = y_code `appOL` z_code `appOL` x_code dst `snocOL` diff --git a/compiler/GHC/CmmToAsm/X86/Instr.hs b/compiler/GHC/CmmToAsm/X86/Instr.hs index 98570b0a45b1..5e43836bcc0f 100644 --- a/compiler/GHC/CmmToAsm/X86/Instr.hs +++ b/compiler/GHC/CmmToAsm/X86/Instr.hs @@ -277,7 +277,8 @@ data Instr -- | FMA3 fused multiply-add operations. | FMA3 Format FMASign FMAPermutation Operand Reg Reg - -- src1 (r/m), src2 (r), dst (r) + -- src3 (r/m), src2 (r), dst/src1 (r) + -- The is exactly reversed from how intel lists the arguments. -- use ADD, SUB, and SQRT for arithmetic. In both cases, operands -- are Operand Reg. @@ -358,6 +359,7 @@ data Operand | OpImm Imm -- immediate value | OpAddr AddrMode -- memory reference +-- NB: As of 2023 we only use the FMA213 permutation. data FMAPermutation = FMA132 | FMA213 | FMA231 -- | Returns which registers are read and written as a (read, written) @@ -447,7 +449,7 @@ regUsageOfInstr platform instr PDEP _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst] PEXT _ src mask dst -> mkRU (use_R src $ use_R mask []) [dst] - FMA3 _ _ _ src1 src2 dst -> usageFMA src1 src2 dst + FMA3 _ _ _ src3 src2 dst -> usageFMA src3 src2 dst -- note: might be a better way to do this PREFETCH _ _ src -> mkRU (use_R src []) [] diff --git a/testsuite/tests/primops/should_compile/T24160_FMA.hs b/testsuite/tests/primops/should_compile/T24160_FMA.hs new file mode 100644 index 000000000000..632e8df24222 --- /dev/null +++ b/testsuite/tests/primops/should_compile/T24160_FMA.hs @@ -0,0 +1,8 @@ +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE UnboxedTuples #-} +module TwoProdFMA where +import GHC.Exts + +twoProductFloat# :: Float# -> Float# -> (# Float#, Float# #) +twoProductFloat# x y = let !r = x `timesFloat#` y + in (# r, fmsubFloat# x y r #) diff --git a/testsuite/tests/primops/should_compile/all.T b/testsuite/tests/primops/should_compile/all.T index 9ba0fe40e8a5..a696259050be 100644 --- a/testsuite/tests/primops/should_compile/all.T +++ b/testsuite/tests/primops/should_compile/all.T @@ -8,3 +8,8 @@ test('UnliftedStableName', normal, compile, ['']) test('KeepAliveWrapper', normal, compile, ['-O']) test('T22152', normal, compile, ['']) test('T22152b', normal, compile, ['']) +test('T24160_FMA' + , [ js_skip # JS backend doesn't have an FMA implementation ] + # , omit_ghci # fails during compilation phase, remove after !10563 + ] + , compile, ['-O']) \ No newline at end of file -- GitLab