Commit 335b9f36 authored by Simon Marlow's avatar Simon Marlow
Browse files

Implement SSE2 floating-point support in the x86 native code generator (#594)

The new flag -msse2 enables code generation for SSE2 on x86.  It
results in substantially faster floating-point performance; the main
reason for doing this was that our x87 code generation is appallingly
bad, and since we plan to drop -fvia-C soon, we need a way to generate
half-decent floating-point code.

The catch is that SSE2 is only available on CPUs that support it (P4+,
AMD K8+).  We'll have to think hard about whether we should enable it
by default for the libraries we ship.  In the meantime, at least
-msse2 should be an acceptable replacement for "-fvia-C
-optc-ffast-math -fexcess-precision".

SSE2 also has the advantage of performing all operations at the
correct precision, so floating-point results are consistent with other
platforms.

I also tweaked the x87 code generation a bit while I was here, now
it's slighlty less bad than before.
parent d9f71774
......@@ -312,6 +312,7 @@ data DynFlag
| Opt_EmitExternalCore
| Opt_SharedImplib
| Opt_BuildingCabalPackage
| Opt_SSE2
-- temporary flags
| Opt_RunCPS
......@@ -1265,6 +1266,9 @@ dynamic_flags = [
, Flag "monly-4-regs" (NoArg (upd (\s -> s{stolen_x86_regs = 4}) ))
Supported
, Flag "msse2" (NoArg (setDynFlag Opt_SSE2))
Supported
------ Warning opts -------------------------------------------------
, Flag "W" (NoArg (mapM_ setDynFlag minusWOpts))
Supported
......
......@@ -171,6 +171,7 @@ pprReg r
RegVirtual (VirtualRegHi u) -> text "%vHi_" <> asmSDoc (pprUnique u)
RegVirtual (VirtualRegF u) -> text "%vF_" <> asmSDoc (pprUnique u)
RegVirtual (VirtualRegD u) -> text "%vD_" <> asmSDoc (pprUnique u)
RegVirtual (VirtualRegSSE u) -> text "%vSSE_" <> asmSDoc (pprUnique u)
where
#if darwin_TARGET_OS
ppr_reg_no :: Int -> Doc
......
......@@ -87,25 +87,15 @@ virtualRegSqueeze cls vr
-> case vr of
VirtualRegI{} -> _ILIT(1)
VirtualRegHi{} -> _ILIT(1)
VirtualRegD{} -> _ILIT(0)
VirtualRegF{} -> _ILIT(0)
-- We don't use floats on this arch, but we can't
-- return error because the return type is unboxed...
RcFloat
-> case vr of
VirtualRegI{} -> _ILIT(0)
VirtualRegHi{} -> _ILIT(0)
VirtualRegD{} -> _ILIT(0)
VirtualRegF{} -> _ILIT(0)
_other -> _ILIT(0)
RcDouble
-> case vr of
VirtualRegI{} -> _ILIT(0)
VirtualRegHi{} -> _ILIT(0)
VirtualRegD{} -> _ILIT(1)
VirtualRegF{} -> _ILIT(0)
_other -> _ILIT(0)
_other -> _ILIT(0)
{-# INLINE realRegSqueeze #-}
realRegSqueeze :: RegClass -> RealReg -> FastInt
......@@ -119,16 +109,6 @@ realRegSqueeze cls rr
RealRegPair{} -> _ILIT(0)
-- We don't use floats on this arch, but we can't
-- return error because the return type is unboxed...
RcFloat
-> case rr of
RealRegSingle regNo
| regNo < 32 -> _ILIT(0)
| otherwise -> _ILIT(0)
RealRegPair{} -> _ILIT(0)
RcDouble
-> case rr of
RealRegSingle regNo
......@@ -137,6 +117,8 @@ realRegSqueeze cls rr
RealRegPair{} -> _ILIT(0)
_other -> _ILIT(0)
mkVirtualReg :: Unique -> Size -> VirtualReg
mkVirtualReg u size
| not (isFloatSize size) = VirtualRegI u
......@@ -152,6 +134,7 @@ regDotColor reg
RcInteger -> Outputable.text "blue"
RcFloat -> Outputable.text "red"
RcDouble -> Outputable.text "green"
RcDoubleSSE -> Outputable.text "yellow"
-- immediates ------------------------------------------------------------------
......
......@@ -55,6 +55,7 @@ data VirtualReg
| VirtualRegHi {-# UNPACK #-} !Unique -- High part of 2-word register
| VirtualRegF {-# UNPACK #-} !Unique
| VirtualRegD {-# UNPACK #-} !Unique
| VirtualRegSSE {-# UNPACK #-} !Unique
deriving (Eq, Show, Ord)
instance Uniquable VirtualReg where
......@@ -64,6 +65,7 @@ instance Uniquable VirtualReg where
VirtualRegHi u -> u
VirtualRegF u -> u
VirtualRegD u -> u
VirtualRegSSE u -> u
instance Outputable VirtualReg where
ppr reg
......@@ -72,6 +74,7 @@ instance Outputable VirtualReg where
VirtualRegHi u -> text "%vHi_" <> pprUnique u
VirtualRegF u -> text "%vF_" <> pprUnique u
VirtualRegD u -> text "%vD_" <> pprUnique u
VirtualRegSSE u -> text "%vSSE_" <> pprUnique u
renameVirtualReg :: Unique -> VirtualReg -> VirtualReg
......@@ -81,6 +84,7 @@ renameVirtualReg u r
VirtualRegHi _ -> VirtualRegHi u
VirtualRegF _ -> VirtualRegF u
VirtualRegD _ -> VirtualRegD u
VirtualRegSSE _ -> VirtualRegSSE u
classOfVirtualReg :: VirtualReg -> RegClass
......@@ -90,6 +94,7 @@ classOfVirtualReg vr
VirtualRegHi{} -> RcInteger
VirtualRegF{} -> RcFloat
VirtualRegD{} -> RcDouble
VirtualRegSSE{} -> RcDoubleSSE
-- Determine the upper-half vreg for a 64-bit quantity on a 32-bit platform
......
......@@ -380,25 +380,13 @@ seqNode node
`seq` (seqVirtualRegList (uniqSetToList (Color.nodeCoalesce node)))
seqVirtualReg :: VirtualReg -> ()
seqVirtualReg reg
= case reg of
VirtualRegI _ -> ()
VirtualRegHi _ -> ()
VirtualRegF _ -> ()
VirtualRegD _ -> ()
seqVirtualReg reg = reg `seq` ()
seqRealReg :: RealReg -> ()
seqRealReg reg
= case reg of
RealRegSingle _ -> ()
RealRegPair _ _ -> ()
seqRealReg reg = reg `seq` ()
seqRegClass :: RegClass -> ()
seqRegClass c
= case c of
RcInteger -> ()
RcFloat -> ()
RcDouble -> ()
seqRegClass c = c `seq` ()
seqMaybeRealReg :: Maybe RealReg -> ()
seqMaybeRealReg mr
......
......@@ -50,24 +50,27 @@ import FastTypes
#define ALLOCATABLE_REGS_INTEGER (_ILIT(3))
#define ALLOCATABLE_REGS_DOUBLE (_ILIT(6))
#define ALLOCATABLE_REGS_FLOAT (_ILIT(0))
#define ALLOCATABLE_REGS_SSE (_ILIT(16))
#elif x86_64_TARGET_ARCH
#define ALLOCATABLE_REGS_INTEGER (_ILIT(5))
#define ALLOCATABLE_REGS_DOUBLE (_ILIT(2))
#define ALLOCATABLE_REGS_DOUBLE (_ILIT(0))
#define ALLOCATABLE_REGS_FLOAT (_ILIT(0))
#define ALLOCATABLE_REGS_SSE (_ILIT(10))
#elif powerpc_TARGET_ARCH
#define ALLOCATABLE_REGS_INTEGER (_ILIT(16))
#define ALLOCATABLE_REGS_DOUBLE (_ILIT(26))
#define ALLOCATABLE_REGS_FLOAT (_ILIT(0))
#define ALLOCATABLE_REGS_SSE (_ILIT(0))
#elif sparc_TARGET_ARCH
#define ALLOCATABLE_REGS_INTEGER (_ILIT(14))
#define ALLOCATABLE_REGS_DOUBLE (_ILIT(11))
#define ALLOCATABLE_REGS_FLOAT (_ILIT(22))
#define ALLOCATABLE_REGS_SSE (_ILIT(0))
#else
......@@ -139,6 +142,17 @@ trivColorable virtualRegSqueeze realRegSqueeze RcDouble conflicts exclusions
= count3 <# ALLOCATABLE_REGS_DOUBLE
trivColorable virtualRegSqueeze realRegSqueeze RcDoubleSSE conflicts exclusions
| count2 <- accSqueeze (_ILIT(0)) ALLOCATABLE_REGS_SSE
(virtualRegSqueeze RcDoubleSSE)
conflicts
, count3 <- accSqueeze count2 ALLOCATABLE_REGS_SSE
(realRegSqueeze RcDoubleSSE)
exclusions
= count3 <# ALLOCATABLE_REGS_SSE
-- Specification Code ----------------------------------------------------------
--
......
......@@ -17,6 +17,7 @@ data RegClass
= RcInteger
| RcFloat
| RcDouble
| RcDoubleSSE -- x86 only: the SSE regs are a separate class
deriving Eq
......@@ -24,8 +25,10 @@ instance Uniquable RegClass where
getUnique RcInteger = mkRegClassUnique 0
getUnique RcFloat = mkRegClassUnique 1
getUnique RcDouble = mkRegClassUnique 2
getUnique RcDoubleSSE = mkRegClassUnique 3
instance Outputable RegClass where
ppr RcInteger = Outputable.text "I"
ppr RcFloat = Outputable.text "F"
ppr RcDouble = Outputable.text "D"
ppr RcDoubleSSE = Outputable.text "S"
......@@ -373,6 +373,7 @@ sparc_mkSpillInstr reg _ slot
RcInteger -> II32
RcFloat -> FF32
RcDouble -> FF64
_ -> panic "sparc_mkSpillInstr"
in ST sz reg (fpRel (negate off_w))
......@@ -391,6 +392,7 @@ sparc_mkLoadInstr reg _ slot
RcInteger -> II32
RcFloat -> FF32
RcDouble -> FF64
_ -> panic "sparc_mkLoadInstr"
in LD sz (fpRel (- off_w)) reg
......@@ -438,6 +440,7 @@ sparc_mkRegRegMoveInstr src dst
RcInteger -> ADD False False src (RIReg g0) dst
RcDouble -> FMOV FF64 src dst
RcFloat -> FMOV FF32 src dst
_ -> panic "sparc_mkRegRegMoveInstr"
| otherwise
= panic "SPARC.Instr.mkRegRegMoveInstr: classes of src and dest not the same"
......
......@@ -156,6 +156,7 @@ pprReg reg
VirtualRegHi u -> text "%vHi_" <> asmSDoc (pprUnique u)
VirtualRegF u -> text "%vF_" <> asmSDoc (pprUnique u)
VirtualRegD u -> text "%vD_" <> asmSDoc (pprUnique u)
VirtualRegSSE u -> text "%vSSE_" <> asmSDoc (pprUnique u)
RegReal rr
-> case rr of
......
......@@ -95,22 +95,21 @@ virtualRegSqueeze cls vr
-> case vr of
VirtualRegI{} -> _ILIT(1)
VirtualRegHi{} -> _ILIT(1)
VirtualRegF{} -> _ILIT(0)
VirtualRegD{} -> _ILIT(0)
_other -> _ILIT(0)
RcFloat
-> case vr of
VirtualRegI{} -> _ILIT(0)
VirtualRegHi{} -> _ILIT(0)
VirtualRegF{} -> _ILIT(1)
VirtualRegD{} -> _ILIT(2)
_other -> _ILIT(0)
RcDouble
-> case vr of
VirtualRegI{} -> _ILIT(0)
VirtualRegHi{} -> _ILIT(0)
VirtualRegF{} -> _ILIT(1)
VirtualRegD{} -> _ILIT(1)
_other -> _ILIT(0)
_other -> _ILIT(0)
{-# INLINE realRegSqueeze #-}
realRegSqueeze :: RegClass -> RealReg -> FastInt
......@@ -141,6 +140,7 @@ realRegSqueeze cls rr
RealRegPair{} -> _ILIT(1)
_other -> _ILIT(0)
-- | All the allocatable registers in the machine,
-- including register pairs.
......@@ -283,7 +283,7 @@ regDotColor reg
= case classOfRealReg reg of
RcInteger -> text "blue"
RcFloat -> text "red"
RcDouble -> text "green"
_other -> text "green"
......
......@@ -71,6 +71,22 @@ import Data.Bits
import Data.Word
import Data.Int
sse2Enabled :: NatM Bool
#if x86_64_TARGET_ARCH
-- SSE2 is fixed on for x86_64. It would be possible to make it optional,
-- but we'd need to fix at least the foreign call code where the calling
-- convention specifies the use of xmm regs, and possibly other places.
sse2Enabled = return True
#else
sse2Enabled = do
dflags <- getDynFlagsNat
return (dopt Opt_SSE2 dflags)
#endif
if_sse2 :: NatM a -> NatM a -> NatM a
if_sse2 sse2 x87 = do
b <- sse2Enabled
if b then sse2 else x87
cmmTopCodeGen
:: DynFlags
......@@ -201,12 +217,15 @@ swizzleRegisterRep (Any _ codefn) size = Any size codefn
-- | Grab the Reg for a CmmReg
getRegisterReg :: CmmReg -> Reg
getRegisterReg :: Bool -> CmmReg -> Reg
getRegisterReg (CmmLocal (LocalReg u pk))
= RegVirtual $ mkVirtualReg u (cmmTypeSize pk)
getRegisterReg use_sse2 (CmmLocal (LocalReg u pk))
= let sz = cmmTypeSize pk in
if isFloatSize sz && not use_sse2
then RegVirtual (mkVirtualReg u FF80)
else RegVirtual (mkVirtualReg u sz)
getRegisterReg (CmmGlobal mid)
getRegisterReg _ (CmmGlobal mid)
= case get_GlobalReg_reg_or_addr mid of
Left reg -> RegReal $ reg
_other -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid)
......@@ -405,8 +424,14 @@ getRegister (CmmReg (CmmGlobal PicBaseReg))
#endif
getRegister (CmmReg reg)
= return (Fixed (cmmTypeSize (cmmRegType reg))
(getRegisterReg reg) nilOL)
= do use_sse2 <- sse2Enabled
let
sz = cmmTypeSize (cmmRegType reg)
size | not use_sse2 && isFloatSize sz = FF80
| otherwise = sz
--
return (Fixed sz (getRegisterReg use_sse2 reg) nilOL)
getRegister tree@(CmmRegOff _ _)
= getRegister (mangleIndexTree tree)
......@@ -437,78 +462,35 @@ getRegister (CmmMachOp (MO_SS_Conv W64 W32) [x]) = do
#endif
#if i386_TARGET_ARCH
getRegister (CmmLit (CmmFloat f W32)) = do
lbl <- getNewLabelNat
dflags <- getDynFlagsNat
dynRef <- cmmMakeDynamicReference dflags addImportNat DataReference lbl
Amode addr addr_code <- getAmode dynRef
let code dst =
LDATA ReadOnlyData
[CmmDataLabel lbl,
CmmStaticLit (CmmFloat f W32)]
`consOL` (addr_code `snocOL`
GLD FF32 addr dst)
-- in
return (Any FF32 code)
getRegister (CmmLit (CmmFloat d W64))
| d == 0.0
= let code dst = unitOL (GLDZ dst)
in return (Any FF64 code)
| d == 1.0
= let code dst = unitOL (GLD1 dst)
in return (Any FF64 code)
| otherwise = do
lbl <- getNewLabelNat
dflags <- getDynFlagsNat
dynRef <- cmmMakeDynamicReference dflags addImportNat DataReference lbl
Amode addr addr_code <- getAmode dynRef
let code dst =
LDATA ReadOnlyData
[CmmDataLabel lbl,
CmmStaticLit (CmmFloat d W64)]
`consOL` (addr_code `snocOL`
GLD FF64 addr dst)
-- in
return (Any FF64 code)
#endif /* i386_TARGET_ARCH */
#if x86_64_TARGET_ARCH
getRegister (CmmLit (CmmFloat 0.0 w)) = do
let size = floatSize w
code dst = unitOL (XOR size (OpReg dst) (OpReg dst))
-- I don't know why there are xorpd, xorps, and pxor instructions.
-- They all appear to do the same thing --SDM
return (Any size code)
getRegister (CmmLit (CmmFloat f w)) = do
lbl <- getNewLabelNat
let code dst = toOL [
LDATA ReadOnlyData
[CmmDataLabel lbl,
CmmStaticLit (CmmFloat f w)],
MOV size (OpAddr (ripRel (ImmCLbl lbl))) (OpReg dst)
]
-- in
return (Any size code)
where size = floatSize w
#endif /* x86_64_TARGET_ARCH */
getRegister (CmmLit lit@(CmmFloat f w)) =
if_sse2 float_const_sse2 float_const_x87
where
float_const_sse2
| f == 0.0 = do
let
size = floatSize w
code dst = unitOL (XOR size (OpReg dst) (OpReg dst))
-- I don't know why there are xorpd, xorps, and pxor instructions.
-- They all appear to do the same thing --SDM
return (Any size code)
| otherwise = do
Amode addr code <- memConstant (widthInBytes w) lit
loadFloatAmode True w addr code
float_const_x87 = case w of
W64
| f == 0.0 ->
let code dst = unitOL (GLDZ dst)
in return (Any FF80 code)
| f == 1.0 ->
let code dst = unitOL (GLD1 dst)
in return (Any FF80 code)
_otherwise -> do
Amode addr code <- memConstant (widthInBytes w) lit
loadFloatAmode False w addr code
-- catch simple cases of zero- or sign-extended load
getRegister (CmmMachOp (MO_UU_Conv W8 W32) [CmmLoad addr _]) = do
......@@ -560,61 +542,20 @@ getRegister (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg),
= return $ Any II64 (\dst -> unitOL $
LEA II64 (OpAddr (ripRel (litToImm displacement))) (OpReg dst))
getRegister (CmmMachOp (MO_F_Neg W32) [x]) = do
x_code <- getAnyReg x
lbl <- getNewLabelNat
let
code dst = x_code dst `appOL` toOL [
-- This is how gcc does it, so it can't be that bad:
LDATA ReadOnlyData16 [
CmmAlign 16,
CmmDataLabel lbl,
CmmStaticLit (CmmInt 0x80000000 W32),
CmmStaticLit (CmmInt 0 W32),
CmmStaticLit (CmmInt 0 W32),
CmmStaticLit (CmmInt 0 W32)
],
XOR FF32 (OpAddr (ripRel (ImmCLbl lbl))) (OpReg dst)
-- xorps, so we need the 128-bit constant
-- ToDo: rip-relative
]
--
return (Any FF32 code)
getRegister (CmmMachOp (MO_F_Neg W64) [x]) = do
x_code <- getAnyReg x
lbl <- getNewLabelNat
let
-- This is how gcc does it, so it can't be that bad:
code dst = x_code dst `appOL` toOL [
LDATA ReadOnlyData16 [
CmmAlign 16,
CmmDataLabel lbl,
CmmStaticLit (CmmInt 0x8000000000000000 W64),
CmmStaticLit (CmmInt 0 W64)
],
-- gcc puts an unpck here. Wonder if we need it.
XOR FF64 (OpAddr (ripRel (ImmCLbl lbl))) (OpReg dst)
-- xorpd, so we need the 128-bit constant
]
--
return (Any FF64 code)
#endif /* x86_64_TARGET_ARCH */
getRegister (CmmMachOp mop [x]) -- unary MachOps
= case mop of
#if i386_TARGET_ARCH
MO_F_Neg W32 -> trivialUFCode FF32 (GNEG FF32) x
MO_F_Neg W64 -> trivialUFCode FF64 (GNEG FF64) x
#endif
getRegister (CmmMachOp mop [x]) = do -- unary MachOps
sse2 <- sse2Enabled
case mop of
MO_F_Neg w
| sse2 -> sse2NegCode w x
| otherwise -> trivialUFCode FF80 (GNEG FF80) x
MO_S_Neg w -> triv_ucode NEGI (intSize w)
MO_F_Neg w -> triv_ucode NEGI (floatSize w)
MO_Not w -> triv_ucode NOT (intSize w)
-- Nop conversions
......@@ -659,13 +600,13 @@ getRegister (CmmMachOp mop [x]) -- unary MachOps
-- the form of a movzl and print it as a movl later.
#endif
#if i386_TARGET_ARCH
MO_FF_Conv W32 W64 -> conversionNop FF64 x
MO_FF_Conv W64 W32 -> conversionNop FF32 x
#else
MO_FF_Conv W32 W64 -> coerceFP2FP W64 x
MO_FF_Conv W64 W32 -> coerceFP2FP W32 x
#endif
MO_FF_Conv W32 W64
| sse2 -> coerceFP2FP W64 x
| otherwise -> conversionNop FF80 x
MO_FF_Conv W64 W32
| sse2 -> coerceFP2FP W32 x
| otherwise -> conversionNop FF80 x
MO_FS_Conv from to -> coerceFP2Int from to x
MO_SF_Conv from to -> coerceInt2FP from to x
......@@ -707,8 +648,9 @@ getRegister (CmmMachOp mop [x]) -- unary MachOps
return (swizzleRegisterRep e_code new_size)
getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps
= case mop of
getRegister e@(CmmMachOp mop [x, y]) = do -- dyadic MachOps
sse2 <- sse2Enabled
case mop of
MO_F_Eq w -> condFltReg EQQ x y
MO_F_Ne w -> condFltReg NE x y
MO_F_Gt w -> condFltReg GTT x y
......@@ -729,19 +671,14 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps
MO_U_Lt rep -> condIntReg LU x y
MO_U_Le rep -> condIntReg LEU x y
#if i386_TARGET_ARCH
MO_F_Add w -> trivialFCode w GADD x y
MO_F_Sub w -> trivialFCode w GSUB x y
MO_F_Quot w -> trivialFCode w GDIV x y
MO_F_Mul w -> trivialFCode w GMUL x y
#endif
#if x86_64_TARGET_ARCH
MO_F_Add w -> trivialFCode w ADD x y
MO_F_Sub w -> trivialFCode w SUB x y
MO_F_Quot w -> trivialFCode w FDIV x y
MO_F_Mul w -> trivialFCode w MUL x y
#endif
MO_F_Add w | sse2 -> trivialFCode_sse2 w ADD x y
| otherwise -> trivialFCode_x87 w GADD x y
MO_F_Sub w | sse2 -> trivialFCode_sse2 w SUB x y
| otherwise -> trivialFCode_x87 w GSUB x y
MO_F_Quot w | sse2 -> trivialFCode_sse2 w FDIV x y
| otherwise -> trivialFCode_x87 w GDIV x y
MO_F_Mul w | sse2 -> trivialFCode_sse2 w MUL x y
| otherwise -> trivialFCode_x87 w GMUL x y
MO_Add rep -> add_code rep x y
MO_Sub rep -> sub_code rep x y
......@@ -892,13 +829,9 @@ getRegister e@(CmmMachOp mop [x, y]) -- dyadic MachOps
getRegister (CmmLoad mem pk)
| isFloatType pk
= do
Amode src mem_code <- getAmode mem
let
size = cmmTypeSize pk
code dst = mem_code `snocOL`
IF_ARCH_i386(GLD size src dst,
MOV size (OpAddr src) (OpReg dst))
return (Any size code)