Skip to content
Commits on Source (3)
  • Joachim Breitner's avatar
    GHC no longer ever defines TABLES_NEXT_TO_CODE on its own · fc3f421b
    Joachim Breitner authored and Marge Bot's avatar Marge Bot committed
    It should be entirely the responsibility of make/Hadrian to ensure that
    everything that needs this flag gets it. GHC shouldn't be hardcoded to
    assist with bootstrapping since it builds other things besides itself.
    
    Reviewers:
    
    Subscribers: TerrorJack, rwbarton, carter
    
    GHC Trac Issues: #15548 -- progress towards, but not a fix
    
    Differential Revision: https://phabricator.haskell.org/D5082 -- extract
    from that
    fc3f421b
  • Ryan Scott's avatar
    Use ghc-prim < 0.7, not <= 0.6.1, as upper version bounds · be0dde8e
    Ryan Scott authored and Marge Bot's avatar Marge Bot committed
    Using `ghc-prim <= 0.6.1` is somewhat dodgy from a PVP point of view,
    as it makes it awkward to support new minor releases of `ghc-prim`.
    Let's instead use `< 0.7`, which is the idiomatic way of expressing
    PVP-compliant upper version bounds.
    be0dde8e
  • Carter Schonwald's avatar
    removing x87 register support from native code gen · fbf7331b
    Carter Schonwald authored
    * simplifies registers to have GPR, Float and Double, by removing the SSE2 and X87 Constructors
    * makes -msse2 assumed/default for x86 platforms, fixing a long standing nondeterminism in rounding
    behavior in 32bit haskell code
    * removes the 80bit floating point representation from the supported float sizes
    * there's still 1 tiny bit of x87 support needed,
    for handling float and double return values in FFI calls w.r.t. the C ABI on x86_32,
    but this one piece does not leak into the rest of the NCG.
    * Lots of code that hasn't been touched in a long time got deleted as a
    consequence of all of this.
    
    all in all, this change paves the way towards a lot of future further
    improvements in how GHC handles floating point computations, along with
    making the native code gen more accessible to a larger pool of contributors.
    fbf7331b
...@@ -81,7 +81,6 @@ assignArgumentsPos dflags off conv arg_ty reps = (stk_off, assignments) ...@@ -81,7 +81,6 @@ assignArgumentsPos dflags off conv arg_ty reps = (stk_off, assignments)
| passFloatInXmm -> k (RegisterParam (DoubleReg s), (vs, fs, ds, ls, ss)) | passFloatInXmm -> k (RegisterParam (DoubleReg s), (vs, fs, ds, ls, ss))
(W64, (vs, fs, d:ds, ls, ss)) (W64, (vs, fs, d:ds, ls, ss))
| not passFloatInXmm -> k (RegisterParam d, (vs, fs, ds, ls, ss)) | not passFloatInXmm -> k (RegisterParam d, (vs, fs, ds, ls, ss))
(W80, _) -> panic "F80 unsupported register type"
_ -> (assts, (r:rs)) _ -> (assts, (r:rs))
int = case (w, regs) of int = case (w, regs) of
(W128, _) -> panic "W128 unsupported register type" (W128, _) -> panic "W128 unsupported register type"
...@@ -100,6 +99,7 @@ assignArgumentsPos dflags off conv arg_ty reps = (stk_off, assignments) ...@@ -100,6 +99,7 @@ assignArgumentsPos dflags off conv arg_ty reps = (stk_off, assignments)
passFloatArgsInXmm :: DynFlags -> Bool passFloatArgsInXmm :: DynFlags -> Bool
passFloatArgsInXmm dflags = case platformArch (targetPlatform dflags) of passFloatArgsInXmm dflags = case platformArch (targetPlatform dflags) of
ArchX86_64 -> True ArchX86_64 -> True
ArchX86 -> False
_ -> False _ -> False
-- We used to spill vector registers to the stack since the LLVM backend didn't -- We used to spill vector registers to the stack since the LLVM backend didn't
......
...@@ -474,6 +474,9 @@ instance Eq GlobalReg where ...@@ -474,6 +474,9 @@ instance Eq GlobalReg where
FloatReg i == FloatReg j = i==j FloatReg i == FloatReg j = i==j
DoubleReg i == DoubleReg j = i==j DoubleReg i == DoubleReg j = i==j
LongReg i == LongReg j = i==j LongReg i == LongReg j = i==j
-- NOTE: The XMM, YMM and ZMM registers are actually the same registers,
-- at least with respect to storing at YMM i and then reading from XMM i,
-- and similarly for ZMM etc.
XmmReg i == XmmReg j = i==j XmmReg i == XmmReg j = i==j
YmmReg i == YmmReg j = i==j YmmReg i == YmmReg j = i==j
ZmmReg i == ZmmReg j = i==j ZmmReg i == ZmmReg j = i==j
...@@ -584,6 +587,9 @@ globalRegType dflags (VanillaReg _ VNonGcPtr) = bWord dflags ...@@ -584,6 +587,9 @@ globalRegType dflags (VanillaReg _ VNonGcPtr) = bWord dflags
globalRegType _ (FloatReg _) = cmmFloat W32 globalRegType _ (FloatReg _) = cmmFloat W32
globalRegType _ (DoubleReg _) = cmmFloat W64 globalRegType _ (DoubleReg _) = cmmFloat W64
globalRegType _ (LongReg _) = cmmBits W64 globalRegType _ (LongReg _) = cmmBits W64
-- TODO: improve the internal model of SIMD/vectorized registers
-- the right design SHOULD improve handling of float and double code too.
-- see remarks in "NOTE [SIMD Design for the future]" in StgCmmPrim
globalRegType _ (XmmReg _) = cmmVec 4 (cmmBits W32) globalRegType _ (XmmReg _) = cmmVec 4 (cmmBits W32)
globalRegType _ (YmmReg _) = cmmVec 8 (cmmBits W32) globalRegType _ (YmmReg _) = cmmVec 8 (cmmBits W32)
globalRegType _ (ZmmReg _) = cmmVec 16 (cmmBits W32) globalRegType _ (ZmmReg _) = cmmVec 16 (cmmBits W32)
......
...@@ -166,9 +166,6 @@ isFloat64 _other = False ...@@ -166,9 +166,6 @@ isFloat64 _other = False
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
data Width = W8 | W16 | W32 | W64 data Width = W8 | W16 | W32 | W64
| W80 -- Extended double-precision float,
-- used in x86 native codegen only.
-- (we use Ord, so it'd better be in this order)
| W128 | W128
| W256 | W256
| W512 | W512
...@@ -185,7 +182,7 @@ mrStr W64 = sLit("W64") ...@@ -185,7 +182,7 @@ mrStr W64 = sLit("W64")
mrStr W128 = sLit("W128") mrStr W128 = sLit("W128")
mrStr W256 = sLit("W256") mrStr W256 = sLit("W256")
mrStr W512 = sLit("W512") mrStr W512 = sLit("W512")
mrStr W80 = sLit("W80")
-------- Common Widths ------------ -------- Common Widths ------------
...@@ -222,7 +219,7 @@ widthInBits W64 = 64 ...@@ -222,7 +219,7 @@ widthInBits W64 = 64
widthInBits W128 = 128 widthInBits W128 = 128
widthInBits W256 = 256 widthInBits W256 = 256
widthInBits W512 = 512 widthInBits W512 = 512
widthInBits W80 = 80
widthInBytes :: Width -> Int widthInBytes :: Width -> Int
widthInBytes W8 = 1 widthInBytes W8 = 1
...@@ -232,7 +229,7 @@ widthInBytes W64 = 8 ...@@ -232,7 +229,7 @@ widthInBytes W64 = 8
widthInBytes W128 = 16 widthInBytes W128 = 16
widthInBytes W256 = 32 widthInBytes W256 = 32
widthInBytes W512 = 64 widthInBytes W512 = 64
widthInBytes W80 = 10
widthFromBytes :: Int -> Width widthFromBytes :: Int -> Width
widthFromBytes 1 = W8 widthFromBytes 1 = W8
...@@ -242,7 +239,7 @@ widthFromBytes 8 = W64 ...@@ -242,7 +239,7 @@ widthFromBytes 8 = W64
widthFromBytes 16 = W128 widthFromBytes 16 = W128
widthFromBytes 32 = W256 widthFromBytes 32 = W256
widthFromBytes 64 = W512 widthFromBytes 64 = W512
widthFromBytes 10 = W80
widthFromBytes n = pprPanic "no width for given number of bytes" (ppr n) widthFromBytes n = pprPanic "no width for given number of bytes" (ppr n)
-- log_2 of the width in bytes, useful for generating shifts. -- log_2 of the width in bytes, useful for generating shifts.
...@@ -254,7 +251,7 @@ widthInLog W64 = 3 ...@@ -254,7 +251,7 @@ widthInLog W64 = 3
widthInLog W128 = 4 widthInLog W128 = 4
widthInLog W256 = 5 widthInLog W256 = 5
widthInLog W512 = 6 widthInLog W512 = 6
widthInLog W80 = panic "widthInLog: F80"
-- widening / narrowing -- widening / narrowing
......
...@@ -1727,8 +1727,38 @@ vecElemProjectCast dflags WordVec W32 = Just (mo_u_32ToWord dflags) ...@@ -1727,8 +1727,38 @@ vecElemProjectCast dflags WordVec W32 = Just (mo_u_32ToWord dflags)
vecElemProjectCast _ WordVec W64 = Nothing vecElemProjectCast _ WordVec W64 = Nothing
vecElemProjectCast _ _ _ = Nothing vecElemProjectCast _ _ _ = Nothing
-- NOTE [SIMD Design for the future]
-- Check to make sure that we can generate code for the specified vector type -- Check to make sure that we can generate code for the specified vector type
-- given the current set of dynamic flags. -- given the current set of dynamic flags.
-- Currently these checks are specific to x86 and x86_64 architecture.
-- This should be fixed!
-- In particular,
-- 1) Add better support for other architectures! (this may require a redesign)
-- 2) Decouple design choices from LLVM's pseudo SIMD model!
-- The high level LLVM naive rep makes per-CPU-family SIMD generation its own
-- optimization problem, and hides important differences in e.g. ARM vs x86_64 SIMD
-- 3) Depending on the architecture, the SIMD registers may also support general
-- computations on Float/Double/Word/Int scalars, but currently on
-- for example x86_64, we always put Word/Int (or sized) in GPR
-- (general purpose) registers. Would relaxing that allow for
-- useful optimization opportunities?
-- Phrased differently, it is worth experimenting with supporting
-- different register mapping strategies than we currently have, especially if
-- someday we want SIMD to be a first class denizen in GHC along with scalar
-- values!
-- The current design with respect to register mapping of scalars could
-- very well be the best, but exploring the design space and doing careful
-- measurements is the only way to validate that.
-- In some next generation CPU ISAs, notably RISC V, the SIMD extension
-- includes support for a sort of run time CPU dependent vectorization parameter,
-- where a loop may act upon a single scalar each iteration OR some 2,4,8 ...
-- element chunk! Time will tell if that direction sees wide adoption,
-- but it is from that context that unifying our handling of simd and scalars
-- may benefit. It is not likely to benefit current architectures, though
-- it may very well be a design perspective that helps guide improving the NCG.
checkVecCompatibility :: DynFlags -> PrimOpVecCat -> Length -> Width -> FCode () checkVecCompatibility :: DynFlags -> PrimOpVecCat -> Length -> Width -> FCode ()
checkVecCompatibility dflags vcat l w = do checkVecCompatibility dflags vcat l w = do
when (hscTarget dflags /= HscLlvm) $ do when (hscTarget dflags /= HscLlvm) $ do
......
...@@ -97,7 +97,6 @@ cmmToLlvmType ty | isVecType ty = LMVector (vecLength ty) (cmmToLlvmType (vecE ...@@ -97,7 +97,6 @@ cmmToLlvmType ty | isVecType ty = LMVector (vecLength ty) (cmmToLlvmType (vecE
widthToLlvmFloat :: Width -> LlvmType widthToLlvmFloat :: Width -> LlvmType
widthToLlvmFloat W32 = LMFloat widthToLlvmFloat W32 = LMFloat
widthToLlvmFloat W64 = LMDouble widthToLlvmFloat W64 = LMDouble
widthToLlvmFloat W80 = LMFloat80
widthToLlvmFloat W128 = LMFloat128 widthToLlvmFloat W128 = LMFloat128
widthToLlvmFloat w = panic $ "widthToLlvmFloat: Bad float size: " ++ show w widthToLlvmFloat w = panic $ "widthToLlvmFloat: Bad float size: " ++ show w
......
...@@ -58,7 +58,7 @@ module DynFlags ( ...@@ -58,7 +58,7 @@ module DynFlags (
fFlags, fLangFlags, xFlags, fFlags, fLangFlags, xFlags,
wWarningFlags, wWarningFlags,
dynFlagDependencies, dynFlagDependencies,
tablesNextToCode, mkTablesNextToCode, tablesNextToCode,
makeDynFlagsConsistent, makeDynFlagsConsistent,
shouldUseColor, shouldUseColor,
shouldUseHexWordLiterals, shouldUseHexWordLiterals,
...@@ -5833,20 +5833,24 @@ data SseVersion = SSE1 ...@@ -5833,20 +5833,24 @@ data SseVersion = SSE1
isSseEnabled :: DynFlags -> Bool isSseEnabled :: DynFlags -> Bool
isSseEnabled dflags = case platformArch (targetPlatform dflags) of isSseEnabled dflags = case platformArch (targetPlatform dflags) of
ArchX86_64 -> True ArchX86_64 -> True
ArchX86 -> sseVersion dflags >= Just SSE1 ArchX86 -> True
_ -> False _ -> False
isSse2Enabled :: DynFlags -> Bool isSse2Enabled :: DynFlags -> Bool
isSse2Enabled dflags = case platformArch (targetPlatform dflags) of isSse2Enabled dflags = case platformArch (targetPlatform dflags) of
ArchX86_64 -> -- SSE2 is fixed on for x86_64. It would be -- We Assume SSE1 and SSE2 operations are available on both
-- possible to make it optional, but we'd need to -- x86 and x86_64. Historically we didn't default to SSE2 and
-- fix at least the foreign call code where the -- SSE1 on x86, which results in defacto nondeterminism for how
-- calling convention specifies the use of xmm regs, -- rounding behaves in the associated x87 floating point instructions
-- and possibly other places. -- because variations in the spill/fpu stack placement of arguments for
True -- operations would change the precision and final result of what
ArchX86 -> sseVersion dflags >= Just SSE2 -- would otherwise be the same expressions with respect to single or
-- double precision IEEE floating point computations.
ArchX86_64 -> True
ArchX86 -> True
_ -> False _ -> False
isSse4_2Enabled :: DynFlags -> Bool isSse4_2Enabled :: DynFlags -> Bool
isSse4_2Enabled dflags = sseVersion dflags >= Just SSE42 isSse4_2Enabled dflags = sseVersion dflags >= Just SSE42
......
...@@ -199,15 +199,9 @@ initSysTools top_dir ...@@ -199,15 +199,9 @@ initSysTools top_dir
let unreg_gcc_args = if targetUnregisterised let unreg_gcc_args = if targetUnregisterised
then ["-DNO_REGS", "-DUSE_MINIINTERPRETER"] then ["-DNO_REGS", "-DUSE_MINIINTERPRETER"]
else [] else []
-- TABLES_NEXT_TO_CODE affects the info table layout.
tntc_gcc_args
| mkTablesNextToCode targetUnregisterised
= ["-DTABLES_NEXT_TO_CODE"]
| otherwise = []
cpp_args= map Option (words cpp_args_str) cpp_args= map Option (words cpp_args_str)
gcc_args = map Option (words gcc_args_str gcc_args = map Option (words gcc_args_str
++ unreg_gcc_args ++ unreg_gcc_args)
++ tntc_gcc_args)
ldSupportsCompactUnwind <- getBooleanSetting "ld supports compact unwind" ldSupportsCompactUnwind <- getBooleanSetting "ld supports compact unwind"
ldSupportsBuildId <- getBooleanSetting "ld supports build-id" ldSupportsBuildId <- getBooleanSetting "ld supports build-id"
ldSupportsFilelist <- getBooleanSetting "ld supports filelist" ldSupportsFilelist <- getBooleanSetting "ld supports filelist"
......
...@@ -179,7 +179,7 @@ nativeCodeGen dflags this_mod modLoc h us cmms ...@@ -179,7 +179,7 @@ nativeCodeGen dflags this_mod modLoc h us cmms
x86NcgImpl :: DynFlags -> NcgImpl (Alignment, CmmStatics) x86NcgImpl :: DynFlags -> NcgImpl (Alignment, CmmStatics)
X86.Instr.Instr X86.Instr.JumpDest X86.Instr.Instr X86.Instr.JumpDest
x86NcgImpl dflags x86NcgImpl dflags
= (x86_64NcgImpl dflags) { ncg_x86fp_kludge = map x86fp_kludge } = (x86_64NcgImpl dflags)
x86_64NcgImpl :: DynFlags -> NcgImpl (Alignment, CmmStatics) x86_64NcgImpl :: DynFlags -> NcgImpl (Alignment, CmmStatics)
X86.Instr.Instr X86.Instr.JumpDest X86.Instr.Instr X86.Instr.JumpDest
...@@ -194,7 +194,6 @@ x86_64NcgImpl dflags ...@@ -194,7 +194,6 @@ x86_64NcgImpl dflags
,pprNatCmmDecl = X86.Ppr.pprNatCmmDecl ,pprNatCmmDecl = X86.Ppr.pprNatCmmDecl
,maxSpillSlots = X86.Instr.maxSpillSlots dflags ,maxSpillSlots = X86.Instr.maxSpillSlots dflags
,allocatableRegs = X86.Regs.allocatableRegs platform ,allocatableRegs = X86.Regs.allocatableRegs platform
,ncg_x86fp_kludge = id
,ncgAllocMoreStack = X86.Instr.allocMoreStack platform ,ncgAllocMoreStack = X86.Instr.allocMoreStack platform
,ncgExpandTop = id ,ncgExpandTop = id
,ncgMakeFarBranches = const id ,ncgMakeFarBranches = const id
...@@ -215,7 +214,6 @@ ppcNcgImpl dflags ...@@ -215,7 +214,6 @@ ppcNcgImpl dflags
,pprNatCmmDecl = PPC.Ppr.pprNatCmmDecl ,pprNatCmmDecl = PPC.Ppr.pprNatCmmDecl
,maxSpillSlots = PPC.Instr.maxSpillSlots dflags ,maxSpillSlots = PPC.Instr.maxSpillSlots dflags
,allocatableRegs = PPC.Regs.allocatableRegs platform ,allocatableRegs = PPC.Regs.allocatableRegs platform
,ncg_x86fp_kludge = id
,ncgAllocMoreStack = PPC.Instr.allocMoreStack platform ,ncgAllocMoreStack = PPC.Instr.allocMoreStack platform
,ncgExpandTop = id ,ncgExpandTop = id
,ncgMakeFarBranches = PPC.Instr.makeFarBranches ,ncgMakeFarBranches = PPC.Instr.makeFarBranches
...@@ -236,7 +234,6 @@ sparcNcgImpl dflags ...@@ -236,7 +234,6 @@ sparcNcgImpl dflags
,pprNatCmmDecl = SPARC.Ppr.pprNatCmmDecl ,pprNatCmmDecl = SPARC.Ppr.pprNatCmmDecl
,maxSpillSlots = SPARC.Instr.maxSpillSlots dflags ,maxSpillSlots = SPARC.Instr.maxSpillSlots dflags
,allocatableRegs = SPARC.Regs.allocatableRegs ,allocatableRegs = SPARC.Regs.allocatableRegs
,ncg_x86fp_kludge = id
,ncgAllocMoreStack = noAllocMoreStack ,ncgAllocMoreStack = noAllocMoreStack
,ncgExpandTop = map SPARC.CodeGen.Expand.expandTop ,ncgExpandTop = map SPARC.CodeGen.Expand.expandTop
,ncgMakeFarBranches = const id ,ncgMakeFarBranches = const id
...@@ -680,19 +677,10 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count ...@@ -680,19 +677,10 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count
foldl' (\m (from,to) -> addImmediateSuccessor from to m ) foldl' (\m (from,to) -> addImmediateSuccessor from to m )
cfgWithFixupBlks stack_updt_blks cfgWithFixupBlks stack_updt_blks
---- x86fp_kludge. This pass inserts ffree instructions to clear
---- the FPU stack on x86. The x86 ABI requires that the FPU stack
---- is clear, and library functions can return odd results if it
---- isn't.
----
---- NB. must happen before shortcutBranches, because that
---- generates JXX_GBLs which we can't fix up in x86fp_kludge.
let kludged = {-# SCC "x86fp_kludge" #-} ncg_x86fp_kludge ncgImpl alloced
---- generate jump tables ---- generate jump tables
let tabled = let tabled =
{-# SCC "generateJumpTables" #-} {-# SCC "generateJumpTables" #-}
generateJumpTables ncgImpl kludged generateJumpTables ncgImpl alloced
dumpIfSet_dyn dflags dumpIfSet_dyn dflags
Opt_D_dump_cfg_weights "CFG Update information" Opt_D_dump_cfg_weights "CFG Update information"
...@@ -787,12 +775,6 @@ checkLayout procsUnsequenced procsSequenced = ...@@ -787,12 +775,6 @@ checkLayout procsUnsequenced procsSequenced =
getBlockIds (CmmProc _ _ _ (ListGraph blocks)) = getBlockIds (CmmProc _ _ _ (ListGraph blocks)) =
setFromList $ map blockId blocks setFromList $ map blockId blocks
x86fp_kludge :: NatCmmDecl (Alignment, CmmStatics) X86.Instr.Instr -> NatCmmDecl (Alignment, CmmStatics) X86.Instr.Instr
x86fp_kludge top@(CmmData _ _) = top
x86fp_kludge (CmmProc info lbl live (ListGraph code)) =
CmmProc info lbl live (ListGraph $ X86.Instr.i386_insert_ffrees code)
-- | Compute unwinding tables for the blocks of a procedure -- | Compute unwinding tables for the blocks of a procedure
computeUnwinding :: Instruction instr computeUnwinding :: Instruction instr
=> DynFlags -> NcgImpl statics instr jumpDest => DynFlags -> NcgImpl statics instr jumpDest
......
...@@ -47,7 +47,6 @@ data Format ...@@ -47,7 +47,6 @@ data Format
| II64 | II64
| FF32 | FF32
| FF64 | FF64
| FF80
deriving (Show, Eq) deriving (Show, Eq)
...@@ -70,7 +69,7 @@ floatFormat width ...@@ -70,7 +69,7 @@ floatFormat width
= case width of = case width of
W32 -> FF32 W32 -> FF32
W64 -> FF64 W64 -> FF64
W80 -> FF80
other -> pprPanic "Format.floatFormat" (ppr other) other -> pprPanic "Format.floatFormat" (ppr other)
...@@ -80,7 +79,6 @@ isFloatFormat format ...@@ -80,7 +79,6 @@ isFloatFormat format
= case format of = case format of
FF32 -> True FF32 -> True
FF64 -> True FF64 -> True
FF80 -> True
_ -> False _ -> False
...@@ -101,7 +99,7 @@ formatToWidth format ...@@ -101,7 +99,7 @@ formatToWidth format
II64 -> W64 II64 -> W64
FF32 -> W32 FF32 -> W32
FF64 -> W64 FF64 -> W64
FF80 -> W80
formatInBytes :: Format -> Int formatInBytes :: Format -> Int
formatInBytes = widthInBytes . formatToWidth formatInBytes = widthInBytes . formatToWidth
...@@ -76,7 +76,6 @@ data NcgImpl statics instr jumpDest = NcgImpl { ...@@ -76,7 +76,6 @@ data NcgImpl statics instr jumpDest = NcgImpl {
pprNatCmmDecl :: NatCmmDecl statics instr -> SDoc, pprNatCmmDecl :: NatCmmDecl statics instr -> SDoc,
maxSpillSlots :: Int, maxSpillSlots :: Int,
allocatableRegs :: [RealReg], allocatableRegs :: [RealReg],
ncg_x86fp_kludge :: [NatCmmDecl statics instr] -> [NatCmmDecl statics instr],
ncgExpandTop :: [NatCmmDecl statics instr] -> [NatCmmDecl statics instr], ncgExpandTop :: [NatCmmDecl statics instr] -> [NatCmmDecl statics instr],
ncgAllocMoreStack :: Int -> NatCmmDecl statics instr ncgAllocMoreStack :: Int -> NatCmmDecl statics instr
-> UniqSM (NatCmmDecl statics instr, [(BlockId,BlockId)]), -> UniqSM (NatCmmDecl statics instr, [(BlockId,BlockId)]),
......
...@@ -1593,7 +1593,7 @@ genCCall' ...@@ -1593,7 +1593,7 @@ genCCall'
-> [CmmActual] -- arguments (of mixed type) -> [CmmActual] -- arguments (of mixed type)
-> NatM InstrBlock -> NatM InstrBlock
{- {-
PowerPC Linux uses the System V Release 4 Calling Convention PowerPC Linux uses the System V Release 4 Calling Convention
for PowerPC. It is described in the for PowerPC. It is described in the
"System V Application Binary Interface PowerPC Processor Supplement". "System V Application Binary Interface PowerPC Processor Supplement".
...@@ -1906,7 +1906,7 @@ genCCall' dflags gcp target dest_regs args ...@@ -1906,7 +1906,7 @@ genCCall' dflags gcp target dest_regs args
FF32 -> (1, 1, 4, fprs) FF32 -> (1, 1, 4, fprs)
FF64 -> (2, 1, 8, fprs) FF64 -> (2, 1, 8, fprs)
II64 -> panic "genCCall' passArguments II64" II64 -> panic "genCCall' passArguments II64"
FF80 -> panic "genCCall' passArguments FF80"
GCP32ELF -> GCP32ELF ->
case cmmTypeFormat rep of case cmmTypeFormat rep of
II8 -> (1, 0, 4, gprs) II8 -> (1, 0, 4, gprs)
...@@ -1916,7 +1916,6 @@ genCCall' dflags gcp target dest_regs args ...@@ -1916,7 +1916,6 @@ genCCall' dflags gcp target dest_regs args
FF32 -> (0, 1, 4, fprs) FF32 -> (0, 1, 4, fprs)
FF64 -> (0, 1, 8, fprs) FF64 -> (0, 1, 8, fprs)
II64 -> panic "genCCall' passArguments II64" II64 -> panic "genCCall' passArguments II64"
FF80 -> panic "genCCall' passArguments FF80"
GCP64ELF _ -> GCP64ELF _ ->
case cmmTypeFormat rep of case cmmTypeFormat rep of
II8 -> (1, 0, 8, gprs) II8 -> (1, 0, 8, gprs)
...@@ -1928,7 +1927,6 @@ genCCall' dflags gcp target dest_regs args ...@@ -1928,7 +1927,6 @@ genCCall' dflags gcp target dest_regs args
-- the FPRs. -- the FPRs.
FF32 -> (1, 1, 8, fprs) FF32 -> (1, 1, 8, fprs)
FF64 -> (1, 1, 8, fprs) FF64 -> (1, 1, 8, fprs)
FF80 -> panic "genCCall' passArguments FF80"
moveResult reduceToFF32 = moveResult reduceToFF32 =
case dest_regs of case dest_regs of
......
...@@ -161,7 +161,7 @@ pprReg r ...@@ -161,7 +161,7 @@ pprReg r
RegVirtual (VirtualRegHi u) -> text "%vHi_" <> pprUniqueAlways u RegVirtual (VirtualRegHi u) -> text "%vHi_" <> pprUniqueAlways u
RegVirtual (VirtualRegF u) -> text "%vF_" <> pprUniqueAlways u RegVirtual (VirtualRegF u) -> text "%vF_" <> pprUniqueAlways u
RegVirtual (VirtualRegD u) -> text "%vD_" <> pprUniqueAlways u RegVirtual (VirtualRegD u) -> text "%vD_" <> pprUniqueAlways u
RegVirtual (VirtualRegSSE u) -> text "%vSSE_" <> pprUniqueAlways u
where where
ppr_reg_no :: Int -> SDoc ppr_reg_no :: Int -> SDoc
ppr_reg_no i ppr_reg_no i
...@@ -179,8 +179,7 @@ pprFormat x ...@@ -179,8 +179,7 @@ pprFormat x
II32 -> sLit "w" II32 -> sLit "w"
II64 -> sLit "d" II64 -> sLit "d"
FF32 -> sLit "fs" FF32 -> sLit "fs"
FF64 -> sLit "fd" FF64 -> sLit "fd")
_ -> panic "PPC.Ppr.pprFormat: no match")
pprCond :: Cond -> SDoc pprCond :: Cond -> SDoc
...@@ -365,7 +364,6 @@ pprInstr (LD fmt reg addr) = hcat [ ...@@ -365,7 +364,6 @@ pprInstr (LD fmt reg addr) = hcat [
II64 -> sLit "d" II64 -> sLit "d"
FF32 -> sLit "fs" FF32 -> sLit "fs"
FF64 -> sLit "fd" FF64 -> sLit "fd"
_ -> panic "PPC.Ppr.pprInstr: no match"
), ),
case addr of AddrRegImm _ _ -> empty case addr of AddrRegImm _ _ -> empty
AddrRegReg _ _ -> char 'x', AddrRegReg _ _ -> char 'x',
...@@ -405,7 +403,6 @@ pprInstr (LA fmt reg addr) = hcat [ ...@@ -405,7 +403,6 @@ pprInstr (LA fmt reg addr) = hcat [
II64 -> sLit "d" II64 -> sLit "d"
FF32 -> sLit "fs" FF32 -> sLit "fs"
FF64 -> sLit "fd" FF64 -> sLit "fd"
_ -> panic "PPC.Ppr.pprInstr: no match"
), ),
case addr of AddrRegImm _ _ -> empty case addr of AddrRegImm _ _ -> empty
AddrRegReg _ _ -> char 'x', AddrRegReg _ _ -> char 'x',
......
...@@ -131,7 +131,7 @@ regDotColor reg ...@@ -131,7 +131,7 @@ regDotColor reg
RcInteger -> text "blue" RcInteger -> text "blue"
RcFloat -> text "red" RcFloat -> text "red"
RcDouble -> text "green" RcDouble -> text "green"
RcDoubleSSE -> text "yellow"
-- immediates ------------------------------------------------------------------ -- immediates ------------------------------------------------------------------
......
...@@ -56,7 +56,7 @@ data VirtualReg ...@@ -56,7 +56,7 @@ data VirtualReg
| VirtualRegHi {-# UNPACK #-} !Unique -- High part of 2-word register | VirtualRegHi {-# UNPACK #-} !Unique -- High part of 2-word register
| VirtualRegF {-# UNPACK #-} !Unique | VirtualRegF {-# UNPACK #-} !Unique
| VirtualRegD {-# UNPACK #-} !Unique | VirtualRegD {-# UNPACK #-} !Unique
| VirtualRegSSE {-# UNPACK #-} !Unique
deriving (Eq, Show) deriving (Eq, Show)
-- This is laborious, but necessary. We can't derive Ord because -- This is laborious, but necessary. We can't derive Ord because
...@@ -69,15 +69,14 @@ instance Ord VirtualReg where ...@@ -69,15 +69,14 @@ instance Ord VirtualReg where
compare (VirtualRegHi a) (VirtualRegHi b) = nonDetCmpUnique a b compare (VirtualRegHi a) (VirtualRegHi b) = nonDetCmpUnique a b
compare (VirtualRegF a) (VirtualRegF b) = nonDetCmpUnique a b compare (VirtualRegF a) (VirtualRegF b) = nonDetCmpUnique a b
compare (VirtualRegD a) (VirtualRegD b) = nonDetCmpUnique a b compare (VirtualRegD a) (VirtualRegD b) = nonDetCmpUnique a b
compare (VirtualRegSSE a) (VirtualRegSSE b) = nonDetCmpUnique a b
compare VirtualRegI{} _ = LT compare VirtualRegI{} _ = LT
compare _ VirtualRegI{} = GT compare _ VirtualRegI{} = GT
compare VirtualRegHi{} _ = LT compare VirtualRegHi{} _ = LT
compare _ VirtualRegHi{} = GT compare _ VirtualRegHi{} = GT
compare VirtualRegF{} _ = LT compare VirtualRegF{} _ = LT
compare _ VirtualRegF{} = GT compare _ VirtualRegF{} = GT
compare VirtualRegD{} _ = LT
compare _ VirtualRegD{} = GT
instance Uniquable VirtualReg where instance Uniquable VirtualReg where
...@@ -87,16 +86,18 @@ instance Uniquable VirtualReg where ...@@ -87,16 +86,18 @@ instance Uniquable VirtualReg where
VirtualRegHi u -> u VirtualRegHi u -> u
VirtualRegF u -> u VirtualRegF u -> u
VirtualRegD u -> u VirtualRegD u -> u
VirtualRegSSE u -> u
instance Outputable VirtualReg where instance Outputable VirtualReg where
ppr reg ppr reg
= case reg of = case reg of
VirtualRegI u -> text "%vI_" <> pprUniqueAlways u VirtualRegI u -> text "%vI_" <> pprUniqueAlways u
VirtualRegHi u -> text "%vHi_" <> pprUniqueAlways u VirtualRegHi u -> text "%vHi_" <> pprUniqueAlways u
VirtualRegF u -> text "%vF_" <> pprUniqueAlways u -- this code is kinda wrong on x86
VirtualRegD u -> text "%vD_" <> pprUniqueAlways u -- because float and double occupy the same register set
VirtualRegSSE u -> text "%vSSE_" <> pprUniqueAlways u -- namely SSE2 register xmm0 .. xmm15
VirtualRegF u -> text "%vFloat_" <> pprUniqueAlways u
VirtualRegD u -> text "%vDouble_" <> pprUniqueAlways u
renameVirtualReg :: Unique -> VirtualReg -> VirtualReg renameVirtualReg :: Unique -> VirtualReg -> VirtualReg
...@@ -106,7 +107,6 @@ renameVirtualReg u r ...@@ -106,7 +107,6 @@ renameVirtualReg u r
VirtualRegHi _ -> VirtualRegHi u VirtualRegHi _ -> VirtualRegHi u
VirtualRegF _ -> VirtualRegF u VirtualRegF _ -> VirtualRegF u
VirtualRegD _ -> VirtualRegD u VirtualRegD _ -> VirtualRegD u
VirtualRegSSE _ -> VirtualRegSSE u
classOfVirtualReg :: VirtualReg -> RegClass classOfVirtualReg :: VirtualReg -> RegClass
...@@ -116,7 +116,7 @@ classOfVirtualReg vr ...@@ -116,7 +116,7 @@ classOfVirtualReg vr
VirtualRegHi{} -> RcInteger VirtualRegHi{} -> RcInteger
VirtualRegF{} -> RcFloat VirtualRegF{} -> RcFloat
VirtualRegD{} -> RcDouble VirtualRegD{} -> RcDouble
VirtualRegSSE{} -> RcDoubleSSE
-- Determine the upper-half vreg for a 64-bit quantity on a 32-bit platform -- Determine the upper-half vreg for a 64-bit quantity on a 32-bit platform
......
...@@ -134,6 +134,10 @@ trivColorable platform virtualRegSqueeze realRegSqueeze RcInteger conflicts excl ...@@ -134,6 +134,10 @@ trivColorable platform virtualRegSqueeze realRegSqueeze RcInteger conflicts excl
trivColorable platform virtualRegSqueeze realRegSqueeze RcFloat conflicts exclusions trivColorable platform virtualRegSqueeze realRegSqueeze RcFloat conflicts exclusions
| let cALLOCATABLE_REGS_FLOAT | let cALLOCATABLE_REGS_FLOAT
= (case platformArch platform of = (case platformArch platform of
-- On x86_64 and x86, RcFloat and RcDouble
-- use the same registers,
-- so we only use RcDouble to represent the
-- register allocation problem for those types.
ArchX86 -> 0 ArchX86 -> 0
ArchX86_64 -> 0 ArchX86_64 -> 0
ArchPPC -> 0 ArchPPC -> 0
...@@ -160,8 +164,14 @@ trivColorable platform virtualRegSqueeze realRegSqueeze RcFloat conflicts exclus ...@@ -160,8 +164,14 @@ trivColorable platform virtualRegSqueeze realRegSqueeze RcFloat conflicts exclus
trivColorable platform virtualRegSqueeze realRegSqueeze RcDouble conflicts exclusions trivColorable platform virtualRegSqueeze realRegSqueeze RcDouble conflicts exclusions
| let cALLOCATABLE_REGS_DOUBLE | let cALLOCATABLE_REGS_DOUBLE
= (case platformArch platform of = (case platformArch platform of
ArchX86 -> 6 ArchX86 -> 8
ArchX86_64 -> 0 -- in x86 32bit mode sse2 there are only
-- 8 XMM registers xmm0 ... xmm7
ArchX86_64 -> 10
-- in x86_64 there are 16 XMM registers
-- xmm0 .. xmm15, here 10 is a
-- "dont need to solve conflicts" count that
-- was chosen at some point in the past.
ArchPPC -> 26 ArchPPC -> 26
ArchSPARC -> 11 ArchSPARC -> 11
ArchSPARC64 -> panic "trivColorable ArchSPARC64" ArchSPARC64 -> panic "trivColorable ArchSPARC64"
...@@ -183,31 +193,7 @@ trivColorable platform virtualRegSqueeze realRegSqueeze RcDouble conflicts exclu ...@@ -183,31 +193,7 @@ trivColorable platform virtualRegSqueeze realRegSqueeze RcDouble conflicts exclu
= count3 < cALLOCATABLE_REGS_DOUBLE = count3 < cALLOCATABLE_REGS_DOUBLE
trivColorable platform virtualRegSqueeze realRegSqueeze RcDoubleSSE conflicts exclusions
| let cALLOCATABLE_REGS_SSE
= (case platformArch platform of
ArchX86 -> 8
ArchX86_64 -> 10
ArchPPC -> 0
ArchSPARC -> 0
ArchSPARC64 -> panic "trivColorable ArchSPARC64"
ArchPPC_64 _ -> 0
ArchARM _ _ _ -> panic "trivColorable ArchARM"
ArchARM64 -> panic "trivColorable ArchARM64"
ArchAlpha -> panic "trivColorable ArchAlpha"
ArchMipseb -> panic "trivColorable ArchMipseb"
ArchMipsel -> panic "trivColorable ArchMipsel"
ArchJavaScript-> panic "trivColorable ArchJavaScript"
ArchUnknown -> panic "trivColorable ArchUnknown")
, count2 <- accSqueeze 0 cALLOCATABLE_REGS_SSE
(virtualRegSqueeze RcDoubleSSE)
conflicts
, count3 <- accSqueeze count2 cALLOCATABLE_REGS_SSE
(realRegSqueeze RcDoubleSSE)
exclusions
= count3 < cALLOCATABLE_REGS_SSE
-- Specification Code ---------------------------------------------------------- -- Specification Code ----------------------------------------------------------
......
...@@ -18,7 +18,6 @@ data RegClass ...@@ -18,7 +18,6 @@ data RegClass
= RcInteger = RcInteger
| RcFloat | RcFloat
| RcDouble | RcDouble
| RcDoubleSSE -- x86 only: the SSE regs are a separate class
deriving Eq deriving Eq
...@@ -26,10 +25,8 @@ instance Uniquable RegClass where ...@@ -26,10 +25,8 @@ instance Uniquable RegClass where
getUnique RcInteger = mkRegClassUnique 0 getUnique RcInteger = mkRegClassUnique 0
getUnique RcFloat = mkRegClassUnique 1 getUnique RcFloat = mkRegClassUnique 1
getUnique RcDouble = mkRegClassUnique 2 getUnique RcDouble = mkRegClassUnique 2
getUnique RcDoubleSSE = mkRegClassUnique 3
instance Outputable RegClass where instance Outputable RegClass where
ppr RcInteger = Outputable.text "I" ppr RcInteger = Outputable.text "I"
ppr RcFloat = Outputable.text "F" ppr RcFloat = Outputable.text "F"
ppr RcDouble = Outputable.text "D" ppr RcDouble = Outputable.text "D"
ppr RcDoubleSSE = Outputable.text "S"
...@@ -384,7 +384,6 @@ sparc_mkSpillInstr dflags reg _ slot ...@@ -384,7 +384,6 @@ sparc_mkSpillInstr dflags reg _ slot
RcInteger -> II32 RcInteger -> II32
RcFloat -> FF32 RcFloat -> FF32
RcDouble -> FF64 RcDouble -> FF64
_ -> panic "sparc_mkSpillInstr"
in ST fmt reg (fpRel (negate off_w)) in ST fmt reg (fpRel (negate off_w))
...@@ -405,7 +404,6 @@ sparc_mkLoadInstr dflags reg _ slot ...@@ -405,7 +404,6 @@ sparc_mkLoadInstr dflags reg _ slot
RcInteger -> II32 RcInteger -> II32
RcFloat -> FF32 RcFloat -> FF32
RcDouble -> FF64 RcDouble -> FF64
_ -> panic "sparc_mkLoadInstr"
in LD fmt (fpRel (- off_w)) reg in LD fmt (fpRel (- off_w)) reg
...@@ -454,7 +452,6 @@ sparc_mkRegRegMoveInstr platform src dst ...@@ -454,7 +452,6 @@ sparc_mkRegRegMoveInstr platform src dst
RcInteger -> ADD False False src (RIReg g0) dst RcInteger -> ADD False False src (RIReg g0) dst
RcDouble -> FMOV FF64 src dst RcDouble -> FMOV FF64 src dst
RcFloat -> FMOV FF32 src dst RcFloat -> FMOV FF32 src dst
_ -> panic "sparc_mkRegRegMoveInstr"
| otherwise | otherwise
= panic "SPARC.Instr.mkRegRegMoveInstr: classes of src and dest not the same" = panic "SPARC.Instr.mkRegRegMoveInstr: classes of src and dest not the same"
......
...@@ -143,7 +143,7 @@ pprReg reg ...@@ -143,7 +143,7 @@ pprReg reg
VirtualRegHi u -> text "%vHi_" <> pprUniqueAlways u VirtualRegHi u -> text "%vHi_" <> pprUniqueAlways u
VirtualRegF u -> text "%vF_" <> pprUniqueAlways u VirtualRegF u -> text "%vF_" <> pprUniqueAlways u
VirtualRegD u -> text "%vD_" <> pprUniqueAlways u VirtualRegD u -> text "%vD_" <> pprUniqueAlways u
VirtualRegSSE u -> text "%vSSE_" <> pprUniqueAlways u
RegReal rr RegReal rr
-> case rr of -> case rr of
...@@ -211,8 +211,7 @@ pprFormat x ...@@ -211,8 +211,7 @@ pprFormat x
II32 -> sLit "" II32 -> sLit ""
II64 -> sLit "d" II64 -> sLit "d"
FF32 -> sLit "" FF32 -> sLit ""
FF64 -> sLit "d" FF64 -> sLit "d")
_ -> panic "SPARC.Ppr.pprFormat: no match")
-- | Pretty print a format for an instruction suffix. -- | Pretty print a format for an instruction suffix.
...@@ -226,8 +225,8 @@ pprStFormat x ...@@ -226,8 +225,8 @@ pprStFormat x
II32 -> sLit "" II32 -> sLit ""
II64 -> sLit "x" II64 -> sLit "x"
FF32 -> sLit "" FF32 -> sLit ""
FF64 -> sLit "d" FF64 -> sLit "d")
_ -> panic "SPARC.Ppr.pprFormat: no match")
-- | Pretty print a condition code. -- | Pretty print a condition code.
......
...@@ -104,7 +104,6 @@ virtualRegSqueeze cls vr ...@@ -104,7 +104,6 @@ virtualRegSqueeze cls vr
VirtualRegD{} -> 1 VirtualRegD{} -> 1
_other -> 0 _other -> 0
_other -> 0
{-# INLINE realRegSqueeze #-} {-# INLINE realRegSqueeze #-}
realRegSqueeze :: RegClass -> RealReg -> Int realRegSqueeze :: RegClass -> RealReg -> Int
...@@ -135,7 +134,6 @@ realRegSqueeze cls rr ...@@ -135,7 +134,6 @@ realRegSqueeze cls rr
RealRegPair{} -> 1 RealRegPair{} -> 1
_other -> 0
-- | All the allocatable registers in the machine, -- | All the allocatable registers in the machine,
-- including register pairs. -- including register pairs.
......
...@@ -98,17 +98,25 @@ is32BitPlatform = do ...@@ -98,17 +98,25 @@ is32BitPlatform = do
sse2Enabled :: NatM Bool sse2Enabled :: NatM Bool
sse2Enabled = do sse2Enabled = do
dflags <- getDynFlags dflags <- getDynFlags
return (isSse2Enabled dflags) case platformArch (targetPlatform dflags) of
-- We Assume SSE1 and SSE2 operations are available on both
-- x86 and x86_64. Historically we didn't default to SSE2 and
-- SSE1 on x86, which results in defacto nondeterminism for how
-- rounding behaves in the associated x87 floating point instructions
-- because variations in the spill/fpu stack placement of arguments for
-- operations would change the precision and final result of what
-- would otherwise be the same expressions with respect to single or
-- double precision IEEE floating point computations.
ArchX86_64 -> return True
ArchX86 -> return True
_ -> panic "trying to generate x86/x86_64 on the wrong platform"
sse4_2Enabled :: NatM Bool sse4_2Enabled :: NatM Bool
sse4_2Enabled = do sse4_2Enabled = do
dflags <- getDynFlags dflags <- getDynFlags
return (isSse4_2Enabled dflags) return (isSse4_2Enabled dflags)
if_sse2 :: NatM a -> NatM a -> NatM a
if_sse2 sse2 x87 = do
b <- sse2Enabled
if b then sse2 else x87
cmmTopCodeGen cmmTopCodeGen
:: RawCmmDecl :: RawCmmDecl
...@@ -284,15 +292,14 @@ swizzleRegisterRep (Any _ codefn) format = Any format codefn ...@@ -284,15 +292,14 @@ swizzleRegisterRep (Any _ codefn) format = Any format codefn
-- | Grab the Reg for a CmmReg -- | Grab the Reg for a CmmReg
getRegisterReg :: Platform -> Bool -> CmmReg -> Reg getRegisterReg :: Platform -> CmmReg -> Reg
getRegisterReg _ use_sse2 (CmmLocal (LocalReg u pk)) getRegisterReg _ (CmmLocal (LocalReg u pk))
= let fmt = cmmTypeFormat pk in = -- by Assuming SSE2, Int,Word,Float,Double all can be register allocated
if isFloatFormat fmt && not use_sse2 let fmt = cmmTypeFormat pk in
then RegVirtual (mkVirtualReg u FF80) RegVirtual (mkVirtualReg u fmt)
else RegVirtual (mkVirtualReg u fmt)
getRegisterReg platform _ (CmmGlobal mid) getRegisterReg platform (CmmGlobal mid)
= case globalRegMaybe platform mid of = case globalRegMaybe platform mid of
Just reg -> RegReal $ reg Just reg -> RegReal $ reg
Nothing -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid) Nothing -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid)
...@@ -513,15 +520,14 @@ getRegister' dflags is32Bit (CmmReg reg) ...@@ -513,15 +520,14 @@ getRegister' dflags is32Bit (CmmReg reg)
do reg' <- getPicBaseNat (archWordFormat is32Bit) do reg' <- getPicBaseNat (archWordFormat is32Bit)
return (Fixed (archWordFormat is32Bit) reg' nilOL) return (Fixed (archWordFormat is32Bit) reg' nilOL)
_ -> _ ->
do use_sse2 <- sse2Enabled do
let let
fmt = cmmTypeFormat (cmmRegType dflags reg) fmt = cmmTypeFormat (cmmRegType dflags reg)
format | not use_sse2 && isFloatFormat fmt = FF80 format = fmt
| otherwise = fmt
-- --
let platform = targetPlatform dflags let platform = targetPlatform dflags
return (Fixed format return (Fixed format
(getRegisterReg platform use_sse2 reg) (getRegisterReg platform reg)
nilOL) nilOL)
...@@ -557,8 +563,7 @@ getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W64 W32) [x]) ...@@ -557,8 +563,7 @@ getRegister' _ is32Bit (CmmMachOp (MO_SS_Conv W64 W32) [x])
return $ Fixed II32 rlo code return $ Fixed II32 rlo code
getRegister' _ _ (CmmLit lit@(CmmFloat f w)) = getRegister' _ _ (CmmLit lit@(CmmFloat f w)) =
if_sse2 float_const_sse2 float_const_x87 float_const_sse2 where
where
float_const_sse2 float_const_sse2
| f == 0.0 = do | f == 0.0 = do
let let
...@@ -570,21 +575,7 @@ getRegister' _ _ (CmmLit lit@(CmmFloat f w)) = ...@@ -570,21 +575,7 @@ getRegister' _ _ (CmmLit lit@(CmmFloat f w)) =
| otherwise = do | otherwise = do
Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit
loadFloatAmode True w addr code loadFloatAmode w addr code
float_const_x87 = case w of
W64
| f == 0.0 ->
let code dst = unitOL (GLDZ dst)
in return (Any FF80 code)
| f == 1.0 ->
let code dst = unitOL (GLD1 dst)
in return (Any FF80 code)
_otherwise -> do
Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit
loadFloatAmode False w addr code
-- catch simple cases of zero- or sign-extended load -- catch simple cases of zero- or sign-extended load
getRegister' _ _ (CmmMachOp (MO_UU_Conv W8 W32) [CmmLoad addr _]) = do getRegister' _ _ (CmmMachOp (MO_UU_Conv W8 W32) [CmmLoad addr _]) = do
...@@ -641,11 +632,9 @@ getRegister' _ is32Bit (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg), ...@@ -641,11 +632,9 @@ getRegister' _ is32Bit (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg),
LEA II64 (OpAddr (ripRel (litToImm displacement))) (OpReg dst)) LEA II64 (OpAddr (ripRel (litToImm displacement))) (OpReg dst))
getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
sse2 <- sse2Enabled
case mop of case mop of
MO_F_Neg w MO_F_Neg w -> sse2NegCode w x
| sse2 -> sse2NegCode w x
| otherwise -> trivialUFCode FF80 (GNEG FF80) x
MO_S_Neg w -> triv_ucode NEGI (intFormat w) MO_S_Neg w -> triv_ucode NEGI (intFormat w)
MO_Not w -> triv_ucode NOT (intFormat w) MO_Not w -> triv_ucode NOT (intFormat w)
...@@ -711,9 +700,8 @@ getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps ...@@ -711,9 +700,8 @@ getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
MO_XX_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOV x MO_XX_Conv W16 W64 | not is32Bit -> integerExtend W16 W64 MOV x
MO_XX_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOV x MO_XX_Conv W32 W64 | not is32Bit -> integerExtend W32 W64 MOV x
MO_FF_Conv W32 W64 MO_FF_Conv W32 W64 -> coerceFP2FP W64 x
| sse2 -> coerceFP2FP W64 x
| otherwise -> conversionNop FF80 x
MO_FF_Conv W64 W32 -> coerceFP2FP W32 x MO_FF_Conv W64 W32 -> coerceFP2FP W32 x
...@@ -776,7 +764,6 @@ getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps ...@@ -776,7 +764,6 @@ getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps
getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
sse2 <- sse2Enabled
case mop of case mop of
MO_F_Eq _ -> condFltReg is32Bit EQQ x y MO_F_Eq _ -> condFltReg is32Bit EQQ x y
MO_F_Ne _ -> condFltReg is32Bit NE x y MO_F_Ne _ -> condFltReg is32Bit NE x y
...@@ -800,14 +787,14 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps ...@@ -800,14 +787,14 @@ getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps
MO_U_Lt _ -> condIntReg LU x y MO_U_Lt _ -> condIntReg LU x y
MO_U_Le _ -> condIntReg LEU x y MO_U_Le _ -> condIntReg LEU x y
MO_F_Add w | sse2 -> trivialFCode_sse2 w ADD x y MO_F_Add w -> trivialFCode_sse2 w ADD x y
| otherwise -> trivialFCode_x87 GADD x y
MO_F_Sub w | sse2 -> trivialFCode_sse2 w SUB x y MO_F_Sub w -> trivialFCode_sse2 w SUB x y
| otherwise -> trivialFCode_x87 GSUB x y
MO_F_Quot w | sse2 -> trivialFCode_sse2 w FDIV x y MO_F_Quot w -> trivialFCode_sse2 w FDIV x y
| otherwise -> trivialFCode_x87 GDIV x y
MO_F_Mul w | sse2 -> trivialFCode_sse2 w MUL x y MO_F_Mul w -> trivialFCode_sse2 w MUL x y
| otherwise -> trivialFCode_x87 GMUL x y
MO_Add rep -> add_code rep x y MO_Add rep -> add_code rep x y
MO_Sub rep -> sub_code rep x y MO_Sub rep -> sub_code rep x y
...@@ -1001,8 +988,7 @@ getRegister' _ _ (CmmLoad mem pk) ...@@ -1001,8 +988,7 @@ getRegister' _ _ (CmmLoad mem pk)
| isFloatType pk | isFloatType pk
= do = do
Amode addr mem_code <- getAmode mem Amode addr mem_code <- getAmode mem
use_sse2 <- sse2Enabled loadFloatAmode (typeWidth pk) addr mem_code
loadFloatAmode use_sse2 (typeWidth pk) addr mem_code
getRegister' _ is32Bit (CmmLoad mem pk) getRegister' _ is32Bit (CmmLoad mem pk)
| is32Bit && not (isWord64 pk) | is32Bit && not (isWord64 pk)
...@@ -1132,9 +1118,7 @@ getNonClobberedReg expr = do ...@@ -1132,9 +1118,7 @@ getNonClobberedReg expr = do
return (reg, code) return (reg, code)
reg2reg :: Format -> Reg -> Reg -> Instr reg2reg :: Format -> Reg -> Reg -> Instr
reg2reg format src dst reg2reg format src dst = MOV format (OpReg src) (OpReg dst)
| format == FF80 = GMOV src dst
| otherwise = MOV format (OpReg src) (OpReg dst)
-------------------------------------------------------------------------------- --------------------------------------------------------------------------------
...@@ -1243,8 +1227,7 @@ x86_complex_amode base index shift offset ...@@ -1243,8 +1227,7 @@ x86_complex_amode base index shift offset
getNonClobberedOperand :: CmmExpr -> NatM (Operand, InstrBlock) getNonClobberedOperand :: CmmExpr -> NatM (Operand, InstrBlock)
getNonClobberedOperand (CmmLit lit) = do getNonClobberedOperand (CmmLit lit) = do
use_sse2 <- sse2Enabled if isSuitableFloatingPointLit lit
if use_sse2 && isSuitableFloatingPointLit lit
then do then do
let CmmFloat _ w = lit let CmmFloat _ w = lit
Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit Amode addr code <- memConstant (mkAlignment $ widthInBytes w) lit
...@@ -1259,9 +1242,12 @@ getNonClobberedOperand (CmmLit lit) = do ...@@ -1259,9 +1242,12 @@ getNonClobberedOperand (CmmLit lit) = do
getNonClobberedOperand (CmmLoad mem pk) = do getNonClobberedOperand (CmmLoad mem pk) = do
is32Bit <- is32BitPlatform is32Bit <- is32BitPlatform
use_sse2 <- sse2Enabled -- this logic could be simplified
if (not (isFloatType pk) || use_sse2) -- TODO FIXME
&& (if is32Bit then not (isWord64 pk) else True) if (if is32Bit then not (isWord64 pk) else True)
-- if 32bit and pk is at float/double/simd value
-- or if 64bit
-- this could use some eyeballs or i'll need to stare at it more later
then do then do
dflags <- getDynFlags dflags <- getDynFlags
let platform = targetPlatform dflags let platform = targetPlatform dflags
...@@ -1278,6 +1264,7 @@ getNonClobberedOperand (CmmLoad mem pk) = do ...@@ -1278,6 +1264,7 @@ getNonClobberedOperand (CmmLoad mem pk) = do
return (src, nilOL) return (src, nilOL)
return (OpAddr src', mem_code `appOL` save_code) return (OpAddr src', mem_code `appOL` save_code)
else do else do
-- if its a word or gcptr on 32bit?
getNonClobberedOperand_generic (CmmLoad mem pk) getNonClobberedOperand_generic (CmmLoad mem pk)
getNonClobberedOperand e = getNonClobberedOperand_generic e getNonClobberedOperand e = getNonClobberedOperand_generic e
...@@ -1370,14 +1357,13 @@ memConstant align lit = do ...@@ -1370,14 +1357,13 @@ memConstant align lit = do
return (Amode addr code) return (Amode addr code)
loadFloatAmode :: Bool -> Width -> AddrMode -> InstrBlock -> NatM Register loadFloatAmode :: Width -> AddrMode -> InstrBlock -> NatM Register
loadFloatAmode use_sse2 w addr addr_code = do loadFloatAmode w addr addr_code = do
let format = floatFormat w let format = floatFormat w
code dst = addr_code `snocOL` code dst = addr_code `snocOL`
if use_sse2 MOV format (OpAddr addr) (OpReg dst)
then MOV format (OpAddr addr) (OpReg dst)
else GLD format addr dst return (Any format code)
return (Any (if use_sse2 then format else FF80) code)
-- if we want a floating-point literal as an operand, we can -- if we want a floating-point literal as an operand, we can
...@@ -1538,19 +1524,9 @@ condIntCode' _ cond x y = do ...@@ -1538,19 +1524,9 @@ condIntCode' _ cond x y = do
condFltCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode condFltCode :: Cond -> CmmExpr -> CmmExpr -> NatM CondCode
condFltCode cond x y condFltCode cond x y
= if_sse2 condFltCode_sse2 condFltCode_x87 = condFltCode_sse2
where where
condFltCode_x87
= ASSERT(cond `elem` ([EQQ, NE, LE, LTT, GE, GTT])) do
(x_reg, x_code) <- getNonClobberedReg x
(y_reg, y_code) <- getSomeReg y
let
code = x_code `appOL` y_code `snocOL`
GCMP cond x_reg y_reg
-- The GCMP insn does the test and sets the zero flag if comparable
-- and true. Hence we always supply EQQ as the condition to test.
return (CondCode True EQQ code)
-- in the SSE2 comparison ops (ucomiss, ucomisd) the left arg may be -- in the SSE2 comparison ops (ucomiss, ucomisd) the left arg may be
-- an operand, but the right must be a reg. We can probably do better -- an operand, but the right must be a reg. We can probably do better
...@@ -1634,35 +1610,33 @@ assignReg_IntCode pk reg (CmmLoad src _) = do ...@@ -1634,35 +1610,33 @@ assignReg_IntCode pk reg (CmmLoad src _) = do
load_code <- intLoadCode (MOV pk) src load_code <- intLoadCode (MOV pk) src
dflags <- getDynFlags dflags <- getDynFlags
let platform = targetPlatform dflags let platform = targetPlatform dflags
return (load_code (getRegisterReg platform False{-no sse2-} reg)) return (load_code (getRegisterReg platform reg))
-- dst is a reg, but src could be anything -- dst is a reg, but src could be anything
assignReg_IntCode _ reg src = do assignReg_IntCode _ reg src = do
dflags <- getDynFlags dflags <- getDynFlags
let platform = targetPlatform dflags let platform = targetPlatform dflags
code <- getAnyReg src code <- getAnyReg src
return (code (getRegisterReg platform False{-no sse2-} reg)) return (code (getRegisterReg platform reg))
-- Floating point assignment to memory -- Floating point assignment to memory
assignMem_FltCode pk addr src = do assignMem_FltCode pk addr src = do
(src_reg, src_code) <- getNonClobberedReg src (src_reg, src_code) <- getNonClobberedReg src
Amode addr addr_code <- getAmode addr Amode addr addr_code <- getAmode addr
use_sse2 <- sse2Enabled
let let
code = src_code `appOL` code = src_code `appOL`
addr_code `snocOL` addr_code `snocOL`
if use_sse2 then MOV pk (OpReg src_reg) (OpAddr addr) MOV pk (OpReg src_reg) (OpAddr addr)
else GST pk src_reg addr
return code return code
-- Floating point assignment to a register/temporary -- Floating point assignment to a register/temporary
assignReg_FltCode _ reg src = do assignReg_FltCode _ reg src = do
use_sse2 <- sse2Enabled
src_code <- getAnyReg src src_code <- getAnyReg src
dflags <- getDynFlags dflags <- getDynFlags
let platform = targetPlatform dflags let platform = targetPlatform dflags
return (src_code (getRegisterReg platform use_sse2 reg)) return (src_code (getRegisterReg platform reg))
genJump :: CmmExpr{-the branch target-} -> [Reg] -> NatM InstrBlock genJump :: CmmExpr{-the branch target-} -> [Reg] -> NatM InstrBlock
...@@ -1945,7 +1919,7 @@ genCCall _ is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] _ = ...@@ -1945,7 +1919,7 @@ genCCall _ is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] _ =
genCCall dflags is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] _ = do genCCall dflags is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] _ = do
let platform = targetPlatform dflags let platform = targetPlatform dflags
let dst_r = getRegisterReg platform False (CmmLocal dst) let dst_r = getRegisterReg platform (CmmLocal dst)
case width of case width of
W64 | is32Bit -> do W64 | is32Bit -> do
ChildCode64 vcode rlo <- iselExpr64 src ChildCode64 vcode rlo <- iselExpr64 src
...@@ -1972,7 +1946,7 @@ genCCall dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst] ...@@ -1972,7 +1946,7 @@ genCCall dflags is32Bit (PrimTarget (MO_PopCnt width)) dest_regs@[dst]
if sse4_2 if sse4_2
then do code_src <- getAnyReg src then do code_src <- getAnyReg src
src_r <- getNewRegNat format src_r <- getNewRegNat format
let dst_r = getRegisterReg platform False (CmmLocal dst) let dst_r = getRegisterReg platform (CmmLocal dst)
return $ code_src src_r `appOL` return $ code_src src_r `appOL`
(if width == W8 then (if width == W8 then
-- The POPCNT instruction doesn't take a r/m8 -- The POPCNT instruction doesn't take a r/m8
...@@ -2004,7 +1978,7 @@ genCCall dflags is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst] ...@@ -2004,7 +1978,7 @@ genCCall dflags is32Bit (PrimTarget (MO_Pdep width)) dest_regs@[dst]
code_mask <- getAnyReg mask code_mask <- getAnyReg mask
src_r <- getNewRegNat format src_r <- getNewRegNat format
mask_r <- getNewRegNat format mask_r <- getNewRegNat format
let dst_r = getRegisterReg platform False (CmmLocal dst) let dst_r = getRegisterReg platform (CmmLocal dst)
return $ code_src src_r `appOL` code_mask mask_r `appOL` return $ code_src src_r `appOL` code_mask mask_r `appOL`
(if width == W8 then (if width == W8 then
-- The PDEP instruction doesn't take a r/m8 -- The PDEP instruction doesn't take a r/m8
...@@ -2037,7 +2011,7 @@ genCCall dflags is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst] ...@@ -2037,7 +2011,7 @@ genCCall dflags is32Bit (PrimTarget (MO_Pext width)) dest_regs@[dst]
code_mask <- getAnyReg mask code_mask <- getAnyReg mask
src_r <- getNewRegNat format src_r <- getNewRegNat format
mask_r <- getNewRegNat format mask_r <- getNewRegNat format
let dst_r = getRegisterReg platform False (CmmLocal dst) let dst_r = getRegisterReg platform (CmmLocal dst)
return $ code_src src_r `appOL` code_mask mask_r `appOL` return $ code_src src_r `appOL` code_mask mask_r `appOL`
(if width == W8 then (if width == W8 then
-- The PEXT instruction doesn't take a r/m8 -- The PEXT instruction doesn't take a r/m8
...@@ -2073,7 +2047,7 @@ genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] b ...@@ -2073,7 +2047,7 @@ genCCall dflags is32Bit (PrimTarget (MO_Clz width)) dest_regs@[dst] args@[src] b
| otherwise = do | otherwise = do
code_src <- getAnyReg src code_src <- getAnyReg src
let dst_r = getRegisterReg platform False (CmmLocal dst) let dst_r = getRegisterReg platform (CmmLocal dst)
if isBmi2Enabled dflags if isBmi2Enabled dflags
then do then do
src_r <- getNewRegNat (intFormat width) src_r <- getNewRegNat (intFormat width)
...@@ -2110,7 +2084,7 @@ genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid ...@@ -2110,7 +2084,7 @@ genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid
| is32Bit, width == W64 = do | is32Bit, width == W64 = do
ChildCode64 vcode rlo <- iselExpr64 src ChildCode64 vcode rlo <- iselExpr64 src
let rhi = getHiVRegFromLo rlo let rhi = getHiVRegFromLo rlo
dst_r = getRegisterReg platform False (CmmLocal dst) dst_r = getRegisterReg platform (CmmLocal dst)
lbl1 <- getBlockIdNat lbl1 <- getBlockIdNat
lbl2 <- getBlockIdNat lbl2 <- getBlockIdNat
let format = if width == W8 then II16 else intFormat width let format = if width == W8 then II16 else intFormat width
...@@ -2150,7 +2124,7 @@ genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid ...@@ -2150,7 +2124,7 @@ genCCall dflags is32Bit (PrimTarget (MO_Ctz width)) [dst] [src] bid
| otherwise = do | otherwise = do
code_src <- getAnyReg src code_src <- getAnyReg src
let dst_r = getRegisterReg platform False (CmmLocal dst) let dst_r = getRegisterReg platform (CmmLocal dst)
if isBmi2Enabled dflags if isBmi2Enabled dflags
then do then do
...@@ -2201,9 +2175,8 @@ genCCall dflags is32Bit (PrimTarget (MO_AtomicRMW width amop)) ...@@ -2201,9 +2175,8 @@ genCCall dflags is32Bit (PrimTarget (MO_AtomicRMW width amop))
else getSimpleAmode dflags is32Bit addr -- See genCCall for MO_Cmpxchg else getSimpleAmode dflags is32Bit addr -- See genCCall for MO_Cmpxchg
arg <- getNewRegNat format arg <- getNewRegNat format
arg_code <- getAnyReg n arg_code <- getAnyReg n
use_sse2 <- sse2Enabled
let platform = targetPlatform dflags let platform = targetPlatform dflags
dst_r = getRegisterReg platform use_sse2 (CmmLocal dst) dst_r = getRegisterReg platform (CmmLocal dst)
code <- op_code dst_r arg amode code <- op_code dst_r arg amode
return $ addr_code `appOL` arg_code arg `appOL` code return $ addr_code `appOL` arg_code arg `appOL` code
where where
...@@ -2260,8 +2233,8 @@ genCCall dflags is32Bit (PrimTarget (MO_AtomicRMW width amop)) ...@@ -2260,8 +2233,8 @@ genCCall dflags is32Bit (PrimTarget (MO_AtomicRMW width amop))
genCCall dflags _ (PrimTarget (MO_AtomicRead width)) [dst] [addr] _ = do genCCall dflags _ (PrimTarget (MO_AtomicRead width)) [dst] [addr] _ = do
load_code <- intLoadCode (MOV (intFormat width)) addr load_code <- intLoadCode (MOV (intFormat width)) addr
let platform = targetPlatform dflags let platform = targetPlatform dflags
use_sse2 <- sse2Enabled
return (load_code (getRegisterReg platform use_sse2 (CmmLocal dst))) return (load_code (getRegisterReg platform (CmmLocal dst)))
genCCall _ _ (PrimTarget (MO_AtomicWrite width)) [] [addr, val] _ = do genCCall _ _ (PrimTarget (MO_AtomicWrite width)) [] [addr, val] _ = do
code <- assignMem_IntCode (intFormat width) addr val code <- assignMem_IntCode (intFormat width) addr val
...@@ -2276,9 +2249,8 @@ genCCall dflags is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _ ...@@ -2276,9 +2249,8 @@ genCCall dflags is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _
newval_code <- getAnyReg new newval_code <- getAnyReg new
oldval <- getNewRegNat format oldval <- getNewRegNat format
oldval_code <- getAnyReg old oldval_code <- getAnyReg old
use_sse2 <- sse2Enabled
let platform = targetPlatform dflags let platform = targetPlatform dflags
dst_r = getRegisterReg platform use_sse2 (CmmLocal dst) dst_r = getRegisterReg platform (CmmLocal dst)
code = toOL code = toOL
[ MOV format (OpReg oldval) (OpReg eax) [ MOV format (OpReg oldval) (OpReg eax)
, LOCK (CMPXCHG format (OpReg newval) (OpAddr amode)) , LOCK (CMPXCHG format (OpReg newval) (OpAddr amode))
...@@ -2292,14 +2264,12 @@ genCCall dflags is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _ ...@@ -2292,14 +2264,12 @@ genCCall dflags is32Bit (PrimTarget (MO_Cmpxchg width)) [dst] [addr, old, new] _
genCCall _ is32Bit target dest_regs args bid = do genCCall _ is32Bit target dest_regs args bid = do
dflags <- getDynFlags dflags <- getDynFlags
let platform = targetPlatform dflags let platform = targetPlatform dflags
sse2 = isSse2Enabled dflags
case (target, dest_regs) of case (target, dest_regs) of
-- void return type prim op -- void return type prim op
(PrimTarget op, []) -> (PrimTarget op, []) ->
outOfLineCmmOp bid op Nothing args outOfLineCmmOp bid op Nothing args
-- we only cope with a single result for foreign calls -- we only cope with a single result for foreign calls
(PrimTarget op, [r]) (PrimTarget op, [r]) -> case op of
| sse2 -> case op of
MO_F32_Fabs -> case args of MO_F32_Fabs -> case args of
[x] -> sse2FabsCode W32 x [x] -> sse2FabsCode W32 x
_ -> panic "genCCall: Wrong number of arguments for fabs" _ -> panic "genCCall: Wrong number of arguments for fabs"
...@@ -2310,36 +2280,16 @@ genCCall _ is32Bit target dest_regs args bid = do ...@@ -2310,36 +2280,16 @@ genCCall _ is32Bit target dest_regs args bid = do
MO_F32_Sqrt -> actuallyInlineSSE2Op (\fmt r -> SQRT fmt (OpReg r)) FF32 args MO_F32_Sqrt -> actuallyInlineSSE2Op (\fmt r -> SQRT fmt (OpReg r)) FF32 args
MO_F64_Sqrt -> actuallyInlineSSE2Op (\fmt r -> SQRT fmt (OpReg r)) FF64 args MO_F64_Sqrt -> actuallyInlineSSE2Op (\fmt r -> SQRT fmt (OpReg r)) FF64 args
_other_op -> outOfLineCmmOp bid op (Just r) args _other_op -> outOfLineCmmOp bid op (Just r) args
| otherwise -> do
l1 <- getNewLabelNat
l2 <- getNewLabelNat
if sse2
then outOfLineCmmOp bid op (Just r) args
else case op of
MO_F32_Sqrt -> actuallyInlineFloatOp GSQRT FF32 args
MO_F64_Sqrt -> actuallyInlineFloatOp GSQRT FF64 args
MO_F32_Sin -> actuallyInlineFloatOp (\s -> GSIN s l1 l2) FF32 args
MO_F64_Sin -> actuallyInlineFloatOp (\s -> GSIN s l1 l2) FF64 args
MO_F32_Cos -> actuallyInlineFloatOp (\s -> GCOS s l1 l2) FF32 args
MO_F64_Cos -> actuallyInlineFloatOp (\s -> GCOS s l1 l2) FF64 args
MO_F32_Tan -> actuallyInlineFloatOp (\s -> GTAN s l1 l2) FF32 args
MO_F64_Tan -> actuallyInlineFloatOp (\s -> GTAN s l1 l2) FF64 args
_other_op -> outOfLineCmmOp bid op (Just r) args
where where
actuallyInlineFloatOp = actuallyInlineFloatOp' False actuallyInlineSSE2Op = actuallyInlineFloatOp'
actuallyInlineSSE2Op = actuallyInlineFloatOp' True
actuallyInlineFloatOp' usesSSE instr format [x] actuallyInlineFloatOp' instr format [x]
= do res <- trivialUFCode format (instr format) x = do res <- trivialUFCode format (instr format) x
any <- anyReg res any <- anyReg res
return (any (getRegisterReg platform usesSSE (CmmLocal r))) return (any (getRegisterReg platform (CmmLocal r)))
actuallyInlineFloatOp' _ _ _ args actuallyInlineFloatOp' _ _ args
= panic $ "genCCall.actuallyInlineFloatOp': bad number of arguments! (" = panic $ "genCCall.actuallyInlineFloatOp': bad number of arguments! ("
++ show (length args) ++ ")" ++ show (length args) ++ ")"
...@@ -2358,7 +2308,7 @@ genCCall _ is32Bit target dest_regs args bid = do ...@@ -2358,7 +2308,7 @@ genCCall _ is32Bit target dest_regs args bid = do
AND fmt (OpReg tmp) (OpReg dst) AND fmt (OpReg tmp) (OpReg dst)
] ]
return $ code (getRegisterReg platform True (CmmLocal r)) return $ code (getRegisterReg platform (CmmLocal r))
(PrimTarget (MO_S_QuotRem width), _) -> divOp1 platform True width dest_regs args (PrimTarget (MO_S_QuotRem width), _) -> divOp1 platform True width dest_regs args
(PrimTarget (MO_U_QuotRem width), _) -> divOp1 platform False width dest_regs args (PrimTarget (MO_U_QuotRem width), _) -> divOp1 platform False width dest_regs args
...@@ -2370,8 +2320,8 @@ genCCall _ is32Bit target dest_regs args bid = do ...@@ -2370,8 +2320,8 @@ genCCall _ is32Bit target dest_regs args bid = do
let format = intFormat width let format = intFormat width
lCode <- anyReg =<< trivialCode width (ADD_CC format) lCode <- anyReg =<< trivialCode width (ADD_CC format)
(Just (ADD_CC format)) arg_x arg_y (Just (ADD_CC format)) arg_x arg_y
let reg_l = getRegisterReg platform True (CmmLocal res_l) let reg_l = getRegisterReg platform (CmmLocal res_l)
reg_h = getRegisterReg platform True (CmmLocal res_h) reg_h = getRegisterReg platform (CmmLocal res_h)
code = hCode reg_h `appOL` code = hCode reg_h `appOL`
lCode reg_l `snocOL` lCode reg_l `snocOL`
ADC format (OpImm (ImmInteger 0)) (OpReg reg_h) ADC format (OpImm (ImmInteger 0)) (OpReg reg_h)
...@@ -2391,8 +2341,8 @@ genCCall _ is32Bit target dest_regs args bid = do ...@@ -2391,8 +2341,8 @@ genCCall _ is32Bit target dest_regs args bid = do
do (y_reg, y_code) <- getRegOrMem arg_y do (y_reg, y_code) <- getRegOrMem arg_y
x_code <- getAnyReg arg_x x_code <- getAnyReg arg_x
let format = intFormat width let format = intFormat width
reg_h = getRegisterReg platform True (CmmLocal res_h) reg_h = getRegisterReg platform (CmmLocal res_h)
reg_l = getRegisterReg platform True (CmmLocal res_l) reg_l = getRegisterReg platform (CmmLocal res_l)
code = y_code `appOL` code = y_code `appOL`
x_code rax `appOL` x_code rax `appOL`
toOL [MUL2 format y_reg, toOL [MUL2 format y_reg,
...@@ -2428,8 +2378,8 @@ genCCall _ is32Bit target dest_regs args bid = do ...@@ -2428,8 +2378,8 @@ genCCall _ is32Bit target dest_regs args bid = do
divOp platform signed width [res_q, res_r] divOp platform signed width [res_q, res_r]
m_arg_x_high arg_x_low arg_y m_arg_x_high arg_x_low arg_y
= do let format = intFormat width = do let format = intFormat width
reg_q = getRegisterReg platform True (CmmLocal res_q) reg_q = getRegisterReg platform (CmmLocal res_q)
reg_r = getRegisterReg platform True (CmmLocal res_r) reg_r = getRegisterReg platform (CmmLocal res_r)
widen | signed = CLTD format widen | signed = CLTD format
| otherwise = XOR format (OpReg rdx) (OpReg rdx) | otherwise = XOR format (OpReg rdx) (OpReg rdx)
instr | signed = IDIV instr | signed = IDIV
...@@ -2456,8 +2406,8 @@ genCCall _ is32Bit target dest_regs args bid = do ...@@ -2456,8 +2406,8 @@ genCCall _ is32Bit target dest_regs args bid = do
rCode <- anyReg =<< trivialCode width (instr format) rCode <- anyReg =<< trivialCode width (instr format)
(mrevinstr format) arg_x arg_y (mrevinstr format) arg_x arg_y
reg_tmp <- getNewRegNat II8 reg_tmp <- getNewRegNat II8
let reg_c = getRegisterReg platform True (CmmLocal res_c) let reg_c = getRegisterReg platform (CmmLocal res_c)
reg_r = getRegisterReg platform True (CmmLocal res_r) reg_r = getRegisterReg platform (CmmLocal res_r)
code = rCode reg_r `snocOL` code = rCode reg_r `snocOL`
SETCC cond (OpReg reg_tmp) `snocOL` SETCC cond (OpReg reg_tmp) `snocOL`
MOVZxL II8 (OpReg reg_tmp) (OpReg reg_c) MOVZxL II8 (OpReg reg_tmp) (OpReg reg_c)
...@@ -2501,8 +2451,7 @@ genCCall32' dflags target dest_regs args = do ...@@ -2501,8 +2451,7 @@ genCCall32' dflags target dest_regs args = do
delta0 <- getDeltaNat delta0 <- getDeltaNat
setDeltaNat (delta0 - arg_pad_size) setDeltaNat (delta0 - arg_pad_size)
use_sse2 <- sse2Enabled push_codes <- mapM push_arg (reverse prom_args)
push_codes <- mapM (push_arg use_sse2) (reverse prom_args)
delta <- getDeltaNat delta <- getDeltaNat
MASSERT(delta == delta0 - tot_arg_size) MASSERT(delta == delta0 - tot_arg_size)
...@@ -2555,18 +2504,21 @@ genCCall32' dflags target dest_regs args = do ...@@ -2555,18 +2504,21 @@ genCCall32' dflags target dest_regs args = do
assign_code [] = nilOL assign_code [] = nilOL
assign_code [dest] assign_code [dest]
| isFloatType ty = | isFloatType ty =
if use_sse2 -- we assume SSE2
then let tmp_amode = AddrBaseIndex (EABaseReg esp) let tmp_amode = AddrBaseIndex (EABaseReg esp)
EAIndexNone EAIndexNone
(ImmInt 0) (ImmInt 0)
fmt = floatFormat w fmt = floatFormat w
in toOL [ SUB II32 (OpImm (ImmInt b)) (OpReg esp), in toOL [ SUB II32 (OpImm (ImmInt b)) (OpReg esp),
DELTA (delta0 - b), DELTA (delta0 - b),
GST fmt fake0 tmp_amode, X87Store fmt tmp_amode,
-- X87Store only supported for the CDECL ABI
-- NB: This code will need to be
-- revisted once GHC does more work around
-- SIGFPE f
MOV fmt (OpAddr tmp_amode) (OpReg r_dest), MOV fmt (OpAddr tmp_amode) (OpReg r_dest),
ADD II32 (OpImm (ImmInt b)) (OpReg esp), ADD II32 (OpImm (ImmInt b)) (OpReg esp),
DELTA delta0] DELTA delta0]
else unitOL (GMOV fake0 r_dest)
| isWord64 ty = toOL [MOV II32 (OpReg eax) (OpReg r_dest), | isWord64 ty = toOL [MOV II32 (OpReg eax) (OpReg r_dest),
MOV II32 (OpReg edx) (OpReg r_dest_hi)] MOV II32 (OpReg edx) (OpReg r_dest_hi)]
| otherwise = unitOL (MOV (intFormat w) | otherwise = unitOL (MOV (intFormat w)
...@@ -2577,7 +2529,7 @@ genCCall32' dflags target dest_regs args = do ...@@ -2577,7 +2529,7 @@ genCCall32' dflags target dest_regs args = do
w = typeWidth ty w = typeWidth ty
b = widthInBytes w b = widthInBytes w
r_dest_hi = getHiVRegFromLo r_dest r_dest_hi = getHiVRegFromLo r_dest
r_dest = getRegisterReg platform use_sse2 (CmmLocal dest) r_dest = getRegisterReg platform (CmmLocal dest)
assign_code many = pprPanic "genCCall.assign_code - too many return values:" (ppr many) assign_code many = pprPanic "genCCall.assign_code - too many return values:" (ppr many)
return (push_code `appOL` return (push_code `appOL`
...@@ -2592,10 +2544,10 @@ genCCall32' dflags target dest_regs args = do ...@@ -2592,10 +2544,10 @@ genCCall32' dflags target dest_regs args = do
roundTo a x | x `mod` a == 0 = x roundTo a x | x `mod` a == 0 = x
| otherwise = x + a - (x `mod` a) | otherwise = x + a - (x `mod` a)
push_arg :: Bool -> CmmActual {-current argument-} push_arg :: CmmActual {-current argument-}
-> NatM InstrBlock -- code -> NatM InstrBlock -- code
push_arg use_sse2 arg -- we don't need the hints on x86 push_arg arg -- we don't need the hints on x86
| isWord64 arg_ty = do | isWord64 arg_ty = do
ChildCode64 code r_lo <- iselExpr64 arg ChildCode64 code r_lo <- iselExpr64 arg
delta <- getDeltaNat delta <- getDeltaNat
...@@ -2619,9 +2571,10 @@ genCCall32' dflags target dest_regs args = do ...@@ -2619,9 +2571,10 @@ genCCall32' dflags target dest_regs args = do
(ImmInt 0) (ImmInt 0)
format = floatFormat (typeWidth arg_ty) format = floatFormat (typeWidth arg_ty)
in in
if use_sse2
then MOV format (OpReg reg) (OpAddr addr) -- assume SSE2
else GST format reg addr MOV format (OpReg reg) (OpAddr addr)
] ]
) )
...@@ -2749,7 +2702,7 @@ genCCall64' dflags target dest_regs args = do ...@@ -2749,7 +2702,7 @@ genCCall64' dflags target dest_regs args = do
_ -> unitOL (MOV (cmmTypeFormat rep) (OpReg rax) (OpReg r_dest)) _ -> unitOL (MOV (cmmTypeFormat rep) (OpReg rax) (OpReg r_dest))
where where
rep = localRegType dest rep = localRegType dest
r_dest = getRegisterReg platform True (CmmLocal dest) r_dest = getRegisterReg platform (CmmLocal dest)
assign_code _many = panic "genCCall.assign_code many" assign_code _many = panic "genCCall.assign_code many"
return (adjust_rsp `appOL` return (adjust_rsp `appOL`
...@@ -3162,17 +3115,9 @@ condIntReg cond x y = do ...@@ -3162,17 +3115,9 @@ condIntReg cond x y = do
-- and plays better with the uOP cache. -- and plays better with the uOP cache.
condFltReg :: Bool -> Cond -> CmmExpr -> CmmExpr -> NatM Register condFltReg :: Bool -> Cond -> CmmExpr -> CmmExpr -> NatM Register
condFltReg is32Bit cond x y = if_sse2 condFltReg_sse2 condFltReg_x87 condFltReg is32Bit cond x y = condFltReg_sse2
where where
condFltReg_x87 = do
CondCode _ cond cond_code <- condFltCode cond x y
tmp <- getNewRegNat II8
let
code dst = cond_code `appOL` toOL [
SETCC cond (OpReg tmp),
MOVZxL II8 (OpReg tmp) (OpReg dst)
]
return (Any II32 code)
condFltReg_sse2 = do condFltReg_sse2 = do
CondCode _ cond cond_code <- condFltCode cond x y CondCode _ cond cond_code <- condFltCode cond x y
...@@ -3336,18 +3281,6 @@ trivialUCode rep instr x = do ...@@ -3336,18 +3281,6 @@ trivialUCode rep instr x = do
----------- -----------
trivialFCode_x87 :: (Format -> Reg -> Reg -> Reg -> Instr)
-> CmmExpr -> CmmExpr -> NatM Register
trivialFCode_x87 instr x y = do
(x_reg, x_code) <- getNonClobberedReg x -- these work for float regs too
(y_reg, y_code) <- getSomeReg y
let
format = FF80 -- always, on x87
code dst =
x_code `appOL`
y_code `snocOL`
instr format x_reg y_reg dst
return (Any format code)
trivialFCode_sse2 :: Width -> (Format -> Operand -> Operand -> Instr) trivialFCode_sse2 :: Width -> (Format -> Operand -> Operand -> Instr)
-> CmmExpr -> CmmExpr -> NatM Register -> CmmExpr -> CmmExpr -> NatM Register
...@@ -3368,17 +3301,8 @@ trivialUFCode format instr x = do ...@@ -3368,17 +3301,8 @@ trivialUFCode format instr x = do
-------------------------------------------------------------------------------- --------------------------------------------------------------------------------
coerceInt2FP :: Width -> Width -> CmmExpr -> NatM Register coerceInt2FP :: Width -> Width -> CmmExpr -> NatM Register
coerceInt2FP from to x = if_sse2 coerce_sse2 coerce_x87 coerceInt2FP from to x = coerce_sse2
where where
coerce_x87 = do
(x_reg, x_code) <- getSomeReg x
let
opc = case to of W32 -> GITOF; W64 -> GITOD;
n -> panic $ "coerceInt2FP.x87: unhandled width ("
++ show n ++ ")"
code dst = x_code `snocOL` opc x_reg dst
-- ToDo: works for non-II32 reps?
return (Any FF80 code)
coerce_sse2 = do coerce_sse2 = do
(x_op, x_code) <- getOperand x -- ToDo: could be a safe operand (x_op, x_code) <- getOperand x -- ToDo: could be a safe operand
...@@ -3392,18 +3316,8 @@ coerceInt2FP from to x = if_sse2 coerce_sse2 coerce_x87 ...@@ -3392,18 +3316,8 @@ coerceInt2FP from to x = if_sse2 coerce_sse2 coerce_x87
-------------------------------------------------------------------------------- --------------------------------------------------------------------------------
coerceFP2Int :: Width -> Width -> CmmExpr -> NatM Register coerceFP2Int :: Width -> Width -> CmmExpr -> NatM Register
coerceFP2Int from to x = if_sse2 coerceFP2Int_sse2 coerceFP2Int_x87 coerceFP2Int from to x = coerceFP2Int_sse2
where where
coerceFP2Int_x87 = do
(x_reg, x_code) <- getSomeReg x
let
opc = case from of W32 -> GFTOI; W64 -> GDTOI
n -> panic $ "coerceFP2Int.x87: unhandled width ("
++ show n ++ ")"
code dst = x_code `snocOL` opc x_reg dst
-- ToDo: works for non-II32 reps?
return (Any (intFormat to) code)
coerceFP2Int_sse2 = do coerceFP2Int_sse2 = do
(x_op, x_code) <- getOperand x -- ToDo: could be a safe operand (x_op, x_code) <- getOperand x -- ToDo: could be a safe operand
let let
...@@ -3418,15 +3332,13 @@ coerceFP2Int from to x = if_sse2 coerceFP2Int_sse2 coerceFP2Int_x87 ...@@ -3418,15 +3332,13 @@ coerceFP2Int from to x = if_sse2 coerceFP2Int_sse2 coerceFP2Int_x87
-------------------------------------------------------------------------------- --------------------------------------------------------------------------------
coerceFP2FP :: Width -> CmmExpr -> NatM Register coerceFP2FP :: Width -> CmmExpr -> NatM Register
coerceFP2FP to x = do coerceFP2FP to x = do
use_sse2 <- sse2Enabled
(x_reg, x_code) <- getSomeReg x (x_reg, x_code) <- getSomeReg x
let let
opc | use_sse2 = case to of W32 -> CVTSD2SS; W64 -> CVTSS2SD; opc = case to of W32 -> CVTSD2SS; W64 -> CVTSS2SD;
n -> panic $ "coerceFP2FP: unhandled width (" n -> panic $ "coerceFP2FP: unhandled width ("
++ show n ++ ")" ++ show n ++ ")"
| otherwise = GDTOF
code dst = x_code `snocOL` opc x_reg dst code dst = x_code `snocOL` opc x_reg dst
return (Any (if use_sse2 then floatFormat to else FF80) code) return (Any ( floatFormat to) code)
-------------------------------------------------------------------------------- --------------------------------------------------------------------------------
...@@ -3443,7 +3355,7 @@ sse2NegCode w x = do ...@@ -3443,7 +3355,7 @@ sse2NegCode w x = do
x@II16 -> wrongFmt x x@II16 -> wrongFmt x
x@II32 -> wrongFmt x x@II32 -> wrongFmt x
x@II64 -> wrongFmt x x@II64 -> wrongFmt x
x@FF80 -> wrongFmt x
where where
wrongFmt x = panic $ "sse2NegCode: " ++ show x wrongFmt x = panic $ "sse2NegCode: " ++ show x
Amode amode amode_code <- memConstant (mkAlignment $ widthInBytes w) const Amode amode amode_code <- memConstant (mkAlignment $ widthInBytes w) const
......