diff --git a/compiler/GHC/Cmm/MachOp.hs b/compiler/GHC/Cmm/MachOp.hs
index 508359e40ab72966d55fe8f671936bf095e15034..09e5dc8d97b2e491636dca2191274bcfca0b9677 100644
--- a/compiler/GHC/Cmm/MachOp.hs
+++ b/compiler/GHC/Cmm/MachOp.hs
@@ -140,8 +140,8 @@ data MachOp
 
   -- Conversions.  Some of these will be NOPs.
   -- Floating-point conversions use the signed variant.
-  | MO_SF_Conv Width Width      -- Signed int -> Float
-  | MO_FS_Conv Width Width      -- Float -> Signed int
+  | MO_SF_Round    Width Width  -- Signed int -> Float
+  | MO_FS_Truncate Width Width  -- Float -> Signed int
   | MO_SS_Conv Width Width      -- Signed int -> Signed int
   | MO_UU_Conv Width Width      -- unsigned int -> unsigned int
   | MO_XX_Conv Width Width      -- int -> int; puts no requirements on the
@@ -152,7 +152,10 @@ data MachOp
                                 -- MO_XX_Conv, e.g.,
                                 --   MO_XX_CONV W64 W8 (MO_XX_CONV W8 W64 x)
                                 -- is equivalent to just x.
-  | MO_FF_Conv Width Width      -- Float -> Float
+  | MO_FF_Conv Width Width      -- Float  -> Float
+
+  | MO_WF_Bitcast Width      -- Word32/Word64   -> Float/Double
+  | MO_FW_Bitcast Width      -- Float/Double  -> Word32/Word64
 
   -- Vector element insertion and extraction operations
   | MO_V_Insert  Length Width   -- Insert scalar into vector
@@ -476,9 +479,11 @@ machOpResultType platform mop tys =
     MO_SS_Conv _ to     -> cmmBits to
     MO_UU_Conv _ to     -> cmmBits to
     MO_XX_Conv _ to     -> cmmBits to
-    MO_FS_Conv _ to     -> cmmBits to
-    MO_SF_Conv _ to     -> cmmFloat to
+    MO_FS_Truncate _ to -> cmmBits to
+    MO_SF_Round _ to    -> cmmFloat to
     MO_FF_Conv _ to     -> cmmFloat to
+    MO_WF_Bitcast   w   -> cmmFloat w
+    MO_FW_Bitcast   w   -> cmmBits w
 
     MO_V_Insert  l w    -> cmmVec l (cmmBits w)
     MO_V_Extract _ w    -> cmmBits w
@@ -568,12 +573,14 @@ machOpArgReps platform op =
     MO_U_Shr r          -> [r, wordWidth platform]
     MO_S_Shr r          -> [r, wordWidth platform]
 
-    MO_SS_Conv from _   -> [from]
-    MO_UU_Conv from _   -> [from]
-    MO_XX_Conv from _   -> [from]
-    MO_SF_Conv from _   -> [from]
-    MO_FS_Conv from _   -> [from]
-    MO_FF_Conv from _   -> [from]
+    MO_SS_Conv from _     -> [from]
+    MO_UU_Conv from _     -> [from]
+    MO_XX_Conv from _     -> [from]
+    MO_SF_Round from _    -> [from]
+    MO_FS_Truncate from _ -> [from]
+    MO_FF_Conv from _     -> [from]
+    MO_WF_Bitcast w       -> [w]
+    MO_FW_Bitcast w       -> [w]
 
     MO_V_Insert   l r   -> [typeWidth (vec l (cmmBits r)),r, W32]
     MO_V_Extract  l r   -> [typeWidth (vec l (cmmBits r)), W32]
diff --git a/compiler/GHC/Cmm/Opt.hs b/compiler/GHC/Cmm/Opt.hs
index 2d0b54232d71d821c705ca3a5b892b07519fac2c..9012b7adaf41ff945c9705a2d45f22c188234b2c 100644
--- a/compiler/GHC/Cmm/Opt.hs
+++ b/compiler/GHC/Cmm/Opt.hs
@@ -89,7 +89,7 @@ cmmMachOpFoldM _ op [CmmLit (CmmInt x rep)]
         -- "from" type, in order to truncate to the correct size.
         -- The final narrow/widen to the destination type
         -- is implicit in the CmmLit.
-      MO_SF_Conv _from to -> CmmLit (CmmFloat (fromInteger x) to)
+      MO_SF_Round _frm to -> CmmLit (CmmFloat (fromInteger x) to)
       MO_SS_Conv  from to -> CmmLit (CmmInt (narrowS from x) to)
       MO_UU_Conv  from to -> CmmLit (CmmInt (narrowU from x) to)
       MO_XX_Conv  from to -> CmmLit (CmmInt (narrowS from x) to)
diff --git a/compiler/GHC/Cmm/Parser.y b/compiler/GHC/Cmm/Parser.y
index 18c34896cd052ba0f18d158a6884f690e4c314b7..365ede9f35dfda0d75314deb73a81908b98e438f 100644
--- a/compiler/GHC/Cmm/Parser.y
+++ b/compiler/GHC/Cmm/Parser.y
@@ -1078,12 +1078,12 @@ machOps = listToUFM $
 
         ( "f2f32",    flip MO_FF_Conv W32 ),  -- TODO; rounding mode
         ( "f2f64",    flip MO_FF_Conv W64 ),  -- TODO; rounding mode
-        ( "f2i8",     flip MO_FS_Conv W8 ),
-        ( "f2i16",    flip MO_FS_Conv W16 ),
-        ( "f2i32",    flip MO_FS_Conv W32 ),
-        ( "f2i64",    flip MO_FS_Conv W64 ),
-        ( "i2f32",    flip MO_SF_Conv W32 ),
-        ( "i2f64",    flip MO_SF_Conv W64 )
+        ( "f2i8",     flip MO_FS_Truncate W8 ),
+        ( "f2i16",    flip MO_FS_Truncate W16 ),
+        ( "f2i32",    flip MO_FS_Truncate W32 ),
+        ( "f2i64",    flip MO_FS_Truncate W64 ),
+        ( "i2f32",    flip MO_SF_Round W32 ),
+        ( "i2f64",    flip MO_SF_Round W64 )
         ]
 
 callishMachOps :: Platform -> UniqFM FastString ([CmmExpr] -> (CallishMachOp, [CmmExpr]))
diff --git a/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs b/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
index f2810b38b9fb7dd3a88abbc5e04e0b02fc0970b3..fa68fedfede54fa73e40fbbccd96bb9d2bf6e9ab 100644
--- a/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
+++ b/compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
@@ -755,8 +755,8 @@ getRegister' config plat expr
         MO_S_Neg w -> negate code w reg
         MO_F_Neg w -> return $ Any (floatFormat w) (\dst -> code `snocOL` NEG (OpReg w dst) (OpReg w reg))
 
-        MO_SF_Conv from to -> return $ Any (floatFormat to) (\dst -> code `snocOL` SCVTF (OpReg to dst) (OpReg from reg))  -- (Signed ConVerT Float)
-        MO_FS_Conv from to -> return $ Any (intFormat to) (\dst -> code `snocOL` FCVTZS (OpReg to dst) (OpReg from reg)) -- (float convert (-> zero) signed)
+        MO_SF_Round    from to -> return $ Any (floatFormat to) (\dst -> code `snocOL` SCVTF (OpReg to dst) (OpReg from reg))  -- (Signed ConVerT Float)
+        MO_FS_Truncate from to -> return $ Any (intFormat to) (\dst -> code `snocOL` FCVTZS (OpReg to dst) (OpReg from reg)) -- (float convert (-> zero) signed)
 
         -- TODO this is very hacky
         -- Note, UBFM and SBFM expect source and target register to be of the same size, so we'll use @max from to@
@@ -764,6 +764,8 @@ getRegister' config plat expr
         MO_UU_Conv from to -> return $ Any (intFormat to) (\dst -> code `snocOL` UBFM (OpReg (max from to) dst) (OpReg (max from to) reg) (OpImm (ImmInt 0)) (toImm (min from to)))
         MO_SS_Conv from to -> ss_conv from to reg code
         MO_FF_Conv from to -> return $ Any (floatFormat to) (\dst -> code `snocOL` FCVT (OpReg to dst) (OpReg from reg))
+        MO_WF_Bitcast w    -> return $ Any (floatFormat w)  (\dst -> code `snocOL` FMOV (OpReg w dst) (OpReg w reg))
+        MO_FW_Bitcast w    -> return $ Any (intFormat w)    (\dst -> code `snocOL` FMOV (OpReg w dst) (OpReg w reg))
 
         -- Conversions
         MO_XX_Conv _from to -> swizzleRegisterRep (intFormat to) <$> getRegister e
diff --git a/compiler/GHC/CmmToAsm/AArch64/Instr.hs b/compiler/GHC/CmmToAsm/AArch64/Instr.hs
index 6d4dad4f774b3a0e90a7bfee14e29b06fdc35e55..8836ab7d9f7fa05123c65a33c246ca709e0927a9 100644
--- a/compiler/GHC/CmmToAsm/AArch64/Instr.hs
+++ b/compiler/GHC/CmmToAsm/AArch64/Instr.hs
@@ -132,6 +132,7 @@ regUsageOfInstr platform instr = case instr of
   DMBISH                   -> usage ([], [])
 
   -- 9. Floating Point Instructions --------------------------------------------
+  FMOV dst src             -> usage (regOp src, regOp dst)
   FCVT dst src             -> usage (regOp src, regOp dst)
   SCVTF dst src            -> usage (regOp src, regOp dst)
   FCVTZS dst src           -> usage (regOp src, regOp dst)
@@ -267,6 +268,7 @@ patchRegsOfInstr instr env = case instr of
     DMBISH         -> DMBISH
 
     -- 9. Floating Point Instructions ------------------------------------------
+    FMOV o1 o2     -> FMOV (patchOp o1) (patchOp o2)
     FCVT o1 o2     -> FCVT (patchOp o1) (patchOp o2)
     SCVTF o1 o2    -> SCVTF (patchOp o1) (patchOp o2)
     FCVTZS o1 o2   -> FCVTZS (patchOp o1) (patchOp o2)
@@ -623,6 +625,8 @@ data Instr
     -- 8. Synchronization Instructions -----------------------------------------
     | DMBISH
     -- 9. Floating Point Instructions
+    -- move to/from general purpose <-> floating, or floating to floating
+    | FMOV Operand Operand
     -- Float ConVerT
     | FCVT Operand Operand
     -- Signed ConVerT Float
@@ -694,6 +698,7 @@ instrCon i =
       BL{} -> "BL"
       BCOND{} -> "BCOND"
       DMBISH{} -> "DMBISH"
+      FMOV{} -> "FMOV"
       FCVT{} -> "FCVT"
       SCVTF{} -> "SCVTF"
       FCVTZS{} -> "FCVTZS"
diff --git a/compiler/GHC/CmmToAsm/AArch64/Ppr.hs b/compiler/GHC/CmmToAsm/AArch64/Ppr.hs
index dd8dcf574d7a5572be15d2e737fa380df1316758..8c5d08167bfc663fcad575537a7257554d8d1780 100644
--- a/compiler/GHC/CmmToAsm/AArch64/Ppr.hs
+++ b/compiler/GHC/CmmToAsm/AArch64/Ppr.hs
@@ -524,6 +524,7 @@ pprInstr platform instr = case instr of
   DMBISH -> line $ text "\tdmb ish"
 
   -- 9. Floating Point Instructions --------------------------------------------
+  FMOV o1 o2 -> op2 (text "\tfmov") o1 o2
   FCVT o1 o2 -> op2 (text "\tfcvt") o1 o2
   SCVTF o1 o2 -> op2 (text "\tscvtf") o1 o2
   FCVTZS o1 o2 -> op2 (text "\tfcvtzs") o1 o2
diff --git a/compiler/GHC/CmmToAsm/PPC/CodeGen.hs b/compiler/GHC/CmmToAsm/PPC/CodeGen.hs
index b831db1e1900b050a30e554e7dd67ac90911072c..c95f62d37e86ed86f242c01aa2c1759d8682ec1d 100644
--- a/compiler/GHC/CmmToAsm/PPC/CodeGen.hs
+++ b/compiler/GHC/CmmToAsm/PPC/CodeGen.hs
@@ -516,8 +516,8 @@ getRegister' config platform (CmmMachOp mop [x]) -- unary MachOps
       MO_FF_Conv W64 W32 -> trivialUCode  FF32 FRSP x
       MO_FF_Conv W32 W64 -> conversionNop FF64 x
 
-      MO_FS_Conv from to -> coerceFP2Int from to x
-      MO_SF_Conv from to -> coerceInt2FP from to x
+      MO_FS_Truncate from to -> coerceFP2Int from to x
+      MO_SF_Round    from to -> coerceInt2FP from to x
 
       MO_SS_Conv from to
         | from >= to -> conversionNop (intFormat to) x
diff --git a/compiler/GHC/CmmToAsm/Wasm/FromCmm.hs b/compiler/GHC/CmmToAsm/Wasm/FromCmm.hs
index 92e96ab83bf28224a3dd89c77ae1b633f769e8dc..778c6a916613714dc26210b832b9a37750d2673b 100644
--- a/compiler/GHC/CmmToAsm/Wasm/FromCmm.hs
+++ b/compiler/GHC/CmmToAsm/Wasm/FromCmm.hs
@@ -620,6 +620,49 @@ lower_MO_FF_Conv lbl W64 W32 [x] = do
         x_instr `WasmConcat` WasmF32DemoteF64
 lower_MO_FF_Conv _ _ _ _ = panic "lower_MO_FF_Conv: unreachable"
 
+
+-- | Lower a 'MO_WF_Bitcast' operation. Note that this is not a conversion,
+-- rather it reinterprets the data.
+lower_MO_WF_Bitcast ::
+  CLabel ->
+  Width ->
+  [CmmActual] ->
+  WasmCodeGenM w (SomeWasmExpr w)
+lower_MO_WF_Bitcast lbl W32 [x] = do
+  WasmExpr x_instr <- lower_CmmExpr_Typed lbl TagI32 x
+  pure $
+    SomeWasmExpr TagF32 $
+      WasmExpr  $
+        x_instr `WasmConcat` WasmReinterpret TagI32 TagF32
+lower_MO_WF_Bitcast lbl W64 [x] = do
+  WasmExpr x_instr <- lower_CmmExpr_Typed lbl TagI64 x
+  pure $
+    SomeWasmExpr TagF64 $
+      WasmExpr  $
+        x_instr `WasmConcat` WasmReinterpret TagI64 TagF64
+lower_MO_WF_Bitcast _ _ _ = panic "lower_MO_WF_Bitcast: unreachable"
+
+-- | Lower a 'MO_FW_Bitcast' operation. Note that this is not a conversion,
+-- rather it reinterprets the data.
+lower_MO_FW_Bitcast ::
+  CLabel ->
+  Width ->
+  [CmmActual] ->
+  WasmCodeGenM w (SomeWasmExpr w)
+lower_MO_FW_Bitcast lbl W32 [x] = do
+  WasmExpr x_instr <- lower_CmmExpr_Typed lbl TagF32 x
+  pure $
+    SomeWasmExpr TagI32 $
+      WasmExpr  $
+        x_instr `WasmConcat` WasmReinterpret TagF32 TagI32
+lower_MO_FW_Bitcast lbl W64 [x] = do
+  WasmExpr x_instr <- lower_CmmExpr_Typed lbl TagF64 x
+  pure $
+    SomeWasmExpr TagI64 $
+      WasmExpr  $
+        x_instr `WasmConcat` WasmReinterpret TagF64 TagI64
+lower_MO_FW_Bitcast _ _ _ = panic "lower_MO_FW_Bitcast: unreachable"
+
 -- | Lower a 'CmmMachOp'.
 lower_CmmMachOp ::
   CLabel ->
@@ -799,14 +842,14 @@ lower_CmmMachOp lbl (MO_Not w0) [x] =
 lower_CmmMachOp lbl (MO_Shl w0) xs = lower_MO_Shl lbl w0 xs
 lower_CmmMachOp lbl (MO_U_Shr w0) xs = lower_MO_U_Shr lbl w0 xs
 lower_CmmMachOp lbl (MO_S_Shr w0) xs = lower_MO_S_Shr lbl w0 xs
-lower_CmmMachOp lbl (MO_SF_Conv w0 w1) xs =
+lower_CmmMachOp lbl (MO_SF_Round w0 w1) xs =
   lower_MO_Un_Conv
     (WasmConvert Signed)
     lbl
     (cmmBits w0)
     (cmmFloat w1)
     xs
-lower_CmmMachOp lbl (MO_FS_Conv w0 w1) xs =
+lower_CmmMachOp lbl (MO_FS_Truncate w0 w1) xs =
   lower_MO_Un_Conv
     (WasmTruncSat Signed)
     lbl
@@ -817,6 +860,8 @@ lower_CmmMachOp lbl (MO_SS_Conv w0 w1) xs = lower_MO_SS_Conv lbl w0 w1 xs
 lower_CmmMachOp lbl (MO_UU_Conv w0 w1) xs = lower_MO_UU_Conv lbl w0 w1 xs
 lower_CmmMachOp lbl (MO_XX_Conv w0 w1) xs = lower_MO_UU_Conv lbl w0 w1 xs
 lower_CmmMachOp lbl (MO_FF_Conv w0 w1) xs = lower_MO_FF_Conv lbl w0 w1 xs
+lower_CmmMachOp lbl (MO_FW_Bitcast w) xs  = lower_MO_FW_Bitcast lbl w xs
+lower_CmmMachOp lbl (MO_WF_Bitcast w) xs  = lower_MO_WF_Bitcast lbl w xs
 lower_CmmMachOp _ mop _ =
   pprPanic "lower_CmmMachOp: unreachable" $
     vcat [ text "offending MachOp:" <+> pprMachOp mop ]
diff --git a/compiler/GHC/CmmToAsm/X86/CodeGen.hs b/compiler/GHC/CmmToAsm/X86/CodeGen.hs
index 103a5f639ea0fcda426a4a19cbb1329f101f5659..1409e9b469418e2f3bc970c81a304fb111d99d00 100644
--- a/compiler/GHC/CmmToAsm/X86/CodeGen.hs
+++ b/compiler/GHC/CmmToAsm/X86/CodeGen.hs
@@ -848,10 +848,8 @@ getRegister' _ is32Bit (CmmReg reg)
             do
                let
                  fmt = cmmTypeFormat (cmmRegType reg)
-                 format  = fmt
-               --
                platform <- ncgPlatform <$> getConfig
-               return (Fixed format
+               return (Fixed fmt
                              (getRegisterReg platform reg)
                              nilOL)
 
@@ -1004,6 +1002,11 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = -- unary MachOps
       MO_SS_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x
       MO_XX_Conv rep1 rep2 | rep1 == rep2 -> conversionNop (intFormat rep1) x
 
+      MO_FW_Bitcast W32 -> bitcast FF32 II32 x
+      MO_WF_Bitcast W32 -> bitcast II32 FF32 x
+      MO_FW_Bitcast W64 -> bitcast FF64 II64 x
+      MO_WF_Bitcast W64 -> bitcast II64 FF64 x
+
       -- widenings
       MO_UU_Conv W8  W32 -> integerExtend W8  W32 MOVZxL x
       MO_UU_Conv W16 W32 -> integerExtend W16 W32 MOVZxL x
@@ -1045,8 +1048,8 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = -- unary MachOps
 
       MO_FF_Conv W64 W32 -> coerceFP2FP W32 x
 
-      MO_FS_Conv from to -> coerceFP2Int from to x
-      MO_SF_Conv from to -> coerceInt2FP from to x
+      MO_FS_Truncate from to -> coerceFP2Int from to x
+      MO_SF_Round    from to -> coerceInt2FP from to x
 
       MO_V_Insert {}   -> needLlvm
       MO_V_Extract {}  -> needLlvm
@@ -1084,6 +1087,12 @@ getRegister' platform is32Bit (CmmMachOp mop [x]) = -- unary MachOps
                   instr (intFormat from) (OpReg reg) (OpReg dst)
             return (Any (intFormat to) code)
 
+        bitcast :: Format -> Format -> CmmExpr -> NatM Register
+        bitcast fmt rfmt expr =
+          do (src, e_code) <- getSomeReg expr
+             let code = \dst -> e_code `snocOL` (MOVD fmt (OpReg src) (OpReg dst))
+             return (Any rfmt code)
+
         toI8Reg :: Width -> CmmExpr -> NatM Register
         toI8Reg new_rep expr
             = do codefn <- getAnyReg expr
diff --git a/compiler/GHC/CmmToAsm/X86/Instr.hs b/compiler/GHC/CmmToAsm/X86/Instr.hs
index 8714fe488bdc0ff4ce4f229bfd26d10e9e372924..6bcb56f825c5a8d174da67ce8116e715b54431c1 100644
--- a/compiler/GHC/CmmToAsm/X86/Instr.hs
+++ b/compiler/GHC/CmmToAsm/X86/Instr.hs
@@ -202,6 +202,11 @@ data Instr
              -- operand is interpreted to be a 32-bit sign-extended value.
              -- True 64-bit operands need to be moved with @MOVABS@, which we
              -- currently don't use.
+        | MOVD   Format Operand Operand -- ^ MOVD/MOVQ SSE2 instructions
+                                        -- (bitcast between a general purpose
+                                        -- register and a float register).
+                                        -- Format is input format, output format is
+                                        -- calculated in Ppr.hs
         | CMOV   Cond Format Operand Reg
         | MOVZxL      Format Operand Operand
               -- ^ The format argument is the size of operand 1 (the number of bits we keep)
@@ -368,6 +373,7 @@ regUsageOfInstr :: Platform -> Instr -> RegUsage
 regUsageOfInstr platform instr
  = case instr of
     MOV    _ src dst    -> usageRW src dst
+    MOVD   _ src dst    -> usageRW src dst
     CMOV _ _ src dst    -> mkRU (use_R src [dst]) [dst]
     MOVZxL _ src dst    -> usageRW src dst
     MOVSxL _ src dst    -> usageRW src dst
@@ -549,6 +555,7 @@ patchRegsOfInstr :: Instr -> (Reg -> Reg) -> Instr
 patchRegsOfInstr instr env
  = case instr of
     MOV  fmt src dst     -> patch2 (MOV  fmt) src dst
+    MOVD fmt src dst     -> patch2 (MOVD fmt) src dst
     CMOV cc fmt src dst  -> CMOV cc fmt (patchOp src) (env dst)
     MOVZxL fmt src dst   -> patch2 (MOVZxL fmt) src dst
     MOVSxL fmt src dst   -> patch2 (MOVSxL fmt) src dst
diff --git a/compiler/GHC/CmmToAsm/X86/Ppr.hs b/compiler/GHC/CmmToAsm/X86/Ppr.hs
index 9d9df691a2718a7e538a30a378e1313099e430b6..333d2693f4443e5aaadfb800250db08d8bc726d1 100644
--- a/compiler/GHC/CmmToAsm/X86/Ppr.hs
+++ b/compiler/GHC/CmmToAsm/X86/Ppr.hs
@@ -591,6 +591,9 @@ pprInstr platform i = case i of
    CMOV cc format src dst
      -> pprCondOpReg (text "cmov") format cc src dst
 
+   MOVD format src dst
+     -> pprMovdOpOp (text "mov") format src dst
+
    MOVZxL II32 src dst
       -> pprFormatOpOp (text "mov") II32 src dst
         -- 32-to-64 bit zero extension on x86_64 is accomplished by a simple
@@ -980,6 +983,29 @@ pprInstr platform i = case i of
            pprOperand platform format op2
        ]
 
+   pprMovdOpOp :: Line doc -> Format -> Operand -> Operand -> doc
+   pprMovdOpOp name format op1 op2
+     = let instr = case format of
+             -- bitcasts to/from a general purpose register to a floating point
+             -- register require II32 or II64.
+             II32 -> text "d"
+             II64 -> text "q"
+             FF32 -> text "d"
+             FF64 -> text "q"
+             _    -> panic "X86.Ppr.pprMovdOpOp: improper format for movd/movq."
+           out_fmt = case format of
+             II32 -> FF32
+             II64 -> FF64
+             FF32 -> II32
+             FF64 -> II64
+             _    -> panic "X86.Ppr.pprMovdOpOp: improper format for movd/movq."
+       in line $ hcat [
+           char '\t' <> name <> instr <> space,
+           pprOperand platform format op1,
+           comma,
+           pprOperand platform out_fmt op2
+           ]
+
    pprFormatOpRegReg :: Line doc -> Format -> Operand -> Reg -> Reg -> doc
    pprFormatOpRegReg name format op1 op2 op3
      = line $ hcat [
diff --git a/compiler/GHC/CmmToC.hs b/compiler/GHC/CmmToC.hs
index 7881350f4b7ef379305f9ff79b432fa79010a591..a8e060085447cebaac242d05a5a7cf2ec4611a56 100644
--- a/compiler/GHC/CmmToC.hs
+++ b/compiler/GHC/CmmToC.hs
@@ -784,6 +784,28 @@ pprMachOp_for_C platform mop = case mop of
 -- We won't know to generate (void*) casts here, but maybe from
 -- context elsewhere
 
+-- bitcasts, in the C backend these are performed with __builtin_memcpy.
+-- See rts/include/stg/Prim.h
+
+        MO_FW_Bitcast W32 -> text "hs_bitcastfloat2word"
+        MO_FW_Bitcast W64 -> text "hs_bitcastdouble2word64"
+
+        MO_WF_Bitcast W32 -> text "hs_bitcastword2float"
+        MO_WF_Bitcast W64 -> text "hs_bitcastword642double"
+
+        MO_FW_Bitcast w -> pprTrace "offending mop:"
+                                (text "MO_FW_Bitcast")
+                                (panic $ "PprC.pprMachOp_for_C: MO_FW_Bitcast"
+                                      ++ " called with improper width!"
+                                      ++ show w)
+
+        MO_WF_Bitcast w -> pprTrace "offending mop:"
+                                (text "MO_WF_Bitcast")
+                                (panic $ "PprC.pprMachOp_for_C: MO_WF_Bitcast"
+                                      ++ " called with improper width!"
+                                      ++ show w)
+
+
 -- noop casts
         MO_UU_Conv from to | from == to -> empty
         MO_UU_Conv _from to -> parens (machRep_U_CType platform to)
@@ -797,8 +819,8 @@ pprMachOp_for_C platform mop = case mop of
         MO_FF_Conv from to | from == to -> empty
         MO_FF_Conv _from to -> parens (machRep_F_CType to)
 
-        MO_SF_Conv _from to -> parens (machRep_F_CType to)
-        MO_FS_Conv _from to -> parens (machRep_S_CType platform to)
+        MO_SF_Round    _from to -> parens (machRep_F_CType to)
+        MO_FS_Truncate _from to -> parens (machRep_S_CType platform to)
 
         MO_RelaxedRead _ -> pprTrace "offending mop:"
                                 (text "MO_RelaxedRead")
@@ -896,7 +918,7 @@ signedOp (MO_S_Gt   _)    = True
 signedOp (MO_S_Lt   _)    = True
 signedOp (MO_S_Shr  _)    = True
 signedOp (MO_SS_Conv _ _) = True
-signedOp (MO_SF_Conv _ _) = True
+signedOp (MO_SF_Round _ _) = True
 signedOp _                = False
 
 shiftOp :: MachOp -> Maybe Width
@@ -1447,7 +1469,6 @@ floatToWord32 r = CmmInt (toInteger (castFloatToWord32 (fromRational r))) W32
 doubleToWord64 :: Rational -> CmmLit
 doubleToWord64 r = CmmInt (toInteger (castDoubleToWord64 (fromRational r))) W64
 
-
 -- ---------------------------------------------------------------------------
 -- Utils
 
diff --git a/compiler/GHC/CmmToLlvm/CodeGen.hs b/compiler/GHC/CmmToLlvm/CodeGen.hs
index 2b84434f7edcd2fc22f005a8b5b03560ce2e1db9..44c8788c3feb3d3d2adbb564bf071f295b810a9c 100644
--- a/compiler/GHC/CmmToLlvm/CodeGen.hs
+++ b/compiler/GHC/CmmToLlvm/CodeGen.hs
@@ -1431,8 +1431,8 @@ genMachOp _ op [x] = case op of
         let all0 = LMLitVar $ LMFloatLit (-0) (widthToLlvmFloat w)
         in negate (widthToLlvmFloat w) all0 LM_MO_FSub
 
-    MO_SF_Conv _ w -> fiConv (widthToLlvmFloat w) LM_Sitofp
-    MO_FS_Conv _ w -> fiConv (widthToLlvmInt w) LM_Fptosi
+    MO_SF_Round    _ w -> fiConv (widthToLlvmFloat w) LM_Sitofp
+    MO_FS_Truncate _ w -> fiConv (widthToLlvmInt w) LM_Fptosi
 
     MO_SS_Conv from to
         -> sameConv from (widthToLlvmInt to) LM_Trunc LM_Sext
@@ -1446,6 +1446,9 @@ genMachOp _ op [x] = case op of
     MO_FF_Conv from to
         -> sameConv from (widthToLlvmFloat to) LM_Fptrunc LM_Fpext
 
+    MO_WF_Bitcast w -> fiConv (widthToLlvmFloat w) LM_Bitcast
+    MO_FW_Bitcast w -> fiConv (widthToLlvmInt w)   LM_Bitcast
+
     MO_VS_Neg len w ->
         let ty    = widthToLlvmInt w
             vecty = LMVector len ty
@@ -1704,13 +1707,16 @@ genMachOp_slow opt op [x, y] = case op of
     MO_S_Neg _     -> panicOp
     MO_F_Neg _     -> panicOp
 
-    MO_SF_Conv _ _ -> panicOp
-    MO_FS_Conv _ _ -> panicOp
+    MO_SF_Round    _ _ -> panicOp
+    MO_FS_Truncate _ _ -> panicOp
     MO_SS_Conv _ _ -> panicOp
     MO_UU_Conv _ _ -> panicOp
     MO_XX_Conv _ _ -> panicOp
     MO_FF_Conv _ _ -> panicOp
 
+    MO_WF_Bitcast _to ->  panicOp
+    MO_FW_Bitcast _to ->  panicOp
+
     MO_V_Insert  {} -> panicOp
 
     MO_VS_Neg {} -> panicOp
diff --git a/compiler/GHC/Platform.hs b/compiler/GHC/Platform.hs
index 273082cdb6af8b6843f768630977dcddaab15dee..8514e591d1807c87c66b32bf8bbe4d56a46dcf32 100644
--- a/compiler/GHC/Platform.hs
+++ b/compiler/GHC/Platform.hs
@@ -17,6 +17,7 @@ module GHC.Platform
    , ByteOrder(..)
    , target32Bit
    , isARM
+   , isPPC
    , osElfTarget
    , osMachOTarget
    , osSubsectionsViaSymbols
diff --git a/compiler/GHC/StgToCmm/Prim.hs b/compiler/GHC/StgToCmm/Prim.hs
index c19377de807eab3038e7968ee2a8835153f3b6de..0b5c3ef9f9f96268c39aaae078b72a9a8f0f6993 100644
--- a/compiler/GHC/StgToCmm/Prim.hs
+++ b/compiler/GHC/StgToCmm/Prim.hs
@@ -1543,15 +1543,24 @@ emitPrimOp cfg primop =
 
 -- Conversions
 
-  IntToDoubleOp   -> \args -> opTranslate args (MO_SF_Conv (wordWidth platform) W64)
-  DoubleToIntOp   -> \args -> opTranslate args (MO_FS_Conv W64 (wordWidth platform))
+  IntToDoubleOp   -> \args -> opTranslate args (MO_SF_Round (wordWidth platform) W64)
+  DoubleToIntOp   -> \args -> opTranslate args (MO_FS_Truncate W64 (wordWidth platform))
 
-  IntToFloatOp    -> \args -> opTranslate args (MO_SF_Conv (wordWidth platform) W32)
-  FloatToIntOp    -> \args -> opTranslate args (MO_FS_Conv W32 (wordWidth platform))
+  IntToFloatOp    -> \args -> opTranslate args (MO_SF_Round (wordWidth platform) W32)
+  FloatToIntOp    -> \args -> opTranslate args (MO_FS_Truncate W32 (wordWidth platform))
 
   FloatToDoubleOp -> \args -> opTranslate args (MO_FF_Conv W32 W64)
   DoubleToFloatOp -> \args -> opTranslate args (MO_FF_Conv W64 W32)
 
+  CastFloatToWord32Op  ->
+    \args -> translateBitcasts (MO_FW_Bitcast W32) args
+  CastWord32ToFloatOp  ->
+    \args -> translateBitcasts (MO_WF_Bitcast W32) args
+  CastDoubleToWord64Op ->
+    \args -> translateBitcasts (MO_FW_Bitcast W64) args
+  CastWord64ToDoubleOp ->
+    \args -> translateBitcasts (MO_WF_Bitcast W64) args
+
   IntQuotRemOp -> \args -> opCallishHandledLater args $
     if allowQuotRem
     then Left (MO_S_QuotRem  (wordWidth platform))
@@ -1746,10 +1755,6 @@ emitPrimOp cfg primop =
   TraceMarkerOp -> alwaysExternal
   SetThreadAllocationCounter -> alwaysExternal
   KeepAliveOp -> alwaysExternal
-  CastWord32ToFloatOp -> alwaysExternal
-  CastWord64ToDoubleOp -> alwaysExternal
-  CastDoubleToWord64Op -> alwaysExternal
-  CastFloatToWord32Op -> alwaysExternal
 
  where
   profile  = stgToCmmProfile  cfg
@@ -1834,6 +1839,14 @@ emitPrimOp cfg primop =
   allowInt2Mul  = stgToCmmAllowIntMul2Instr         cfg
   allowWord2Mul = stgToCmmAllowWordMul2Instr        cfg
 
+  -- a bit of a hack, for certain code generaters, e.g. PPC, and i386 we
+  -- continue to use the cmm versions of these functions instead of inline
+  -- assembly. Tracked in #24841.
+  ppc  = isPPC $ platformArch platform
+  i386 = target32Bit platform
+  translateBitcasts mop args | ppc || i386 = alwaysExternal args
+                             | otherwise   = opTranslate args mop
+
   allowFMA = stgToCmmAllowFMAInstr cfg
 
   fmaOp :: FMASign -> Width -> [CmmActual] -> PrimopCmmEmit
diff --git a/libraries/ghc-platform/src/GHC/Platform/ArchOS.hs b/libraries/ghc-platform/src/GHC/Platform/ArchOS.hs
index 3286337923e5d900def2dd047dea82136ae4e400..23a046c785a2560eff8dd6de782068d07576a874 100644
--- a/libraries/ghc-platform/src/GHC/Platform/ArchOS.hs
+++ b/libraries/ghc-platform/src/GHC/Platform/ArchOS.hs
@@ -10,7 +10,7 @@ module GHC.Platform.ArchOS
    , ArmISAExt(..)
    , ArmABI(..)
    , PPC_64ABI(..)
-   , isARM
+   , isARM, isPPC
    , stringEncodeArch
 
      -- * Operating systems
@@ -188,6 +188,11 @@ isARM (ArchARM {}) = True
 isARM ArchAArch64  = True
 isARM _ = False
 
+isPPC :: Arch -> Bool
+isPPC (ArchPPC_64 _) = True
+isPPC ArchPPC        = True
+isPPC _              = False
+
 -- | This predicate tells us whether the OS support Mach-O shared libraries.
 osMachOTarget :: OS -> Bool
 osMachOTarget OSDarwin = True
diff --git a/rts/include/stg/Prim.h b/rts/include/stg/Prim.h
index c4595cf4fd4d7852d6eb4a9cd663634372316fc2..ea8a91c23af70349f3030c72d259201cffa7a3f8 100644
--- a/rts/include/stg/Prim.h
+++ b/rts/include/stg/Prim.h
@@ -135,3 +135,30 @@ StgWord hs_ctz8(StgWord x);
 StgWord hs_ctz16(StgWord x);
 StgWord hs_ctz32(StgWord x);
 StgWord hs_ctz64(StgWord64 x);
+
+/* bitcasts, instead of creating a new C file we static inline these here. We
+ * use __builtin_memcpy instead of memcpy from string.h to avoid function
+ * prototype conflicts that occur in the C backend with the inclusion of
+ * string.h*/
+static inline StgFloat hs_bitcastword2float(StgWord32 x) {
+    StgFloat dest;
+    __builtin_memcpy(&dest, &x, sizeof(StgFloat));
+    return dest;
+}
+
+static inline StgDouble hs_bitcastword642double(StgWord64 x) {
+    StgDouble dest;
+    __builtin_memcpy(&dest, &x, sizeof(StgDouble));
+    return dest;
+}
+
+static inline StgWord32 hs_bitcastfloat2word(StgFloat x) {
+    StgWord32 dest;
+    __builtin_memcpy(&dest, &x, sizeof(StgWord32));
+    return dest;
+}
+static inline StgWord64 hs_bitcastdouble2word64(StgDouble x) {
+    StgWord64 dest;
+    __builtin_memcpy(&dest, &x, sizeof(StgWord64));
+    return dest;
+}