Compare revisions

Changes are shown as if the source revision was being merged into the target revision.
Commits on Source (28)
  • testsuite: extend size performance tests with gzip (fixes #25046) · eb1cb536
    Serge S. Gulin authored
    The main purpose is to create tests for the distribution-size metric of a minimal app (hello world and its variations, e.g. with unicode).
    
    Many platforms support distribution in compressed form via gzip, so it is useful to collect data on how large the executable bundle is on each platform in this minimal edge case.
    
    Two groups of tests are added:
    1. The JavaScript backend size tests are extended with gzip-enabled versions for all cases where an optimizing compiler is used (currently the Google Closure Compiler).
    2. Trivial hello-world tests with gzip-enabled versions are added for all other platforms in the CI pipeline, where no external optimizing compiler is used.
    eb1cb536
  • ghc-internal: @since for backtraceDesired · d94410f8
    Rodrigo Mesquita authored and Marge Bot committed
    Fixes point 1 in #25052
    d94410f8
  • ghc-internal: No trailing whitespace in exceptions · bfe600f5
    Rodrigo Mesquita authored and Marge Bot committed
    Fixes #25052
    bfe600f5
  • Add since annotation for -fkeep-auto-rules. · 62650d9f
    Andreas Klebinger authored and Marge Bot committed
    This partially addresses #25082.
    62650d9f
  • Mention `-fkeep-auto-rules` in release notes. · 5f0e23fd
    Andreas Klebinger authored and Marge Bot committed
    It was added earlier but hadn't appeared in any release notes yet.
    Partially addresses #25082.
    5f0e23fd
  • Cmm: don't perform unsound optimizations on 32-bit compiler hosts · 7446a09a
    Sylvain Henry authored and Marge Bot committed
    
    - beef6135 enabled the use of
      MO_Add/MO_Sub for 64-bit operations in the C and LLVM backends
    - 6755d833 did the same for the x86 NCG
      backend
    
    However, we store some literal values as `Int` in the compiler. As a
    result, some Cmm optimizations transformed target 64-bit literals into
    the compiler's `Int`. If the compiler is 32-bit, this leads to computing
    with wrong literals (see #24893 and #24700).
    
    This patch disables these Cmm optimizations for 32-bit compilers. This
    is unsatisfying (optimizations shouldn't be compiler-word-size
    dependent) but it fixes the bug and it makes the patch easy to backport.
    A proper fix would be much more invasive but it shall be implemented in
    the future.
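    For illustration, this is the guard the patch introduces (mirroring `validOffsetRep` in the diff below): the offset-folding rewrites are only applied when the literal width fits in the host compiler's `Int`. `Width` and `widthInBits` come from `GHC.Cmm.Type`; `finiteBitSize` is from `Data.Bits`.
    ```
    -- Sketch of the new guard: reject literals wider than the host Int,
    -- since register offsets are currently stored as Int.
    validOffsetRep :: Width -> Bool
    validOffsetRep rep = widthInBits rep <= finiteBitSize (undefined :: Int)
    ```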
    
    Co-authored-by: amesgen <amesgen@amesgen.de>
    7446a09a
  • docs: Update info on RequiredTypeArguments · d59faaf2
    Vladislav Zavialov authored and Marge Bot committed
    Add a section on "types in terms" that were implemented in 8b2f70a2
    and remove the now outdated suggestion of using `type` for them.
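    For context, a hedged sketch of the documented feature in the style of the user's guide example (names here are illustrative, not taken from the patch):
    ```
    {-# LANGUAGE RequiredTypeArguments #-}

    -- 'forall a ->' makes the type argument visible and required at call sites.
    vshow :: forall a -> Show a => a -> String
    vshow t x = show (x :: t)

    -- With "types in terms", the type is written directly in term syntax,
    -- without the previously suggested 'type' herald:
    s1 = vshow Int 42
    s2 = vshow (Maybe Bool) (Just True)
    ```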
    d59faaf2
  • JS: fix minor typo in base's jsbits · 39fd6714
    Sylvain Henry authored and Marge Bot committed
    39fd6714
  • RTS: remove hack to force old cabal to build a library with only JS sources · e7764575
    Sylvain Henry authored and Marge Bot committed
    We need to extend the JSC externs with Emscripten RTS definitions to avoid
    JSC_UNDEFINED_VARIABLE errors when linking without the emcc RTS.
    
    Fix #25138
    
    Some recompilation avoidance tests now fail. This is tracked with the
    other instances of this failure in #23013. My hunch is that they were
    working by chance when we used the emcc linker.
    
    Metric Decrease:
        T24602_perf_size
    e7764575
  • Support multiline strings in type literals (#25132) · d1a40233
    Brandon Chinn authored and Marge Bot committed
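    An illustrative sketch of the syntax involved, assuming the term-level `MultilineStrings` layout rules carry over unchanged to type-level `Symbol` literals (which is what this patch enables); the names are made up for the example:
    ```
    {-# LANGUAGE MultilineStrings, DataKinds #-}

    -- Term-level multiline string literal...
    usage :: String
    usage =
      """
      usage: prog [OPTIONS]
        -h  show this help
      """

    -- ...and, with this patch, the same syntax in a type-level Symbol literal.
    type Usage = """
      usage: prog [OPTIONS]
        -h  show this help
      """
    ```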
    d1a40233
  • JS: fix callback documentation (#24377) · 610840eb
    Sylvain Henry authored and Marge Bot committed
    Fix #24377
    610840eb
  • haddock: Build haddock-api and haddock-library using hadrian · 6ae4b76a
    Zubin authored and Marge Bot committed
    We build these two packages as regular boot library dependencies rather
    than using the `in-ghc-tree` flag to include the source files into the haddock
    executable.
    
    The `in-ghc-tree` flag is moved into haddock-api to ensure that haddock built
    from hackage can still find the location of the GHC bindist using `ghc-paths`.
    
    Addresses #24834
    
    This causes a metric decrease under non-release flavours because under these
    flavours libraries are compiled with optimisation but executables are not.
    
    Since we move the bulk of the code from the haddock executable to the
    haddock-api library, we see a metric decrease on the validate flavours.
    
    Metric Decrease:
        haddock.Cabal
        haddock.base
        haddock.compiler
    6ae4b76a
  • Add an extension field to HsRecFields · 51ffba5d
    Arnaud Spiwack authored and Marge Bot committed
    This is the Right Thing to Do™. And it prepares for storing a
    multiplicity coercion there.
    
    First step of the plan outlined here and below
    !12947 (comment 573091)
    51ffba5d
  • Add test for #24961 · 4d2faeeb
    Arnaud Spiwack authored and Marge Bot committed
    4d2faeeb
  • Ensures that omitted record fields in pattern have multiplicity Many · 623b4337
    Arnaud Spiwack authored and Marge Bot committed
    Omitted fields were simply ignored in the type checker and produced
    incorrect Core code.
    
    Fixes #24961
    
    Metric Increase:
        RecordUpdPerf
    623b4337
  • AARCH64 linker: skip NONE relocations · c749bdfd
    Sylvain Henry authored and Marge Bot committed
    This patch is part of the patches upstreamed from haskell.nix.
    See https://github.com/input-output-hk/haskell.nix/pull/1960 for the
    original report/patch.
    c749bdfd
  • Support multiline strings in TH · 682a6a41
    Brandon Chinn authored and Marge Bot committed
    682a6a41
  • The X86 SIMD patch. · f046a759
    sheaf authored
    This commit adds support for 128 bit wide SIMD vectors and vector
    operations to GHC's X86 native code generator.
    
    Main changes:
    
      - Introduction of vector formats (`GHC.CmmToAsm.Format`)
      - Introduction of 128-bit virtual register (`GHC.Platform.Reg`),
        and removal of unused Float virtual register.
      - Refactor of `GHC.Platform.Reg.Class.RegClass`: it now only contains
        two classes, `RcInteger` (for general purpose registers) and `RcFloatOrVector`
        (for registers that can be used for scalar floating point values as well
        as vectors).
      - Modify `GHC.CmmToAsm.X86.Instr.regUsageOfInstr` to keep track
        of which format each register is used at, so that the register
        allocator can know if it needs to spill the entire vector register
        or just the lower 64 bits.
      - Modify spill/load/reg-2-reg code to account for vector registers
        (`GHC.CmmToAsm.X86.Instr.{mkSpillInstr, mkLoadInstr, mkRegRegMoveInstr, takeRegRegMoveInstr}`).
      - Modify the register allocator code (`GHC.CmmToAsm.Reg.*`) to propagate
        the format we are storing in any given register, for instance changing
        `Reg` to `RegFormat` or `GlobalReg` to `GlobalRegUse`.
      - Add logic to lower vector `MachOp`s to X86 assembly
        (see `GHC.CmmToAsm.X86.CodeGen`)
      - Minor cleanups to genprimopcode, to remove the llvm_only attribute
        which is no longer applicable.
    
    Tests for this feature are provided in the "testsuite/tests/simd" directory.
    
    Fixes #7741
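    By way of example, a fragment like the following sketch (using the long-standing 128-bit SIMD primops exposed by GHC.Exts) can now be compiled by the X86 native code generator instead of requiring -fllvm:
    ```
    {-# LANGUAGE MagicHash, UnboxedTuples #-}
    import GHC.Exts

    -- Pack four floats into a FloatX4# and add a broadcast vector element-wise.
    addAll :: (# Float#, Float#, Float#, Float# #) -> Float# -> FloatX4#
    addAll quad y = plusFloatX4# (packFloatX4# quad) (broadcastFloatX4# y)
    ```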
    
    Keeping track of register formats adds a small memory overhead to the
    register allocator (in particular, regUsageOfInstr now allocates more
    to keep track of the `Format` each register is used at). This explains
    the following metric increases.
    
    -------------------------
    Metric Increase:
        T12707
        T13035
        T13379
        T3294
        T4801
        T5321FD
        T5321Fun
        T783
    -------------------------
    f046a759
  • Use xmm registers in genapply · fae71b33
    sheaf authored
    This commit updates genapply to use xmm, ymm and zmm registers, for
    stg_ap_v16/stg_ap_v32/stg_ap_v64, respectively.
    
    It also updates the Cmm lexer and parser to produce Cmm vectors rather
    than 128/256/512 bit wide scalars for V16/V32/V64, removing bits128,
    bits256 and bits512 in favour of vectors.
    
    The Cmm Lint check is weakened for vectors, as (in practice, e.g. on X86)
    it is okay to use a single vector register to hold multiple different
    types of data, and we don't know just from seeing e.g. "XMM1" how to
    interpret the 128 bits of data within.
    
    Fixes #25062
    fae71b33
  • Add vector fused multiply-add operations · 92b728cf
    sheaf authored
    This commit adds fused multiply add operations such as `fmaddDoubleX2#`.
    These are handled both in the X86 NCG and the LLVM backends.
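    A small sketch of what the new operations look like at the source level (the exact set of instantiations follows FLOAT_VECTOR_TYPES in the primop definitions below):
    ```
    {-# LANGUAGE MagicHash #-}
    import GHC.Exts

    -- x*y + z, element-wise, with a single rounding step per lane.
    fma2 :: DoubleX2# -> DoubleX2# -> DoubleX2# -> DoubleX2#
    fma2 x y z = fmaddDoubleX2# x y z
    ```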
    92b728cf
  • Add vector shuffle primops · f3386a59
    sheaf authored
    This adds vector shuffle primops, such as
    
    ```
    shuffleFloatX4# :: FloatX4# -> FloatX4# -> (# Int#, Int#, Int#, Int# #) -> FloatX4#
    ```
    
    which shuffle the components of the two input vectors into the output vector.
    
    NB: the indices must be compile time literals, to match the X86 SHUFPD
    instruction immediate and the LLVM shufflevector instruction.
    
    These are handled in the X86 NCG and the LLVM backend.
    
    Tested in simd009.
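    For example (a sketch; the indices select from the 8-element concatenation of the two inputs and must be compile-time literals, as noted above):
    ```
    {-# LANGUAGE MagicHash, UnboxedTuples #-}
    import GHC.Exts

    -- Interleave the low halves of two vectors: lanes 0, 4, 1, 5 of (u ++ v).
    interleaveLo :: FloatX4# -> FloatX4# -> FloatX4#
    interleaveLo u v = shuffleFloatX4# u v (# 0#, 4#, 1#, 5# #)
    ```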
    f3386a59
  • Add Broadcast MachOps · 3b3dfb92
    sheaf authored
    This adds proper MachOps for broadcast instructions, allowing us to
    produce better code for broadcasting a value than simply packing that
    value (doing many vector insertions in a row).
    
    These are lowered in the X86 NCG and LLVM backends. In the LLVM backend,
    it uses the previously introduced shuffle instructions.
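    As an illustration (a sketch, not part of the patch): both definitions below fill every lane with the same value, but the broadcast form can now be lowered to a single broadcast instruction rather than a pack built from repeated inserts.
    ```
    {-# LANGUAGE MagicHash, UnboxedTuples #-}
    import GHC.Exts

    allLanes, allLanesViaPack :: Float# -> FloatX4#
    allLanes        x = broadcastFloatX4# x
    allLanesViaPack x = packFloatX4# (# x, x, x, x #)
    ```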
    3b3dfb92
  • Fix treatment of signed zero in vector negation · aa5820fd
    sheaf authored
    This commit fixes the handling of signed zero in floating-point vector
    negation.
    
    A slight hack was introduced to work around the fact that Cmm doesn't
    currently have a notion of signed floating point literals
    (see get_float_broadcast_value_reg). This can be removed once CmmFloat
    can express the value -0.0.
    
    The simd006 test has been updated to use a stricter notion of equality
    of floating-point values, which ensures the validity of this change.
    aa5820fd
  • Add min/max primops · cec5908c
    sheaf authored
    This commit adds min/max primops, such as
    
      minDouble# :: Double# -> Double# -> Double#
      minFloatX4# :: FloatX4# -> FloatX4# -> FloatX4#
      minWord16X8# :: Word16X8# -> Word16X8# -> Word16X8#
    
    These are supported in:
      - the X86, AArch64 and PowerPC NCGs,
      - the LLVM backend,
      - the WebAssembly and JavaScript backends.
    
    Fixes #25120
    cec5908c
  • Modularise RegClass · 22176f8b
    sheaf authored
    This commit modularises the RegClass datatype, allowing it to be
    used with architectures that have different register architectures, e.g.
    RISC-V which has separate floating-point and vector registers.
    
    The two modules GHC.Platform.Reg.Class.Unified and
    GHC.Platform.Reg.Class.Separate implement the two register architectures
    we currently support (corresponding to the two constructors of the
    GHC.Platform.Reg.Class.RegArch datatype).
    22176f8b
  • Add test for C calls & SIMD vectors · 941add96
    sheaf authored
    941add96
  • Fix C calls with SIMD vectors · 606c72e4
    sheaf authored
    This commit fixes the code generation for C calls, to take into account
    the calling convention.
    
    This is particularly tricky on Windows, where all vectors are expected
    to be passed by reference. See Note [The Windows X64 C calling convention]
    in GHC.CmmToAsm.X86.CodeGen.
    606c72e4
  • GHC calling convention: clarifications · 7d1a3cc5
    sheaf authored
    This commit clarifies that the GHC calling convention, on X86_64, uses
    xmm1, ..., xmm6 for argument passing. It does not use xmm0, because
    that's the convention we asked the LLVM compiler authors to define for
    usage with GHC.
    
    This unfortunately means a discrepancy with the C calling convention
    (which does use xmm0, for the first argument and for the result).
    
    Fixes #25156
    7d1a3cc5
Showing 490 additions and 218 deletions
......@@ -145,7 +145,6 @@ defaults
cheap = { primOpOkForSpeculation _thisOp }
strictness = { \ arity -> mkClosedDmdSig (replicate arity topDmd) topDiv }
fixity = Nothing
llvm_only = False
vector = []
deprecated_msg = {} -- A non-empty message indicates deprecation
......@@ -1094,6 +1093,14 @@ primop DoubleLtOp "<##" Compare Double# -> Double# -> Int#
primop DoubleLeOp "<=##" Compare Double# -> Double# -> Int#
with fixity = infix 4
primop DoubleMinOp "minDouble#" GenPrimOp
Double# -> Double# -> Double#
with commutable = True
primop DoubleMaxOp "maxDouble#" GenPrimOp
Double# -> Double# -> Double#
with commutable = True
primop DoubleAddOp "+##" GenPrimOp
Double# -> Double# -> Double#
with commutable = True
......@@ -1260,6 +1267,14 @@ primop FloatNeOp "neFloat#" Compare
primop FloatLtOp "ltFloat#" Compare Float# -> Float# -> Int#
primop FloatLeOp "leFloat#" Compare Float# -> Float# -> Int#
primop FloatMinOp "minFloat#" GenPrimOp
Float# -> Float# -> Float#
with commutable = True
primop FloatMaxOp "maxFloat#" GenPrimOp
Float# -> Float# -> Float#
with commutable = True
primop FloatAddOp "plusFloat#" GenPrimOp
Float# -> Float# -> Float#
with commutable = True
......@@ -4032,86 +4047,73 @@ section "SIMD Vectors"
,<Word8,Word8#,64>,<Word16,Word16#,32>,<Word32,Word32#,16>,<Word64,Word64#,8>]
primtype VECTOR
with llvm_only = True
vector = ALL_VECTOR_TYPES
with vector = ALL_VECTOR_TYPES
primop VecBroadcastOp "broadcast#" GenPrimOp
SCALAR -> VECTOR
{ Broadcast a scalar to all elements of a vector. }
with llvm_only = True
vector = ALL_VECTOR_TYPES
with vector = ALL_VECTOR_TYPES
primop VecPackOp "pack#" GenPrimOp
VECTUPLE -> VECTOR
{ Pack the elements of an unboxed tuple into a vector. }
with llvm_only = True
vector = ALL_VECTOR_TYPES
with vector = ALL_VECTOR_TYPES
primop VecUnpackOp "unpack#" GenPrimOp
VECTOR -> VECTUPLE
{ Unpack the elements of a vector into an unboxed tuple. #}
with llvm_only = True
vector = ALL_VECTOR_TYPES
with vector = ALL_VECTOR_TYPES
primop VecInsertOp "insert#" GenPrimOp
VECTOR -> SCALAR -> Int# -> VECTOR
{ Insert a scalar at the given position in a vector. }
with effect = CanFail
llvm_only = True
vector = ALL_VECTOR_TYPES
primop VecAddOp "plus#" GenPrimOp
VECTOR -> VECTOR -> VECTOR
{ Add two vectors element-wise. }
with commutable = True
llvm_only = True
vector = ALL_VECTOR_TYPES
primop VecSubOp "minus#" GenPrimOp
VECTOR -> VECTOR -> VECTOR
{ Subtract two vectors element-wise. }
with llvm_only = True
vector = ALL_VECTOR_TYPES
with vector = ALL_VECTOR_TYPES
primop VecMulOp "times#" GenPrimOp
VECTOR -> VECTOR -> VECTOR
{ Multiply two vectors element-wise. }
with commutable = True
llvm_only = True
vector = ALL_VECTOR_TYPES
primop VecDivOp "divide#" GenPrimOp
VECTOR -> VECTOR -> VECTOR
{ Divide two vectors element-wise. }
with effect = CanFail
llvm_only = True
vector = FLOAT_VECTOR_TYPES
primop VecQuotOp "quot#" GenPrimOp
VECTOR -> VECTOR -> VECTOR
{ Rounds towards zero element-wise. }
with effect = CanFail
llvm_only = True
vector = INT_VECTOR_TYPES
primop VecRemOp "rem#" GenPrimOp
VECTOR -> VECTOR -> VECTOR
{ Satisfies @('quot#' x y) 'times#' y 'plus#' ('rem#' x y) == x@. }
with effect = CanFail
llvm_only = True
vector = INT_VECTOR_TYPES
primop VecNegOp "negate#" GenPrimOp
VECTOR -> VECTOR
{ Negate element-wise. }
with llvm_only = True
vector = SIGNED_VECTOR_TYPES
with vector = SIGNED_VECTOR_TYPES
primop VecIndexByteArrayOp "indexArray#" GenPrimOp
ByteArray# -> Int# -> VECTOR
{ Read a vector from specified index of immutable array. }
with effect = CanFail
llvm_only = True
vector = ALL_VECTOR_TYPES
primop VecReadByteArrayOp "readArray#" GenPrimOp
......@@ -4119,7 +4121,6 @@ primop VecReadByteArrayOp "readArray#" GenPrimOp
{ Read a vector from specified index of mutable array. }
with effect = ReadWriteEffect
can_fail_warning = YesWarnCanFail
llvm_only = True
vector = ALL_VECTOR_TYPES
primop VecWriteByteArrayOp "writeArray#" GenPrimOp
......@@ -4127,14 +4128,12 @@ primop VecWriteByteArrayOp "writeArray#" GenPrimOp
{ Write a vector to specified index of mutable array. }
with effect = ReadWriteEffect
can_fail_warning = YesWarnCanFail
llvm_only = True
vector = ALL_VECTOR_TYPES
primop VecIndexOffAddrOp "indexOffAddr#" GenPrimOp
Addr# -> Int# -> VECTOR
{ Reads vector; offset in bytes. }
with effect = CanFail
llvm_only = True
vector = ALL_VECTOR_TYPES
primop VecReadOffAddrOp "readOffAddr#" GenPrimOp
......@@ -4142,7 +4141,6 @@ primop VecReadOffAddrOp "readOffAddr#" GenPrimOp
{ Reads vector; offset in bytes. }
with effect = ReadWriteEffect
can_fail_warning = YesWarnCanFail
llvm_only = True
vector = ALL_VECTOR_TYPES
primop VecWriteOffAddrOp "writeOffAddr#" GenPrimOp
......@@ -4150,7 +4148,6 @@ primop VecWriteOffAddrOp "writeOffAddr#" GenPrimOp
{ Write vector; offset in bytes. }
with effect = ReadWriteEffect
can_fail_warning = YesWarnCanFail
llvm_only = True
vector = ALL_VECTOR_TYPES
......@@ -4158,7 +4155,6 @@ primop VecIndexScalarByteArrayOp "indexArrayAs#" GenPrimOp
ByteArray# -> Int# -> VECTOR
{ Read a vector from specified index of immutable array of scalars; offset is in scalar elements. }
with effect = CanFail
llvm_only = True
vector = ALL_VECTOR_TYPES
primop VecReadScalarByteArrayOp "readArrayAs#" GenPrimOp
......@@ -4166,7 +4162,6 @@ primop VecReadScalarByteArrayOp "readArrayAs#" GenPrimOp
{ Read a vector from specified index of mutable array of scalars; offset is in scalar elements. }
with effect = ReadWriteEffect
can_fail_warning = YesWarnCanFail
llvm_only = True
vector = ALL_VECTOR_TYPES
primop VecWriteScalarByteArrayOp "writeArrayAs#" GenPrimOp
......@@ -4174,14 +4169,12 @@ primop VecWriteScalarByteArrayOp "writeArrayAs#" GenPrimOp
{ Write a vector to specified index of mutable array of scalars; offset is in scalar elements. }
with effect = ReadWriteEffect
can_fail_warning = YesWarnCanFail
llvm_only = True
vector = ALL_VECTOR_TYPES
primop VecIndexScalarOffAddrOp "indexOffAddrAs#" GenPrimOp
Addr# -> Int# -> VECTOR
{ Reads vector; offset in scalar elements. }
with effect = CanFail
llvm_only = True
vector = ALL_VECTOR_TYPES
primop VecReadScalarOffAddrOp "readOffAddrAs#" GenPrimOp
......@@ -4189,7 +4182,6 @@ primop VecReadScalarOffAddrOp "readOffAddrAs#" GenPrimOp
{ Reads vector; offset in scalar elements. }
with effect = ReadWriteEffect
can_fail_warning = YesWarnCanFail
llvm_only = True
vector = ALL_VECTOR_TYPES
primop VecWriteScalarOffAddrOp "writeOffAddrAs#" GenPrimOp
......@@ -4197,9 +4189,47 @@ primop VecWriteScalarOffAddrOp "writeOffAddrAs#" GenPrimOp
{ Write vector; offset in scalar elements. }
with effect = ReadWriteEffect
can_fail_warning = YesWarnCanFail
llvm_only = True
vector = ALL_VECTOR_TYPES
primop VecFMAdd "fmadd#" GenPrimOp
VECTOR -> VECTOR -> VECTOR -> VECTOR
{Fused multiply-add operation @x*y+z@. See "GHC.Prim#fma".}
with
vector = FLOAT_VECTOR_TYPES
primop VecFMSub "fmsub#" GenPrimOp
VECTOR -> VECTOR -> VECTOR -> VECTOR
{Fused multiply-subtract operation @x*y-z@. See "GHC.Prim#fma".}
with
vector = FLOAT_VECTOR_TYPES
primop VecFNMAdd "fnmadd#" GenPrimOp
VECTOR -> VECTOR -> VECTOR -> VECTOR
{Fused negate-multiply-add operation @-x*y+z@. See "GHC.Prim#fma".}
with
vector = FLOAT_VECTOR_TYPES
primop VecFNMSub "fnmsub#" GenPrimOp
VECTOR -> VECTOR -> VECTOR -> VECTOR
{Fused negate-multiply-subtract operation @-x*y-z@. See "GHC.Prim#fma".}
with
vector = FLOAT_VECTOR_TYPES
primop VecShuffleOp "shuffle#" GenPrimOp
VECTOR -> VECTOR -> INTVECTUPLE -> VECTOR
{Shuffle elements of the concatenation of the input two vectors
into the result vector.}
with vector = ALL_VECTOR_TYPES
primop VecMinOp "min#" GenPrimOp
VECTOR -> VECTOR -> VECTOR
{Component-wise minimum of two vectors.}
with
vector = ALL_VECTOR_TYPES
primop VecMaxOp "max#" GenPrimOp
VECTOR -> VECTOR -> VECTOR
{Component-wise maximum of two vectors.}
with
vector = ALL_VECTOR_TYPES
------------------------------------------------------------------------
section "Prefetch"
......
......@@ -664,7 +664,8 @@ mkNativeCallInfoSig platform NativeCallInfo{..}
| otherwise
= assertPpr (length regs <= 24) (text "too many registers for bitmap:" <+> ppr (length regs)) {- 24 bits for register bitmap -}
assertPpr (cont_offset < 255) (text "continuation offset too large:" <+> ppr cont_offset) {- 8 bits for continuation offset (only for NativeTupleReturn) -}
assertPpr (all (`elem` regs) (regSetToList nativeCallRegs)) (text "not all registers accounted for") {- all regs accounted for -}
assertPpr (all (`elem` (map globalRegUseGlobalReg regs)) (regSetToList nativeCallRegs)) (text "not all registers accounted for") {- all regs accounted for -}
-- SLD: the above assertion seems wrong, because it doesn't account for register overlap
foldl' reg_bit 0 (zip regs [0..]) .|. (cont_offset `shiftL` 24)
where
cont_offset :: Word32
......@@ -672,8 +673,8 @@ mkNativeCallInfoSig platform NativeCallInfo{..}
| nativeCallType == NativeTupleReturn = fromIntegral nativeCallStackSpillSize
| otherwise = 0 -- there is no continuation for primcalls
reg_bit :: Word32 -> (GlobalReg, Int) -> Word32
reg_bit x (r, n)
reg_bit :: Word32 -> (GlobalRegUse, Int) -> Word32
reg_bit x (GlobalRegUse r _, n)
| r `elemRegSet` nativeCallRegs = x .|. 1 `shiftL` n
| otherwise = x
regs = allArgRegsCover platform
......
......@@ -100,7 +100,7 @@ data GenCmmDecl d h g
= CmmProc -- A procedure
h -- Extra header such as the info table
CLabel -- Entry label
[GlobalReg] -- Registers live on entry. Note that the set of live
[GlobalRegUse] -- Registers live on entry. Note that the set of live
-- registers will be correct in generated C-- code, but
-- not in hand-written C-- code. However,
-- splitAtProcPoints calculates correct liveness
......
......@@ -7,7 +7,6 @@ module GHC.Cmm.CallConv (
) where
import GHC.Prelude
import Data.List (nub)
import GHC.Cmm.Expr
import GHC.Runtime.Heap.Layout
......@@ -17,6 +16,8 @@ import GHC.Platform
import GHC.Platform.Profile
import GHC.Utils.Outputable
import GHC.Utils.Panic
import GHC.Data.List.SetOps (nubOrdBy)
import Data.Ord (comparing)
-- Calculate the 'GlobalReg' or stack locations for function call
-- parameters as used by the Cmm calling convention.
......@@ -67,14 +68,16 @@ assignArgumentsPos profile off conv arg_ty reps = (stk_off, assignments)
assign_regs assts (r:rs) regs | isVecType ty = vec
| isFloatType ty = float
| otherwise = int
where vec = case (w, regs) of
(W128, AvailRegs vs fs ds ls (s:ss))
| passVectorInReg W128 profile -> k (RegisterParam (XmmReg s), AvailRegs vs fs ds ls ss)
(W256, AvailRegs vs fs ds ls (s:ss))
| passVectorInReg W256 profile -> k (RegisterParam (YmmReg s), AvailRegs vs fs ds ls ss)
(W512, AvailRegs vs fs ds ls (s:ss))
| passVectorInReg W512 profile -> k (RegisterParam (ZmmReg s), AvailRegs vs fs ds ls ss)
_ -> (assts, (r:rs))
where vec = case regs of
AvailRegs vs fs ds ls (s:ss)
| passVectorInReg w profile
-> let reg_class = case w of
W128 -> XmmReg
W256 -> YmmReg
W512 -> ZmmReg
_ -> panic "CmmCallConv.assignArgumentsPos: Invalid vector width"
in k (RegisterParam (reg_class s), AvailRegs vs fs ds ls ss)
_ -> (assts, r:rs)
float = case (w, regs) of
(W32, AvailRegs vs fs ds ls (s:ss))
| passFloatInXmm -> k (RegisterParam (FloatReg s), AvailRegs vs fs ds ls ss)
......@@ -213,28 +216,26 @@ allRegs platform =
nodeOnly :: AvailRegs
nodeOnly = noAvailRegs { availVanillaRegs = [VanillaReg 1] }
-- This returns the set of global registers that *cover* the machine registers
-- used for argument passing. On platforms where registers can overlap---right
-- now just x86-64, where Float and Double registers overlap---passing this set
-- of registers is guaranteed to preserve the contents of all live registers. We
-- only use this functionality in hand-written C-- code in the RTS.
realArgRegsCover :: Platform -> [GlobalReg]
-- | This returns the set of global registers that *cover* the machine registers
-- used for argument passing. On platforms where registers can overlap, passing
-- this set of registers is guaranteed to preserve the contents of all live
-- registers. We only use this functionality in hand-written C-- code in the RTS.
realArgRegsCover :: Platform -> [GlobalRegUse]
realArgRegsCover platform
| passFloatArgsInXmm platform
= realVanillaRegs platform ++
realLongRegs platform ++
realDoubleRegs platform
-- we only need to save the low Double part of XMM registers.
-- Moreover, the NCG can't load/store full XMM
-- registers for now...
= [ GlobalRegUse r (globalRegSpillType platform r) | r <- realVanillaRegs platform ]
++ [ GlobalRegUse r (globalRegSpillType platform r) | r <- realLongRegs platform ]
++ [ GlobalRegUse r (globalRegSpillType platform r) | r <- realDoubleRegs platform ]
-- The above seems wrong, as it means we only save the low 64 bits
-- of XMM/YMM/ZMM registers on X86_64, which is probably wrong.
--
-- Challenge: change the realDoubleRegs line to use ZmmReg instead,
-- and fix the resulting compiler errors.
| otherwise
= realVanillaRegs platform ++
realFloatRegs platform ++
realDoubleRegs platform ++
realLongRegs platform
-- we don't save XMM registers if they are not used for parameter passing
= [ GlobalRegUse r (globalRegSpillType platform r)
| r <- realVanillaRegs platform ++ realFloatRegs platform ++ realDoubleRegs platform ++ realLongRegs platform
] -- we don't save XMM registers if they are not used for parameter passing
{-
......@@ -335,9 +336,11 @@ realArgRegsCover platform
make sure to also update GHC.StgToByteCode.layoutNativeCall
-}
-- Like realArgRegsCover but always includes the node. This covers all real
-- | Like 'realArgRegsCover' but always includes the node. This covers all real
-- and virtual registers actually used for passing arguments.
allArgRegsCover :: Platform -> [GlobalReg]
allArgRegsCover :: Platform -> [GlobalRegUse]
allArgRegsCover platform =
nub (VanillaReg 1 : realArgRegsCover platform)
nubOrdBy (comparing globalRegUseGlobalReg)
(GlobalRegUse node (globalRegSpillType platform node) : realArgRegsCover platform)
where
node = VanillaReg 1
......@@ -208,7 +208,7 @@ mkJump profile conv e actuals updfr_off =
-- | A jump where the caller says what the live GlobalRegs are. Used
-- for low-level hand-written Cmm.
mkRawJump :: Profile -> CmmExpr -> UpdFrameOffset -> [GlobalReg]
mkRawJump :: Profile -> CmmExpr -> UpdFrameOffset -> [GlobalRegUse]
-> CmmAGraph
mkRawJump profile e updfr_off vols =
lastWithArgs profile Jump Old NativeNodeCall [] updfr_off $
......@@ -297,7 +297,7 @@ stackStubExpr w = CmmLit (CmmInt 0 w)
copyInOflow :: Profile -> Convention -> Area
-> [CmmFormal]
-> [CmmFormal]
-> (Int, [GlobalReg], CmmAGraph)
-> (Int, [GlobalRegUse], CmmAGraph)
copyInOflow profile conv area formals extra_stk
= (offset, gregs, catAGraphs $ map mkMiddle nodes)
......@@ -308,9 +308,9 @@ copyInOflow profile conv area formals extra_stk
copyIn :: Profile -> Convention -> Area
-> [CmmFormal]
-> [CmmFormal]
-> (ByteOff, [GlobalReg], [CmmNode O O])
-> (ByteOff, [GlobalRegUse], [CmmNode O O])
copyIn profile conv area formals extra_stk
= (stk_size, [r | (_, RegisterParam r) <- args], map ci (stk_args ++ args))
= (stk_size, [GlobalRegUse r (localRegType lr)| (lr, RegisterParam r) <- args], map ci (stk_args ++ args))
where
platform = profilePlatform profile
......@@ -365,7 +365,7 @@ data Transfer = Call | JumpRet | Jump | Ret deriving Eq
copyOutOflow :: Profile -> Convention -> Transfer -> Area -> [CmmExpr]
-> UpdFrameOffset
-> [CmmExpr] -- extra stack args
-> (Int, [GlobalReg], CmmAGraph)
-> (Int, [GlobalRegUse], CmmAGraph)
-- Generate code to move the actual parameters into the locations
-- required by the calling convention. This includes a store for the
......@@ -383,8 +383,8 @@ copyOutOflow profile conv transfer area actuals updfr_off extra_stack_stuff
(regs, graph) = foldr co ([], mkNop) (setRA ++ args ++ stack_params)
co :: (CmmExpr, ParamLocation)
-> ([GlobalReg], CmmAGraph)
-> ([GlobalReg], CmmAGraph)
-> ([GlobalRegUse], CmmAGraph)
-> ([GlobalRegUse], CmmAGraph)
co (v, RegisterParam r@(VanillaReg {})) (rs, ms) =
let width = cmmExprWidth platform v
value
......@@ -393,12 +393,14 @@ copyOutOflow profile conv transfer area actuals updfr_off extra_stack_stuff
| width < wordWidth platform =
CmmMachOp (MO_XX_Conv width (wordWidth platform)) [v]
| otherwise = panic "Parameter width greater than word width"
ru = GlobalRegUse r (cmmExprType platform value)
in (r:rs, mkAssign (CmmGlobal $ GlobalRegUse r (cmmExprType platform value)) value <*> ms)
in (ru:rs, mkAssign (CmmGlobal ru) value <*> ms)
-- Non VanillaRegs
co (v, RegisterParam r) (rs, ms) =
(r:rs, mkAssign (CmmGlobal $ GlobalRegUse r (cmmExprType platform v)) v <*> ms)
let ru = GlobalRegUse r (cmmExprType platform v)
in (ru:rs, mkAssign (CmmGlobal ru) v <*> ms)
co (v, StackParam off) (rs, ms)
= (rs, mkStore (CmmStackSlot area off) (value v) <*> ms)
......@@ -461,13 +463,13 @@ copyOutOflow profile conv transfer area actuals updfr_off extra_stack_stuff
mkCallEntry :: Profile -> Convention -> [CmmFormal] -> [CmmFormal]
-> (Int, [GlobalReg], CmmAGraph)
-> (Int, [GlobalRegUse], CmmAGraph)
mkCallEntry profile conv formals extra_stk
= copyInOflow profile conv Old formals extra_stk
lastWithArgs :: Profile -> Transfer -> Area -> Convention -> [CmmExpr]
-> UpdFrameOffset
-> (ByteOff -> [GlobalReg] -> CmmAGraph)
-> (ByteOff -> [GlobalRegUse] -> CmmAGraph)
-> CmmAGraph
lastWithArgs profile transfer area conv actuals updfr_off last =
lastWithArgsAndExtraStack profile transfer area conv actuals
......@@ -476,7 +478,7 @@ lastWithArgs profile transfer area conv actuals updfr_off last =
lastWithArgsAndExtraStack :: Profile
-> Transfer -> Area -> Convention -> [CmmExpr]
-> UpdFrameOffset -> [CmmExpr]
-> (ByteOff -> [GlobalReg] -> CmmAGraph)
-> (ByteOff -> [GlobalRegUse] -> CmmAGraph)
-> CmmAGraph
lastWithArgsAndExtraStack profile transfer area conv actuals updfr_off
extra_stack last =
......@@ -490,7 +492,7 @@ noExtraStack :: [CmmExpr]
noExtraStack = []
toCall :: CmmExpr -> Maybe BlockId -> UpdFrameOffset -> ByteOff
-> ByteOff -> [GlobalReg]
-> ByteOff -> [GlobalRegUse]
-> CmmAGraph
toCall e cont updfr_off res_space arg_space regs =
mkLast $ CmmCall e cont regs arg_space res_space updfr_off
......@@ -104,11 +104,14 @@ $white_no_nl+ ;
"False" { kw CmmT_False }
"likely" { kw CmmT_likely}
P@decimal { global_regN VanillaReg gcWord }
R@decimal { global_regN VanillaReg bWord }
F@decimal { global_regN FloatReg (const $ cmmFloat W32) }
D@decimal { global_regN DoubleReg (const $ cmmFloat W64) }
L@decimal { global_regN LongReg (const $ cmmBits W64) }
P@decimal { global_regN 1 VanillaReg gcWord }
R@decimal { global_regN 1 VanillaReg bWord }
F@decimal { global_regN 1 FloatReg (const $ cmmFloat W32) }
D@decimal { global_regN 1 DoubleReg (const $ cmmFloat W64) }
L@decimal { global_regN 1 LongReg (const $ cmmBits W64) }
XMM@decimal { global_regN 3 XmmReg (const $ cmmVec 2 (cmmFloat W64)) }
YMM@decimal { global_regN 3 YmmReg (const $ cmmVec 4 (cmmFloat W64)) }
ZMM@decimal { global_regN 3 ZmmReg (const $ cmmVec 8 (cmmFloat W64)) }
Sp { global_reg Sp bWord }
SpLim { global_reg SpLim bWord }
Hp { global_reg Hp gcWord }
......@@ -173,9 +176,9 @@ data CmmToken
| CmmT_bits16
| CmmT_bits32
| CmmT_bits64
| CmmT_bits128
| CmmT_bits256
| CmmT_bits512
| CmmT_vec128
| CmmT_vec256
| CmmT_vec512
| CmmT_float32
| CmmT_float64
| CmmT_gcptr
......@@ -211,14 +214,16 @@ special_char span buf _len = return (L span (CmmT_SpecChar (currentChar buf)))
kw :: CmmToken -> Action
kw tok span _buf _len = return (L span tok)
global_regN :: (Int -> GlobalReg) -> (Platform -> CmmType) -> Action
global_regN con ty_fn span buf len
global_regN :: Int -> (Int -> GlobalReg) -> (Platform -> CmmType) -> Action
global_regN ident_nb_chars con ty_fn span buf len
= do { platform <- getPlatform
; let reg = con (fromIntegral n)
ty = ty_fn platform
; return (L span (CmmT_GlobalReg (GlobalRegUse reg ty))) }
where buf' = stepOn buf
n = parseUnsignedInteger buf' (len-1) 10 octDecDigit
where buf' = go ident_nb_chars buf
where go 0 b = b
go i b = go (i-1) (stepOn b)
n = parseUnsignedInteger buf' (len-ident_nb_chars) 10 octDecDigit
global_reg :: GlobalReg -> (Platform -> CmmType) -> Action
global_reg reg ty_fn span _buf _len
......@@ -269,9 +274,9 @@ reservedWordsFM = listToUFM $
( "bits16", CmmT_bits16 ),
( "bits32", CmmT_bits32 ),
( "bits64", CmmT_bits64 ),
( "bits128", CmmT_bits128 ),
( "bits256", CmmT_bits256 ),
( "bits512", CmmT_bits512 ),
( "vec128", CmmT_vec128 ),
( "vec256", CmmT_vec256 ),
( "vec512", CmmT_vec512 ),
( "float32", CmmT_float32 ),
( "float64", CmmT_float64 ),
-- New forms
......@@ -279,9 +284,6 @@ reservedWordsFM = listToUFM $
( "b16", CmmT_bits16 ),
( "b32", CmmT_bits32 ),
( "b64", CmmT_bits64 ),
( "b128", CmmT_bits128 ),
( "b256", CmmT_bits256 ),
( "b512", CmmT_bits512 ),
( "f32", CmmT_float32 ),
( "f64", CmmT_float64 ),
( "gcptr", CmmT_gcptr ),
......
......@@ -171,7 +171,7 @@ lintCmmMiddle node = case node of
CmmAssign reg expr -> do
erep <- lintCmmExpr expr
let reg_ty = cmmRegType reg
unless (erep `cmmEqType_ignoring_ptrhood` reg_ty) $
unless (erep `cmmCompatType` reg_ty) $
cmmLintAssignErr (CmmAssign reg expr) erep reg_ty
CmmStore l r _alignment -> do
......
......@@ -59,7 +59,7 @@ cmmLocalLiveness platform graph =
check facts =
noLiveOnEntry entry (expectJust "check" $ mapLookup entry facts) facts
cmmGlobalLiveness :: Platform -> CmmGraph -> BlockEntryLiveness GlobalReg
cmmGlobalLiveness :: Platform -> CmmGraph -> BlockEntryLiveness GlobalRegUse
cmmGlobalLiveness platform graph =
analyzeCmmBwd liveLattice (xferLive platform) graph mapEmpty
......@@ -92,7 +92,7 @@ xferLive platform (BlockCC eNode middle xNode) fBase =
!result = foldNodesBwdOO (gen_kill platform) middle joined
in mapSingleton (entryLabel eNode) result
{-# SPECIALIZE xferLive :: Platform -> TransferFun (CmmLive LocalReg) #-}
{-# SPECIALIZE xferLive :: Platform -> TransferFun (CmmLive GlobalReg) #-}
{-# SPECIALIZE xferLive :: Platform -> TransferFun (CmmLive GlobalRegUse) #-}
-----------------------------------------------------------------------------
-- | Specialization that only retains the keys for local variables.
......
......@@ -116,7 +116,7 @@ data MachOp
-- Floating-point fused multiply-add operations
-- | Fused multiply-add, see 'FMASign'.
| MO_FMA FMASign Width
| MO_FMA FMASign Length Width
-- Floating point comparison
| MO_F_Eq Width
......@@ -126,6 +126,9 @@ data MachOp
| MO_F_Gt Width
| MO_F_Lt Width
| MO_F_Min Width
| MO_F_Max Width
-- Bitwise operations. Not all of these may be supported
-- at all sizes, and only integral Widths are valid.
| MO_And Width
......@@ -158,8 +161,9 @@ data MachOp
| MO_FW_Bitcast Width -- Float/Double -> Word32/Word64
-- Vector element insertion and extraction operations
| MO_V_Insert Length Width -- Insert scalar into vector
| MO_V_Extract Length Width -- Extract scalar from vector
| MO_V_Broadcast Length Width -- Broadcast a scalar into a vector
| MO_V_Insert Length Width -- Insert scalar into vector
| MO_V_Extract Length Width -- Extract scalar from vector
-- Integer vector operations
| MO_V_Add Length Width
......@@ -175,9 +179,14 @@ data MachOp
| MO_VU_Quot Length Width
| MO_VU_Rem Length Width
-- Vector shuffles
| MO_V_Shuffle Length Width [Int]
| MO_VF_Shuffle Length Width [Int]
-- Floating point vector element insertion and extraction operations
| MO_VF_Insert Length Width -- Insert scalar into vector
| MO_VF_Extract Length Width -- Extract scalar from vector
| MO_VF_Broadcast Length Width -- Broadcast a scalar into a vector
| MO_VF_Insert Length Width -- Insert scalar into vector
| MO_VF_Extract Length Width -- Extract scalar from vector
-- Floating point vector operations
| MO_VF_Add Length Width
......@@ -186,6 +195,14 @@ data MachOp
| MO_VF_Mul Length Width
| MO_VF_Quot Length Width
-- Min/max operations
| MO_VS_Min Length Width
| MO_VS_Max Length Width
| MO_VU_Min Length Width
| MO_VU_Max Length Width
| MO_VF_Min Length Width
| MO_VF_Max Length Width
-- | An atomic read with no memory ordering. Address msut
-- be naturally aligned.
| MO_RelaxedRead Width
......@@ -316,6 +333,8 @@ isCommutableMachOp mop =
MO_Xor _ -> True
MO_F_Add _ -> True
MO_F_Mul _ -> True
MO_F_Min {} -> True
MO_F_Max {} -> True
_other -> False
-- ----------------------------------------------------------------------------
......@@ -458,8 +477,10 @@ machOpResultType platform mop tys =
MO_F_Mul r -> cmmFloat r
MO_F_Quot r -> cmmFloat r
MO_F_Neg r -> cmmFloat r
MO_F_Min r -> cmmFloat r
MO_F_Max r -> cmmFloat r
MO_FMA _ r -> cmmFloat r
MO_FMA _ l r -> if l == 1 then cmmFloat r else cmmVec l (cmmFloat r)
MO_F_Eq {} -> comparisonResultRep platform
MO_F_Ne {} -> comparisonResultRep platform
......@@ -485,6 +506,7 @@ machOpResultType platform mop tys =
MO_WF_Bitcast w -> cmmFloat w
MO_FW_Bitcast w -> cmmBits w
MO_V_Broadcast l w -> cmmVec l (cmmBits w)
MO_V_Insert l w -> cmmVec l (cmmBits w)
MO_V_Extract _ w -> cmmBits w
......@@ -495,10 +517,18 @@ machOpResultType platform mop tys =
MO_VS_Quot l w -> cmmVec l (cmmBits w)
MO_VS_Rem l w -> cmmVec l (cmmBits w)
MO_VS_Neg l w -> cmmVec l (cmmBits w)
MO_VS_Min l w -> cmmVec l (cmmBits w)
MO_VS_Max l w -> cmmVec l (cmmBits w)
MO_VU_Quot l w -> cmmVec l (cmmBits w)
MO_VU_Rem l w -> cmmVec l (cmmBits w)
MO_VU_Min l w -> cmmVec l (cmmBits w)
MO_VU_Max l w -> cmmVec l (cmmBits w)
MO_V_Shuffle l w _ -> cmmVec l (cmmBits w)
MO_VF_Shuffle l w _ -> cmmVec l (cmmFloat w)
MO_VF_Broadcast l w -> cmmVec l (cmmFloat w)
MO_VF_Insert l w -> cmmVec l (cmmFloat w)
MO_VF_Extract _ w -> cmmFloat w
......@@ -507,6 +537,8 @@ machOpResultType platform mop tys =
MO_VF_Mul l w -> cmmVec l (cmmFloat w)
MO_VF_Quot l w -> cmmVec l (cmmFloat w)
MO_VF_Neg l w -> cmmVec l (cmmFloat w)
MO_VF_Min l w -> cmmVec l (cmmFloat w)
MO_VF_Max l w -> cmmVec l (cmmFloat w)
MO_RelaxedRead r -> cmmBits r
MO_AlignmentCheck _ _ -> ty1
......@@ -555,8 +587,10 @@ machOpArgReps platform op =
MO_F_Mul r -> [r,r]
MO_F_Quot r -> [r,r]
MO_F_Neg r -> [r]
MO_F_Min r -> [r,r]
MO_F_Max r -> [r,r]
MO_FMA _ r -> [r,r,r]
MO_FMA _ l r -> [vecwidth l r, vecwidth l r, vecwidth l r]
MO_F_Eq r -> [r,r]
MO_F_Ne r -> [r,r]
......@@ -582,31 +616,45 @@ machOpArgReps platform op =
MO_WF_Bitcast w -> [w]
MO_FW_Bitcast w -> [w]
MO_V_Insert l r -> [typeWidth (vec l (cmmBits r)),r, W32]
MO_V_Extract l r -> [typeWidth (vec l (cmmBits r)), W32]
MO_VF_Insert l r -> [typeWidth (vec l (cmmFloat r)),r,W32]
MO_VF_Extract l r -> [typeWidth (vec l (cmmFloat r)),W32]
-- SIMD vector indices are always 32 bit
MO_V_Shuffle l r _ -> [vecwidth l r, vecwidth l r]
MO_VF_Shuffle l r _ -> [vecwidth l r, vecwidth l r]
MO_V_Add _ r -> [r,r]
MO_V_Sub _ r -> [r,r]
MO_V_Mul _ r -> [r,r]
MO_VS_Quot _ r -> [r,r]
MO_VS_Rem _ r -> [r,r]
MO_VS_Neg _ r -> [r]
MO_VU_Quot _ r -> [r,r]
MO_VU_Rem _ r -> [r,r]
MO_V_Broadcast _ r -> [r]
MO_V_Insert l r -> [vecwidth l r, r, W32]
MO_V_Extract l r -> [vecwidth l r, W32]
MO_VF_Broadcast _ r -> [r]
MO_VF_Insert l r -> [vecwidth l r, r, W32]
MO_VF_Extract l r -> [vecwidth l r, W32]
-- SIMD vector indices are always 32 bit
MO_VF_Add _ r -> [r,r]
MO_VF_Sub _ r -> [r,r]
MO_VF_Mul _ r -> [r,r]
MO_VF_Quot _ r -> [r,r]
MO_VF_Neg _ r -> [r]
MO_V_Add l w -> [vecwidth l w, vecwidth l w]
MO_V_Sub l w -> [vecwidth l w, vecwidth l w]
MO_V_Mul l w -> [vecwidth l w, vecwidth l w]
MO_VS_Quot l w -> [vecwidth l w, vecwidth l w]
MO_VS_Rem l w -> [vecwidth l w, vecwidth l w]
MO_VS_Neg l w -> [vecwidth l w]
MO_VS_Min l w -> [vecwidth l w, vecwidth l w]
MO_VS_Max l w -> [vecwidth l w, vecwidth l w]
MO_VU_Quot l w -> [vecwidth l w, vecwidth l w]
MO_VU_Rem l w -> [vecwidth l w, vecwidth l w]
MO_VU_Min l w -> [vecwidth l w, vecwidth l w]
MO_VU_Max l w -> [vecwidth l w, vecwidth l w]
-- NOTE: The below is owing to the fact that floats use the SSE registers
MO_VF_Add l w -> [vecwidth l w, vecwidth l w]
MO_VF_Sub l w -> [vecwidth l w, vecwidth l w]
MO_VF_Mul l w -> [vecwidth l w, vecwidth l w]
MO_VF_Quot l w -> [vecwidth l w, vecwidth l w]
MO_VF_Neg l w -> [vecwidth l w]
MO_VF_Min l w -> [vecwidth l w, vecwidth l w]
MO_VF_Max l w -> [vecwidth l w, vecwidth l w]
MO_RelaxedRead _ -> [wordWidth platform]
MO_AlignmentCheck _ r -> [r]
where
vecwidth l w = widthFromBytes (l * widthInBytes w)
-----------------------------------------------------------------------------
-- CallishMachOp
......
......@@ -118,7 +118,7 @@ data CmmNode e x where
-- occur in CmmExprs, namely as (CmmLit (CmmBlock b)) or
-- (CmmStackSlot (Young b) _).
cml_args_regs :: [GlobalReg],
cml_args_regs :: [GlobalRegUse],
-- The argument GlobalRegs (Rx, Fx, Dx, Lx) that are passed
-- to the call. This is essential information for the
-- native code generator's register allocator; without
......@@ -544,7 +544,7 @@ instance UserOfRegs LocalReg (CmmNode e x) where
=> (b -> LocalReg -> b) -> b -> a -> b
fold f z n = foldRegsUsed platform f z n
instance UserOfRegs GlobalReg (CmmNode e x) where
instance UserOfRegs GlobalRegUse (CmmNode e x) where
{-# INLINEABLE foldRegsUsed #-}
foldRegsUsed platform f !z n = case n of
CmmAssign _ expr -> fold f z expr
......@@ -555,8 +555,8 @@ instance UserOfRegs GlobalReg (CmmNode e x) where
CmmCall {cml_target=tgt, cml_args_regs=args} -> fold f (fold f z args) tgt
CmmForeignCall {tgt=tgt, args=args} -> fold f (fold f z tgt) args
_ -> z
where fold :: forall a b. UserOfRegs GlobalReg a
=> (b -> GlobalReg -> b) -> b -> a -> b
where fold :: forall a b. UserOfRegs GlobalRegUse a
=> (b -> GlobalRegUse -> b) -> b -> a -> b
fold f z n = foldRegsUsed platform f z n
instance (Ord r, UserOfRegs r CmmReg) => UserOfRegs r ForeignTarget where
-- The (Ord r) in the context is necessary here
......@@ -576,7 +576,7 @@ instance DefinerOfRegs LocalReg (CmmNode e x) where
=> (b -> LocalReg -> b) -> b -> a -> b
fold f z n = foldRegsDefd platform f z n
instance DefinerOfRegs GlobalReg (CmmNode e x) where
instance DefinerOfRegs GlobalRegUse (CmmNode e x) where
{-# INLINEABLE foldRegsDefd #-}
foldRegsDefd platform f !z n = case n of
CmmAssign lhs _ -> fold f z lhs
......@@ -585,12 +585,13 @@ instance DefinerOfRegs GlobalReg (CmmNode e x) where
CmmForeignCall {} -> fold f z activeRegs
-- See Note [Safe foreign calls clobber STG registers]
_ -> z
where fold :: forall a b. DefinerOfRegs GlobalReg a
=> (b -> GlobalReg -> b) -> b -> a -> b
where fold :: forall a b. DefinerOfRegs GlobalRegUse a
=> (b -> GlobalRegUse -> b) -> b -> a -> b
fold f z n = foldRegsDefd platform f z n
activeRegs = activeStgRegs platform
activeCallerSavesRegs = filter (callerSaves platform) activeRegs
activeRegs :: [GlobalRegUse]
activeRegs = map (\ r -> GlobalRegUse r (globalRegSpillType platform r)) $ activeStgRegs platform
activeCallerSavesRegs = filter (callerSaves platform . globalRegUseGlobalReg) activeRegs
foreignTargetRegs (ForeignTarget _ (ForeignConvention _ _ _ CmmNeverReturns)) = []
foreignTargetRegs _ = activeCallerSavesRegs
......
......@@ -79,7 +79,11 @@ cmmMachOpFoldM
-> MachOp
-> [CmmExpr]
-> Maybe CmmExpr
cmmMachOpFoldM _ (MO_V_Broadcast {}) _ = Nothing
cmmMachOpFoldM _ (MO_VF_Broadcast {}) _ = Nothing
-- SIMD NCG TODO: supporting constant folding for vector operations
-- would require augmenting getRegister' to handle them.
-- See the code for "getRegister' platform _ (CmmLit lit)".
cmmMachOpFoldM _ op [CmmLit (CmmInt x rep)]
= Just $! case op of
MO_S_Neg _ -> CmmLit (CmmInt (-x) rep)
......@@ -93,7 +97,6 @@ cmmMachOpFoldM _ op [CmmLit (CmmInt x rep)]
MO_SS_Conv from to -> CmmLit (CmmInt (narrowS from x) to)
MO_UU_Conv from to -> CmmLit (CmmInt (narrowU from x) to)
MO_XX_Conv from to -> CmmLit (CmmInt (narrowS from x) to)
_ -> panic $ "cmmMachOpFoldM: unknown unary op: " ++ show op
-- Eliminate shifts that are wider than the shiftee
......@@ -237,23 +240,33 @@ cmmMachOpFoldM _ MO_Add{} [ CmmMachOp op@MO_Add{} [pic, CmmLit lit]
= Just $! CmmMachOp op [pic, CmmLit $ cmmOffsetLit lit off ]
where off = fromIntegral (narrowS rep n)
-- Make a RegOff if we can
-- Make a RegOff if we can. We don't perform this optimization if rep is greater
-- than the host word size because we use an Int to store the offset. See
-- #24893 and #24700. This should be fixed to ensure that optimizations don't
-- depend on the compiler host platform.
cmmMachOpFoldM _ (MO_Add _) [CmmReg reg, CmmLit (CmmInt n rep)]
| validOffsetRep rep
= Just $! cmmRegOff reg (fromIntegral (narrowS rep n))
cmmMachOpFoldM _ (MO_Add _) [CmmRegOff reg off, CmmLit (CmmInt n rep)]
| validOffsetRep rep
= Just $! cmmRegOff reg (off + fromIntegral (narrowS rep n))
cmmMachOpFoldM _ (MO_Sub _) [CmmReg reg, CmmLit (CmmInt n rep)]
| validOffsetRep rep
= Just $! cmmRegOff reg (- fromIntegral (narrowS rep n))
cmmMachOpFoldM _ (MO_Sub _) [CmmRegOff reg off, CmmLit (CmmInt n rep)]
| validOffsetRep rep
= Just $! cmmRegOff reg (off - fromIntegral (narrowS rep n))
-- Fold label(+/-)offset into a CmmLit where possible
cmmMachOpFoldM _ (MO_Add _) [CmmLit lit, CmmLit (CmmInt i rep)]
| validOffsetRep rep
= Just $! CmmLit (cmmOffsetLit lit (fromIntegral (narrowU rep i)))
cmmMachOpFoldM _ (MO_Add _) [CmmLit (CmmInt i rep), CmmLit lit]
| validOffsetRep rep
= Just $! CmmLit (cmmOffsetLit lit (fromIntegral (narrowU rep i)))
cmmMachOpFoldM _ (MO_Sub _) [CmmLit lit, CmmLit (CmmInt i rep)]
| validOffsetRep rep
= Just $! CmmLit (cmmOffsetLit lit (fromIntegral (negate (narrowU rep i))))
@@ -409,6 +422,13 @@ cmmMachOpFoldM platform mop [x, (CmmLit (CmmInt n _))]
cmmMachOpFoldM _ _ _ = Nothing
-- | Check that a literal width is compatible with the host word size used to
-- store offsets. This should be fixed properly (using larger types to store
-- literal offsets). See #24893
validOffsetRep :: Width -> Bool
validOffsetRep rep = widthInBits rep <= finiteBitSize (undefined :: Int)
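-- Illustrative examples (not part of the patch), assuming a 32-bit compiler
-- host where finiteBitSize (undefined :: Int) == 32:
--
--   validOffsetRep W32  ==  True    -- a 32-bit offset fits in an Int
--   validOffsetRep W64  ==  False   -- the fold is skipped, see #24893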
{- Note [Comparison operators]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If we have
......
@@ -381,9 +381,9 @@ import qualified Data.ByteString.Char8 as BS8
'bits16' { L _ (CmmT_bits16) }
'bits32' { L _ (CmmT_bits32) }
'bits64' { L _ (CmmT_bits64) }
'bits128' { L _ (CmmT_bits128) }
'bits256' { L _ (CmmT_bits256) }
'bits512' { L _ (CmmT_bits512) }
'vec128' { L _ (CmmT_vec128) }
'vec256' { L _ (CmmT_vec256) }
'vec512' { L _ (CmmT_vec512) }
'float32' { L _ (CmmT_float32) }
'float64' { L _ (CmmT_float64) }
'gcptr' { L _ (CmmT_gcptr) }
@@ -770,13 +770,13 @@ safety :: { Safety }
: {- empty -} { PlayRisky }
| STRING {% parseSafety $1 }
vols :: { [GlobalReg] }
vols :: { [GlobalRegUse] }
: '[' ']' { [] }
| '[' '*' ']' {% do platform <- PD.getPlatform
; return (realArgRegsCover platform) }
-- All of them. See comment attached
-- to realArgRegsCover
| '[' globals ']' { map globalRegUseGlobalReg $2 }
| '[' '*' ']' {% do platform <- PD.getPlatform;
return $ realArgRegsCover platform }
-- All of them. See comment attached
-- to realArgRegsCover
| '[' globals ']' { $2 }
globals :: { [GlobalRegUse] }
: GLOBALREG { [$1] }
@@ -942,9 +942,9 @@ typenot8 :: { CmmType }
: 'bits16' { b16 }
| 'bits32' { b32 }
| 'bits64' { b64 }
| 'bits128' { b128 }
| 'bits256' { b256 }
| 'bits512' { b512 }
| 'vec128' { cmmVec 2 f64 }
| 'vec256' { cmmVec 4 f64 }
| 'vec512' { cmmVec 8 f64 }
| 'float32' { f32 }
| 'float64' { f64 }
| 'gcptr' {% do platform <- PD.getPlatform; return $ gcWord platform }
@@ -1050,11 +1050,13 @@ machOps = listToUFM $
( "fneg", MO_F_Neg ),
( "fmul", MO_F_Mul ),
( "fquot", MO_F_Quot ),
( "fmin", MO_F_Min ),
( "fmax", MO_F_Max ),
( "fmadd" , MO_FMA FMAdd ),
( "fmsub" , MO_FMA FMSub ),
( "fnmadd", MO_FMA FNMAdd ),
( "fnmsub", MO_FMA FNMSub ),
( "fmadd" , MO_FMA FMAdd 1 ),
( "fmsub" , MO_FMA FMSub 1 ),
( "fnmadd", MO_FMA FNMAdd 1 ),
( "fnmsub", MO_FMA FNMSub 1 ),
( "feq", MO_F_Eq ),
( "fne", MO_F_Ne ),
@@ -1377,7 +1379,7 @@ mkReturnSimple profile actuals updfr_off =
where e = entryCode platform (cmmLoadGCWord platform (CmmStackSlot Old updfr_off))
platform = profilePlatform profile
doRawJump :: CmmParse CmmExpr -> [GlobalReg] -> CmmParse ()
doRawJump :: CmmParse CmmExpr -> [GlobalRegUse] -> CmmParse ()
doRawJump expr_code vols = do
profile <- getProfile
expr <- expr_code
@@ -262,7 +262,7 @@ splitAtProcPoints platform entry_label callPPs procPoints procMap cmmProc = do
let liveness = cmmGlobalLiveness platform g
let ppLiveness pp = filter isArgReg $ regSetToList $
let ppLiveness pp = filter (isArgReg . globalRegUseGlobalReg) $ regSetToList $
expectJust "ppLiveness" $ mapLookup pp liveness
graphEnv <- return $ foldlGraphBlocks add_block mapEmpty g
@@ -96,8 +96,8 @@ instance Outputable CmmReg where
pprReg :: CmmReg -> SDoc
pprReg r
= case r of
CmmLocal local -> pprLocalReg local
CmmGlobal (GlobalRegUse global _) -> pprGlobalReg global
CmmLocal local -> pprLocalReg local
CmmGlobal (GlobalRegUse global _ty) -> pprGlobalReg global
cmmRegType :: CmmReg -> CmmType
cmmRegType (CmmLocal reg) = localRegType reg
@@ -202,6 +202,13 @@ data GlobalReg
| LongReg -- long int registers (64-bit, really)
{-# UNPACK #-} !Int -- its number
-- I think we should redesign 'GlobalReg', for example instead of
-- FloatReg/DoubleReg/XmmReg/YmmReg/ZmmReg we could have a single VecReg
-- which also stores the type we are storing in it.
--
-- We might then be able to get rid of GlobalRegUse, as the type information
-- would already be contained in a 'GlobalReg'.
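-- A purely illustrative sketch of that idea (not part of this patch); the
-- constructor and its fields are hypothetical:
--
--   | VecReg {-# UNPACK #-} !Int   -- its number
--            !Length               -- number of elements
--            !Width                -- width of each element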
| XmmReg -- 128-bit SIMD vector register
{-# UNPACK #-} !Int -- its number
@@ -212,39 +219,40 @@ data GlobalReg
{-# UNPACK #-} !Int -- its number
-- STG registers
| Sp -- Stack ptr; points to last occupied stack location.
| SpLim -- Stack limit
| Hp -- Heap ptr; points to last occupied heap location.
| HpLim -- Heap limit register
| CCCS -- Current cost-centre stack
| CurrentTSO -- pointer to current thread's TSO
| CurrentNursery -- pointer to allocation area
| HpAlloc -- allocation count for heap check failure
| Sp -- ^ Stack ptr; points to last occupied stack location.
| SpLim -- ^ Stack limit
| Hp -- ^ Heap ptr; points to last occupied heap location.
| HpLim -- ^ Heap limit register
| CCCS -- ^ Current cost-centre stack
| CurrentTSO -- ^ pointer to current thread's TSO
| CurrentNursery -- ^ pointer to allocation area
| HpAlloc -- ^ allocation count for heap check failure
-- We keep the address of some commonly-called
-- functions in the register table, to keep code
-- size down:
| EagerBlackholeInfo -- stg_EAGER_BLACKHOLE_info
| GCEnter1 -- stg_gc_enter_1
| GCFun -- stg_gc_fun
| EagerBlackholeInfo -- ^ address of stg_EAGER_BLACKHOLE_info
| GCEnter1 -- ^ address of stg_gc_enter_1
| GCFun -- ^ address of stg_gc_fun
-- Base offset for the register table, used for accessing registers
-- | Base offset for the register table, used for accessing registers
-- which do not have real registers assigned to them. This register
-- will only appear after we have expanded GlobalReg into memory accesses
-- (where necessary) in the native code generator.
| BaseReg
-- The register used by the platform for the C stack pointer. This is
-- | The register used by the platform for the C stack pointer. This is
-- a break in the STG abstraction used exclusively to setup stack unwinding
-- information.
| MachSp
-- The is a dummy register used to indicate to the stack unwinder where
-- | A dummy register used to indicate to the stack unwinder where
-- a routine would return to.
| UnwindReturnReg
-- Base Register for PIC (position-independent code) calculations
-- Only used inside the native code generator. It's exact meaning differs
-- | Base Register for PIC (position-independent code) calculations.
--
-- Only used inside the native code generator. Its exact meaning differs
-- from platform to platform (see module PositionIndependentCode).
| PicBaseReg
@@ -709,7 +709,7 @@ conflicts platform (r, rhs, addr) node
globalRegistersConflict :: Platform -> CmmExpr -> CmmNode e x -> Bool
globalRegistersConflict platform expr node =
-- See Note [Inlining foldRegsDefd]
inline foldRegsDefd platform (\b r -> b || globalRegUsedIn platform r expr)
inline foldRegsDefd platform (\b r -> b || globalRegUsedIn platform (globalRegUseGlobalReg r) expr)
False node
-- Returns True if node defines any local registers that are used in the
@@ -4,7 +4,7 @@ module GHC.Cmm.Type
, cInt
, cmmBits, cmmFloat
, typeWidth, setCmmTypeWidth
, cmmEqType, cmmEqType_ignoring_ptrhood
, cmmEqType, cmmCompatType
, isFloatType, isGcPtrType, isBitsType
, isWordAny, isWord32, isWord64
, isFloat64, isFloat32
@@ -87,21 +87,27 @@ instance Outputable CmmCat where
cmmEqType :: CmmType -> CmmType -> Bool -- Exact equality
cmmEqType (CmmType c1 w1) (CmmType c2 w2) = c1==c2 && w1==w2
cmmEqType_ignoring_ptrhood :: CmmType -> CmmType -> Bool
-- This equality is temporary; used in CmmLint
-- but the RTS files are not yet well-typed wrt pointers
cmmEqType_ignoring_ptrhood (CmmType c1 w1) (CmmType c2 w2)
= c1 `weak_eq` c2 && w1==w2
-- | A weaker notion of equality of 'CmmType's than 'cmmEqType',
-- used (only) in Cmm Lint.
--
-- Why "weaker"? Because:
--
-- - we don't distinguish GcPtr vs NonGcPtr, because the RTS files
-- are not yet well-typed wrt pointers,
-- - for vectors, we only compare the widths, because in practice things like
-- X86 xmm registers support different types of data (e.g. 4xf32, 2xf64, 2xu64 etc).
cmmCompatType :: CmmType -> CmmType -> Bool
cmmCompatType (CmmType c1 w1) (CmmType c2 w2)
= c1 `weak_eq` c2 && w1 == w2
where
weak_eq :: CmmCat -> CmmCat -> Bool
FloatCat `weak_eq` FloatCat = True
FloatCat `weak_eq` _other = False
_other `weak_eq` FloatCat = False
(VecCat l1 cat1) `weak_eq` (VecCat l2 cat2) = l1 == l2
&& cat1 `weak_eq` cat2
(VecCat {}) `weak_eq` _other = False
_other `weak_eq` (VecCat {}) = False
_word1 `weak_eq` _word2 = True -- Ignores GcPtr
FloatCat `weak_eq` FloatCat = True
FloatCat `weak_eq` _other = False
_other `weak_eq` FloatCat = False
(VecCat {}) `weak_eq` (VecCat {}) = True -- only compare overall width
(VecCat {}) `weak_eq` _other = False
_other `weak_eq` (VecCat {}) = False
_word1 `weak_eq` _word2 = True -- Ignores GcPtr
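-- Illustrative examples of this weaker equality (not part of the patch):
--
--   cmmCompatType (cmmVec 4 f32) (cmmVec 2 f64)  ==  True    -- both are 128-bit vectors
--   cmmCompatType b64 f64                        ==  False   -- word vs. float
--   cmmCompatType b32 b64                        ==  False   -- widths differ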
--- Simple operations on CmmType -----
typeWidth :: CmmType -> Width
@@ -240,6 +240,7 @@ finishNativeGen logger config modLoc bufh us ngs
-- dump global NCG stats for graph coloring allocator
let stats = concat (ngs_colorStats ngs)
platform = ncgPlatform config
unless (null stats) $ do
-- build the global register conflict graph
@@ -250,7 +251,7 @@ finishNativeGen logger config modLoc bufh us ngs
dump_stats (Color.pprStats stats graphGlobal)
let platform = ncgPlatform config
putDumpFileMaybe logger
Opt_D_dump_asm_conflicts "Register conflict graph"
FormatText
@@ -265,7 +266,7 @@ finishNativeGen logger config modLoc bufh us ngs
-- dump global NCG stats for linear allocator
let linearStats = concat (ngs_linearStats ngs)
unless (null linearStats) $
dump_stats (Linear.pprStats (concat (ngs_natives ngs)) linearStats)
dump_stats (Linear.pprStats platform (concat (ngs_natives ngs)) linearStats)
-- write out the imports
let ctx = ncgAsmContext config
@@ -506,7 +507,7 @@ cmmNativeGen logger ncgImpl us fileIds dbgMap cmm count
if ( ncgRegsGraph config || ncgRegsIterative config )
then do
-- the regs usable for allocation
let (alloc_regs :: UniqFM RegClass (UniqSet RealReg))
let alloc_regs :: UniqFM RegClass (UniqSet RealReg)
= foldr (\r -> plusUFM_C unionUniqSets
$ unitUFM (targetClassOfRealReg platform r) (unitUniqSet r))
emptyUFM
@@ -44,7 +44,7 @@ ncgAArch64 config
-- | Instruction instance for aarch64
instance Instruction AArch64.Instr where
regUsageOfInstr = AArch64.regUsageOfInstr
patchRegsOfInstr = AArch64.patchRegsOfInstr
patchRegsOfInstr _ = AArch64.patchRegsOfInstr
isJumpishInstr = AArch64.isJumpishInstr
jumpDestsOfInstr = AArch64.jumpDestsOfInstr
canFallthroughTo = AArch64.canFallthroughTo
@@ -54,7 +54,7 @@ instance Instruction AArch64.Instr where
takeDeltaInstr = AArch64.takeDeltaInstr
isMetaInstr = AArch64.isMetaInstr
mkRegRegMoveInstr _ = AArch64.mkRegRegMoveInstr
takeRegRegMoveInstr = AArch64.takeRegRegMoveInstr
takeRegRegMoveInstr _ = AArch64.takeRegRegMoveInstr
mkJumpInstr = AArch64.mkJumpInstr
mkStackAllocInstr = AArch64.mkStackAllocInstr
mkStackDeallocInstr = AArch64.mkStackDeallocInstr
@@ -758,8 +758,79 @@ getRegister' config plat expr
-- Conversions
MO_XX_Conv _from to -> swizzleRegisterRep (intFormat to) <$> getRegister e
_ -> pprPanic "getRegister' (monadic CmmMachOp):" (pdoc plat expr)
MO_Eq {} -> notUnary
MO_Ne {} -> notUnary
MO_Mul {} -> notUnary
MO_S_MulMayOflo {} -> notUnary
MO_S_Quot {} -> notUnary
MO_S_Rem {} -> notUnary
MO_U_Quot {} -> notUnary
MO_U_Rem {} -> notUnary
MO_S_Ge {} -> notUnary
MO_S_Le {} -> notUnary
MO_S_Gt {} -> notUnary
MO_S_Lt {} -> notUnary
MO_U_Ge {} -> notUnary
MO_U_Le {} -> notUnary
MO_U_Gt {} -> notUnary
MO_U_Lt {} -> notUnary
MO_F_Add {} -> notUnary
MO_F_Sub {} -> notUnary
MO_F_Mul {} -> notUnary
MO_F_Quot {} -> notUnary
MO_FMA {} -> notUnary
MO_F_Eq {} -> notUnary
MO_F_Ne {} -> notUnary
MO_F_Ge {} -> notUnary
MO_F_Le {} -> notUnary
MO_F_Gt {} -> notUnary
MO_F_Lt {} -> notUnary
MO_And {} -> notUnary
MO_Or {} -> notUnary
MO_Xor {} -> notUnary
MO_Shl {} -> notUnary
MO_U_Shr {} -> notUnary
MO_S_Shr {} -> notUnary
MO_V_Insert {} -> notUnary
MO_V_Extract {} -> notUnary
MO_V_Add {} -> notUnary
MO_V_Sub {} -> notUnary
MO_V_Mul {} -> notUnary
MO_VS_Quot {} -> notUnary
MO_VS_Rem {} -> notUnary
MO_VS_Neg {} -> notUnary
MO_VU_Quot {} -> notUnary
MO_VU_Rem {} -> notUnary
MO_V_Shuffle {} -> notUnary
MO_VF_Shuffle {} -> notUnary
MO_VF_Insert {} -> notUnary
MO_VF_Extract {} -> notUnary
MO_VF_Add {} -> notUnary
MO_VF_Sub {} -> notUnary
MO_VF_Mul {} -> notUnary
MO_VF_Quot {} -> notUnary
MO_Add {} -> notUnary
MO_Sub {} -> notUnary
MO_F_Min {} -> notUnary
MO_F_Max {} -> notUnary
MO_VU_Min {} -> notUnary
MO_VU_Max {} -> notUnary
MO_VS_Min {} -> notUnary
MO_VS_Max {} -> notUnary
MO_VF_Min {} -> notUnary
MO_VF_Max {} -> notUnary
MO_AlignmentCheck {} ->
pprPanic "getRegister' (monadic CmmMachOp):" (pdoc plat expr)
MO_V_Broadcast {} -> vectorsNeedLlvm
MO_VF_Broadcast {} -> vectorsNeedLlvm
MO_VF_Neg {} -> vectorsNeedLlvm
where
notUnary = pprPanic "getRegister' (non-unary CmmMachOp with 1 argument):" (pdoc plat expr)
vectorsNeedLlvm =
sorry "SIMD operations on AArch64 currently require the LLVM backend"
toImm W8 = (OpImm (ImmInt 7))
toImm W16 = (OpImm (ImmInt 15))
toImm W32 = (OpImm (ImmInt 31))
@@ -1064,6 +1135,8 @@ getRegister' config plat expr
MO_F_Sub w -> floatOp w (\d x y -> unitOL $ SUB d x y)
MO_F_Mul w -> floatOp w (\d x y -> unitOL $ MUL d x y)
MO_F_Quot w -> floatOp w (\d x y -> unitOL $ SDIV d x y)
MO_F_Min w -> floatOp w (\d x y -> unitOL $ FMIN d x y)
MO_F_Max w -> floatOp w (\d x y -> unitOL $ FMAX d x y)
-- Floating point comparison
MO_F_Eq w -> floatCond w (\d x y -> toOL [ CMP x y, CSET d EQ ])
@@ -1087,10 +1160,56 @@ getRegister' config plat expr
MO_U_Shr w -> intOp False w (\d x y -> unitOL $ LSR d x y)
MO_S_Shr w -> intOp True w (\d x y -> unitOL $ ASR d x y)
-- TODO
op -> pprPanic "getRegister' (unhandled dyadic CmmMachOp): " $
(pprMachOp op) <+> text "in" <+> (pdoc plat expr)
-- Non-dyadic MachOp with 2 arguments
MO_S_Neg {} -> notDyadic
MO_F_Neg {} -> notDyadic
MO_FMA {} -> notDyadic
MO_Not {} -> notDyadic
MO_SF_Round {} -> notDyadic
MO_FS_Truncate {} -> notDyadic
MO_SS_Conv {} -> notDyadic
MO_UU_Conv {} -> notDyadic
MO_XX_Conv {} -> notDyadic
MO_FF_Conv {} -> notDyadic
MO_WF_Bitcast {} -> notDyadic
MO_FW_Bitcast {} -> notDyadic
MO_V_Broadcast {} -> notDyadic
MO_VF_Broadcast {} -> notDyadic
MO_V_Insert {} -> notDyadic
MO_VF_Insert {} -> notDyadic
MO_AlignmentCheck {} -> notDyadic
MO_RelaxedRead {} -> notDyadic
-- Vector operations: currently unsupported in the AArch64 NCG.
MO_V_Extract {} -> vectorsNeedLlvm
MO_V_Add {} -> vectorsNeedLlvm
MO_V_Sub {} -> vectorsNeedLlvm
MO_V_Mul {} -> vectorsNeedLlvm
MO_VS_Quot {} -> vectorsNeedLlvm
MO_VS_Rem {} -> vectorsNeedLlvm
MO_VS_Neg {} -> vectorsNeedLlvm
MO_VU_Quot {} -> vectorsNeedLlvm
MO_VU_Rem {} -> vectorsNeedLlvm
MO_VF_Extract {} -> vectorsNeedLlvm
MO_VF_Add {} -> vectorsNeedLlvm
MO_VF_Sub {} -> vectorsNeedLlvm
MO_VF_Neg {} -> vectorsNeedLlvm
MO_VF_Mul {} -> vectorsNeedLlvm
MO_VF_Quot {} -> vectorsNeedLlvm
MO_V_Shuffle {} -> vectorsNeedLlvm
MO_VF_Shuffle {} -> vectorsNeedLlvm
MO_VU_Min {} -> vectorsNeedLlvm
MO_VU_Max {} -> vectorsNeedLlvm
MO_VS_Min {} -> vectorsNeedLlvm
MO_VS_Max {} -> vectorsNeedLlvm
MO_VF_Min {} -> vectorsNeedLlvm
MO_VF_Max {} -> vectorsNeedLlvm
where
notDyadic =
pprPanic "getRegister' (non-dyadic CmmMachOp with 2 arguments): " $
(pprMachOp op) <+> text "in" <+> (pdoc plat expr)
vectorsNeedLlvm =
sorry "SIMD operations on AArch64 currently require the LLVM backend"
-- Generic ternary case.
CmmMachOp op [x, y, z] ->
@@ -1104,16 +1223,25 @@ getRegister' config plat expr
-- x86 fnmadd - x * y + z <=> AArch64 fmsub : d = - r1 * r2 + r3
-- x86 fnmsub - x * y - z <=> AArch64 fnmadd: d = - r1 * r2 - r3
MO_FMA var w -> case var of
FMAdd -> float3Op w (\d n m a -> unitOL $ FMA FMAdd d n m a)
FMSub -> float3Op w (\d n m a -> unitOL $ FMA FNMSub d n m a)
FNMAdd -> float3Op w (\d n m a -> unitOL $ FMA FMSub d n m a)
FNMSub -> float3Op w (\d n m a -> unitOL $ FMA FNMAdd d n m a)
MO_FMA var l w
| l == 1
-> case var of
FMAdd -> float3Op w (\d n m a -> unitOL $ FMA FMAdd d n m a)
FMSub -> float3Op w (\d n m a -> unitOL $ FMA FNMSub d n m a)
FNMAdd -> float3Op w (\d n m a -> unitOL $ FMA FMSub d n m a)
FNMSub -> float3Op w (\d n m a -> unitOL $ FMA FNMAdd d n m a)
| otherwise
-> vectorsNeedLlvm
MO_V_Insert {} -> vectorsNeedLlvm
MO_VF_Insert {} -> vectorsNeedLlvm
_ -> pprPanic "getRegister' (unhandled ternary CmmMachOp): " $
(pprMachOp op) <+> text "in" <+> (pdoc plat expr)
where
vectorsNeedLlvm =
sorry "SIMD operations on AArch64 currently require the LLVM backend"
float3Op w op = do
(reg_fx, format_x, code_fx) <- getFloatReg x
(reg_fy, format_y, code_fy) <- getFloatReg y
@@ -15,6 +15,7 @@ import GHC.CmmToAsm.Types
import GHC.CmmToAsm.Utils
import GHC.CmmToAsm.Config
import GHC.Platform.Reg
import GHC.Platform.Reg.Class.Unified
import GHC.Platform.Regs
import GHC.Cmm.BlockId
@@ -30,6 +31,7 @@ import GHC.Utils.Panic
import Data.Maybe (fromMaybe)
import GHC.Stack
import GHC.CmmToAsm.Reg.Target (targetClassOfReg)
-- | LR and FP (8 bytes each) are the prologue of each stack frame
stackFrameHeaderSize :: Int
@@ -143,6 +145,8 @@ regUsageOfInstr platform instr = case instr of
FCVTZS dst src -> usage (regOp src, regOp dst)
FABS dst src -> usage (regOp src, regOp dst)
FSQRT dst src -> usage (regOp src, regOp dst)
FMIN dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst)
FMAX dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst)
FMA _ dst src1 src2 src3 ->
usage (regOp src1 ++ regOp src2 ++ regOp src3, regOp dst)
@@ -153,8 +157,15 @@ regUsageOfInstr platform instr = case instr of
-- filtering the usage is necessary, otherwise the register
-- allocator will try to allocate pre-defined fixed stg
-- registers as well, as they show up.
usage (src, dst) = RU (filter (interesting platform) src)
(filter (interesting platform) dst)
usage (src, dst) = RU (map mkFmt $ filter (interesting platform) src)
(map mkFmt $ filter (interesting platform) dst)
-- SIMD NCG TODO: the format here is used for register spilling/unspilling.
-- As the AArch64 NCG does not currently support SIMD registers,
-- this simple logic is OK.
mkFmt r = RegFormat r fmt
where fmt = case targetClassOfReg platform r of
RcInteger -> II64
RcFloatOrVector -> FF64
regAddr :: AddrMode -> [Reg]
regAddr (AddrRegReg r1 r2) = [r1, r2]
@@ -290,6 +301,8 @@ patchRegsOfInstr instr env = case instr of
FCVTZS o1 o2 -> FCVTZS (patchOp o1) (patchOp o2)
FABS o1 o2 -> FABS (patchOp o1) (patchOp o2)
FSQRT o1 o2 -> FSQRT (patchOp o1) (patchOp o2)
FMIN o1 o2 o3 -> FMIN (patchOp o1) (patchOp o2) (patchOp o3)
FMAX o1 o2 o3 -> FMAX (patchOp o1) (patchOp o2) (patchOp o3)
FMA s o1 o2 o3 o4 ->
FMA s (patchOp o1) (patchOp o2) (patchOp o3) (patchOp o4)
@@ -378,12 +391,12 @@ patchJumpInstr instr patchF
mkSpillInstr
:: HasCallStack
=> NCGConfig
-> Reg -- register to spill
-> RegFormat -- register to spill
-> Int -- current stack delta
-> Int -- spill slot to use
-> [Instr]
mkSpillInstr config reg delta slot =
mkSpillInstr config (RegFormat reg fmt) delta slot =
case off - delta of
imm | -256 <= imm && imm <= 255 -> [ mkStrSp imm ]
imm | imm > 0 && imm .&. 0x7 == 0x0 && imm <= 0xfff -> [ mkStrSp imm ]
@@ -394,8 +407,8 @@ mkSpillInstr config reg delta slot =
where
a .&~. b = a .&. (complement b)
fmt = fmtOfRealReg (case reg of { RegReal r -> r; _ -> panic "Expected real reg"})
-- SIMD NCG TODO: emit the correct instructions to spill a vector register.
-- You can take inspiration from the X86_64 backend.
mkIp0SpillAddr imm = ANN (text "Spill: IP0 <- SP + " <> int imm) $ ADD ip0 sp (OpImm (ImmInt imm))
mkStrSp imm = ANN (text "Spill@" <> int (off - delta)) $ STR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 31) (ImmInt imm)))
mkStrIp0 imm = ANN (text "Spill@" <> int (off - delta)) $ STR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 16) (ImmInt imm)))
@@ -404,12 +417,11 @@ mkSpillInstr config reg delta slot =
mkLoadInstr
:: NCGConfig
-> Reg -- register to load
-> RegFormat
-> Int -- current stack delta
-> Int -- spill slot to use
-> [Instr]
mkLoadInstr config reg delta slot =
mkLoadInstr config (RegFormat reg fmt) delta slot =
case off - delta of
imm | -256 <= imm && imm <= 255 -> [ mkLdrSp imm ]
imm | imm > 0 && imm .&. 0x7 == 0x0 && imm <= 0xfff -> [ mkLdrSp imm ]
@@ -420,8 +432,8 @@ mkLoadInstr config reg delta slot =
where
a .&~. b = a .&. (complement b)
fmt = fmtOfRealReg (case reg of { RegReal r -> r; _ -> panic "Expected real reg"})
-- SIMD NCG TODO: emit the correct instructions to load a vector register.
-- You can take inspiration from the X86_64 backend.
mkIp0SpillAddr imm = ANN (text "Reload: IP0 <- SP + " <> int imm) $ ADD ip0 sp (OpImm (ImmInt imm))
mkLdrSp imm = ANN (text "Reload@" <> int (off - delta)) $ LDR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 31) (ImmInt imm)))
mkLdrIp0 imm = ANN (text "Reload@" <> int (off - delta)) $ LDR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 16) (ImmInt imm)))
......@@ -451,8 +463,10 @@ isMetaInstr instr
-- | Copy the value in a register to another one.
-- Must work for all register classes.
mkRegRegMoveInstr :: Reg -> Reg -> Instr
mkRegRegMoveInstr src dst = ANN (text "Reg->Reg Move: " <> ppr src <> text " -> " <> ppr dst) $ MOV (OpReg W64 dst) (OpReg W64 src)
mkRegRegMoveInstr :: Format -> Reg -> Reg -> Instr
mkRegRegMoveInstr _fmt src dst
= ANN (text "Reg->Reg Move: " <> ppr src <> text " -> " <> ppr dst) $ MOV (OpReg W64 dst) (OpReg W64 src)
-- SIMD NCG TODO: incorrect for vector formats
-- | Take the source and destination from this reg -> reg move instruction
-- or Nothing if it's not one
@@ -661,6 +675,10 @@ data Instr
| FCVTZS Operand Operand
-- Float ABSolute value
| FABS Operand Operand
-- Float minimum
| FMIN Operand Operand Operand
-- Float maximum
| FMAX Operand Operand Operand
-- Float SQuare RooT
| FSQRT Operand Operand
@@ -737,6 +755,8 @@ instrCon i =
FCVTZS{} -> "FCVTZS"
FABS{} -> "FABS"
FSQRT{} -> "FSQRT"
FMIN {} -> "FMIN"
FMAX {} -> "FMAX"
FMA variant _ _ _ _ ->
case variant of
FMAdd -> "FMADD"