Commit acd79558 authored by Abhiroop Sarkar, committed by Marge Bot

Add support for SIMD operations in the NCG

This adds support for constructing vector types from Float#, Double# etc
and performing arithmetic operations on them
Cleaned-Up-By: Ben Gamari <ben@well-typed.com>
parent df3e5b74
Pipeline #7910 passed with stages
in 297 minutes and 3 seconds
......@@ -64,13 +64,20 @@ assignArgumentsPos dflags off conv arg_ty reps = (stk_off, assignments)
assign_regs assts (r:rs) regs | isVecType ty = vec
| isFloatType ty = float
| otherwise = int
where vec = case (w, regs) of
(W128, (vs, fs, ds, ls, s:ss))
| passVectorInReg W128 dflags -> k (RegisterParam (XmmReg s), (vs, fs, ds, ls, ss))
(W256, (vs, fs, ds, ls, s:ss))
| passVectorInReg W256 dflags -> k (RegisterParam (YmmReg s), (vs, fs, ds, ls, ss))
(W512, (vs, fs, ds, ls, s:ss))
| passVectorInReg W512 dflags -> k (RegisterParam (ZmmReg s), (vs, fs, ds, ls, ss))
where vec = case regs of
(vs, fs, ds, ls, s:ss)
| passVectorInReg w dflags
-> let elt_ty = vecElemType ty
reg_ty = if isFloatType elt_ty
then Float else Integer
reg_class = case w of
W128 -> XmmReg
W256 -> YmmReg
W512 -> ZmmReg
_ -> panic "CmmCallConv.assignArgumentsPos: Invalid vector width"
in k (RegisterParam
(reg_class s (vecLength ty) (typeWidth elt_ty) reg_ty),
(vs, fs, ds, ls, ss))
_ -> (assts, (r:rs))
float = case (w, regs) of
(W32, (vs, fs, ds, ls, s:ss))
......@@ -89,6 +96,7 @@ assignArgumentsPos dflags off conv arg_ty reps = (stk_off, assignments)
(_, (vs, fs, ds, l:ls, ss)) | widthInBits w > widthInBits (wordWidth dflags)
-> k (RegisterParam l, (vs, fs, ds, ls, ss))
_ -> (assts, (r:rs))
k (asst, regs') = assign_regs ((r, asst) : assts) rs regs'
ty = arg_ty r
w = typeWidth ty
......@@ -202,11 +210,13 @@ nodeOnly = ([VanillaReg 1], [], [], [], [])
-- only use this functionality in hand-written C-- code in the RTS.
realArgRegsCover :: DynFlags -> [GlobalReg]
realArgRegsCover dflags
| passFloatArgsInXmm dflags = map ($VGcPtr) (realVanillaRegs dflags) ++
realLongRegs dflags ++
map XmmReg (realXmmRegNos dflags)
| otherwise = map ($VGcPtr) (realVanillaRegs dflags) ++
realFloatRegs dflags ++
realDoubleRegs dflags ++
realLongRegs dflags ++
map XmmReg (realXmmRegNos dflags)
| passFloatArgsInXmm dflags
= map ($VGcPtr) (realVanillaRegs dflags) ++
realLongRegs dflags ++
map (\x -> XmmReg x 2 W64 Integer) (realXmmRegNos dflags)
| otherwise
= map ($VGcPtr) (realVanillaRegs dflags) ++
realFloatRegs dflags ++
realDoubleRegs dflags ++
realLongRegs dflags ++
map (\x -> XmmReg x 2 W64 Integer) (realXmmRegNos dflags)
......@@ -14,6 +14,7 @@ module CmmExpr
, currentTSOReg, currentNurseryReg, hpAllocReg, cccsReg
, node, baseReg
, VGcPtr(..)
, GlobalVecRegTy(..)
, DefinerOfRegs, UserOfRegs
, foldRegsDefd, foldRegsUsed
......@@ -41,6 +42,7 @@ import Outputable (panic)
import Unique
import Data.Set (Set)
import Data.Monoid ((<>))
import qualified Data.Set as Set
import BasicTypes (Alignment, mkAlignment, alignmentOf)
......@@ -392,6 +394,7 @@ data VGcPtr = VGcPtr | VNonGcPtr deriving( Eq, Show )
-----------------------------------------------------------------------------
{-
Note [Overlapping global registers]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The backend might not faithfully implement the abstraction of the STG
machine with independent registers for different values of type
......@@ -413,6 +416,26 @@ on a particular platform. The instance Eq GlobalReg is syntactic
equality of STG registers and does not take overlap into
account. However it is still used in UserOfRegs/DefinerOfRegs and
there are likely still bugs there, beware!
Note [SIMD registers]
~~~~~~~~~~~~~~~~~~~~~
GHC's treatment of SIMD registers is heavily modelled after the x86_64
architecture. Namely we have 128- (XMM), 256- (YMM), and 512-bit (ZMM)
registers. Furthermore, we treat each possible format in these registers as a
distinct register which overlaps with the others. For instance, XMM1 as a
2xI64 register is distinct from but overlaps with (in the sense defined in Note
[Overlapping global registers]) its use as a 4xI32 register.
This model makes it easier to fit SIMD registers into the NCG, which generally
expects that each global register has a single, known CmmType.
In the future we could consider further refactoring this to eliminate the
XMM, YMM, and ZMM register names (which are quite x86-specific) and instead just
having a set of NxM-bit vector registers (e.g. Vec2x64A, Vec2x64B, ...,
Vec4x32A, ..., Vec4x64A).
-}
data GlobalReg
......@@ -432,12 +455,15 @@ data GlobalReg
| XmmReg -- 128-bit SIMD vector register
{-# UNPACK #-} !Int -- its number
!Length !Width !GlobalVecRegTy
| YmmReg -- 256-bit SIMD vector register
{-# UNPACK #-} !Int -- its number
!Length !Width !GlobalVecRegTy
| ZmmReg -- 512-bit SIMD vector register
{-# UNPACK #-} !Int -- its number
!Length !Width !GlobalVecRegTy
-- STG registers
| Sp -- Stack ptr; points to last occupied stack location.
......@@ -478,17 +504,17 @@ data GlobalReg
deriving( Show )
-- | Element category carried by the SIMD global registers ('XmmReg',
-- 'YmmReg', 'ZmmReg'). 'globalRegType' maps 'Integer' to 'cmmBits' lanes and
-- 'Float' to 'cmmFloat' lanes.
data GlobalVecRegTy = Integer | Float
deriving (Show, Eq, Ord)
instance Eq GlobalReg where
VanillaReg i _ == VanillaReg j _ = i==j -- Ignore type when seeking clashes
FloatReg i == FloatReg j = i==j
DoubleReg i == DoubleReg j = i==j
LongReg i == LongReg j = i==j
-- NOTE: XMM, YMM, ZMM registers actually are the same registers
-- at least with respect to store at YMM i and then read from XMM i
-- and similarly for ZMM etc.
XmmReg i == XmmReg j = i==j
YmmReg i == YmmReg j = i==j
ZmmReg i == ZmmReg j = i==j
XmmReg i l w grt == XmmReg j l' w' grt' = i==j && l == l' && w == w' && grt == grt'
YmmReg i l w grt == YmmReg j l' w' grt' = i==j && l == l' && w == w' && grt == grt'
ZmmReg i l w grt == ZmmReg j l' w' grt' = i==j && l == l' && w == w' && grt == grt'
Sp == Sp = True
SpLim == SpLim = True
Hp == Hp = True
......@@ -512,9 +538,21 @@ instance Ord GlobalReg where
compare (FloatReg i) (FloatReg j) = compare i j
compare (DoubleReg i) (DoubleReg j) = compare i j
compare (LongReg i) (LongReg j) = compare i j
compare (XmmReg i) (XmmReg j) = compare i j
compare (YmmReg i) (YmmReg j) = compare i j
compare (ZmmReg i) (ZmmReg j) = compare i j
compare (XmmReg i l w grt)
(XmmReg j l' w' grt') = compare i j
<> compare l l'
<> compare w w'
<> compare grt grt'
compare (YmmReg i l w grt)
(YmmReg j l' w' grt') = compare i j
<> compare l l'
<> compare w w'
<> compare grt grt'
compare (ZmmReg i l w grt)
(ZmmReg j l' w' grt') = compare i j
<> compare l l'
<> compare w w'
<> compare grt grt'
compare Sp Sp = EQ
compare SpLim SpLim = EQ
compare Hp Hp = EQ
......@@ -538,12 +576,12 @@ instance Ord GlobalReg where
compare _ (DoubleReg _) = GT
compare (LongReg _) _ = LT
compare _ (LongReg _) = GT
compare (XmmReg _) _ = LT
compare _ (XmmReg _) = GT
compare (YmmReg _) _ = LT
compare _ (YmmReg _) = GT
compare (ZmmReg _) _ = LT
compare _ (ZmmReg _) = GT
compare (XmmReg _ _ _ _) _ = LT
compare _ (XmmReg _ _ _ _) = GT
compare (YmmReg _ _ _ _) _ = LT
compare _ (YmmReg _ _ _ _) = GT
compare (ZmmReg _ _ _ _) _ = LT
compare _ (ZmmReg _ _ _ _) = GT
compare Sp _ = LT
compare _ Sp = GT
compare SpLim _ = LT
......@@ -596,12 +634,15 @@ globalRegType dflags (VanillaReg _ VNonGcPtr) = bWord dflags
globalRegType _ (FloatReg _) = cmmFloat W32
globalRegType _ (DoubleReg _) = cmmFloat W64
globalRegType _ (LongReg _) = cmmBits W64
-- TODO: improve the internal model of SIMD/vectorized registers
-- the right design SHOULD improve handling of float and double code too.
-- see remarks in "NOTE [SIMD Design for the future]" in StgCmmPrim
globalRegType _ (XmmReg _) = cmmVec 4 (cmmBits W32)
globalRegType _ (YmmReg _) = cmmVec 8 (cmmBits W32)
globalRegType _ (ZmmReg _) = cmmVec 16 (cmmBits W32)
globalRegType _ (XmmReg _ l w ty) = case ty of
Integer -> cmmVec l (cmmBits w)
Float -> cmmVec l (cmmFloat w)
globalRegType _ (YmmReg _ l w ty) = case ty of
Integer -> cmmVec l (cmmBits w)
Float -> cmmVec l (cmmFloat w)
globalRegType _ (ZmmReg _ l w ty) = case ty of
Integer -> cmmVec l (cmmBits w)
Float -> cmmVec l (cmmFloat w)
globalRegType dflags Hp = gcWord dflags
-- The initialiser for all
......
......@@ -148,9 +148,13 @@ lintCmmMiddle node = case node of
dflags <- getDynFlags
erep <- lintCmmExpr expr
let reg_ty = cmmRegType dflags reg
if (erep `cmmEqType_ignoring_ptrhood` reg_ty)
then return ()
else cmmLintAssignErr (CmmAssign reg expr) erep reg_ty
case isVecCatType reg_ty of
True -> if ((typeWidth reg_ty) == (typeWidth erep))
then return ()
else cmmLintAssignErr (CmmAssign reg expr) erep reg_ty
_ -> if (erep `cmmEqType_ignoring_ptrhood` reg_ty)
then return ()
else cmmLintAssignErr (CmmAssign reg expr) erep reg_ty
CmmStore l r -> do
_ <- lintCmmExpr l
......
......@@ -136,8 +136,9 @@ data MachOp
| MO_VU_Rem Length Width
-- Floating point vector element insertion and extraction operations
| MO_VF_Insert Length Width -- Insert scalar into vector
| MO_VF_Extract Length Width -- Extract scalar from vector
| MO_VF_Broadcast Length Width -- Broadcast a scalar into a vector
| MO_VF_Insert Length Width -- Insert scalar into vector
| MO_VF_Extract Length Width -- Extract scalar from vector
-- Floating point vector operations
| MO_VF_Add Length Width
......@@ -430,6 +431,7 @@ machOpResultType dflags mop tys =
MO_VU_Quot l w -> cmmVec l (cmmBits w)
MO_VU_Rem l w -> cmmVec l (cmmBits w)
MO_VF_Broadcast l w -> cmmVec l (cmmFloat w)
MO_VF_Insert l w -> cmmVec l (cmmFloat w)
MO_VF_Extract _ w -> cmmFloat w
......@@ -522,16 +524,21 @@ machOpArgReps dflags op =
MO_VU_Quot _ r -> [r,r]
MO_VU_Rem _ r -> [r,r]
MO_VF_Insert l r -> [typeWidth (vec l (cmmFloat r)),r,wordWidth dflags]
MO_VF_Extract l r -> [typeWidth (vec l (cmmFloat r)),wordWidth dflags]
-- offset is always W32 as mentioned in StgCmmPrim.hs
MO_VF_Broadcast l r -> [vecwidth l r, r]
MO_VF_Insert l r -> [vecwidth l r, r, W32]
MO_VF_Extract l r -> [vecwidth l r, W32]
MO_VF_Add _ r -> [r,r]
MO_VF_Sub _ r -> [r,r]
MO_VF_Mul _ r -> [r,r]
MO_VF_Quot _ r -> [r,r]
MO_VF_Neg _ r -> [r]
-- NOTE: The below is owing to the fact that floats use the SSE registers
MO_VF_Add l w -> [vecwidth l w, vecwidth l w]
MO_VF_Sub l w -> [vecwidth l w, vecwidth l w]
MO_VF_Mul l w -> [vecwidth l w, vecwidth l w]
MO_VF_Quot l w -> [vecwidth l w, vecwidth l w]
MO_VF_Neg l w -> [vecwidth l w]
MO_AlignmentCheck _ r -> [r]
where
vecwidth l w = widthFromBytes (l*widthInBytes w)
-----------------------------------------------------------------------------
-- CallishMachOp
......
......@@ -6,6 +6,7 @@ module CmmType
, typeWidth, cmmEqType, cmmEqType_ignoring_ptrhood
, isFloatType, isGcPtrType, isBitsType
, isWord32, isWord64, isFloat64, isFloat32
, isVecCatType
, Width(..)
, widthInBits, widthInBytes, widthInLog, widthFromBytes
......@@ -133,7 +134,7 @@ cInt :: DynFlags -> CmmType
cInt dflags = cmmBits (cIntWidth dflags)
------------ Predicates ----------------
isFloatType, isGcPtrType, isBitsType :: CmmType -> Bool
isFloatType, isGcPtrType, isBitsType, isVecCatType :: CmmType -> Bool
isFloatType (CmmType FloatCat _) = True
isFloatType _other = False
......@@ -143,6 +144,9 @@ isGcPtrType _other = False
isBitsType (CmmType BitsCat _) = True
isBitsType _ = False
-- | True exactly when the type's category is 'VecCat' (i.e. a vector type),
-- regardless of the vector's length, element category or width.
isVecCatType (CmmType (VecCat _ _) _) = True
isVecCatType _other = False
isWord32, isWord64, isFloat32, isFloat64 :: CmmType -> Bool
-- isWord64 is true of 64-bit non-floats (both gc-ptrs and otherwise)
-- isFloat32 and 64 are obvious
......
......@@ -713,6 +713,10 @@ pprMachOp_for_C mop = case mop of
(panic $ "PprC.pprMachOp_for_C: MO_VU_Rem"
++ " should have been handled earlier!")
MO_VF_Broadcast {} -> pprTrace "offending mop:"
(text "MO_VF_Broadcast")
(panic $ "PprC.pprMachOp_for_C: MO_VF_Broadcast"
++ " should have been handled earlier!")
MO_VF_Insert {} -> pprTrace "offending mop:"
(text "MO_VF_Insert")
(panic $ "PprC.pprMachOp_for_C: MO_VF_Insert"
......
......@@ -261,9 +261,9 @@ pprGlobalReg gr
FloatReg n -> char 'F' <> int n
DoubleReg n -> char 'D' <> int n
LongReg n -> char 'L' <> int n
XmmReg n -> text "XMM" <> int n
YmmReg n -> text "YMM" <> int n
ZmmReg n -> text "ZMM" <> int n
XmmReg n _ _ _ -> text "XMM" <> int n
YmmReg n _ _ _ -> text "YMM" <> int n
ZmmReg n _ _ _ -> text "ZMM" <> int n
Sp -> text "Sp"
SpLim -> text "SpLim"
Hp -> text "Hp"
......
......@@ -57,27 +57,27 @@ baseRegOffset dflags (DoubleReg 4) = oFFSET_StgRegTable_rD4 dflags
baseRegOffset dflags (DoubleReg 5) = oFFSET_StgRegTable_rD5 dflags
baseRegOffset dflags (DoubleReg 6) = oFFSET_StgRegTable_rD6 dflags
baseRegOffset _ (DoubleReg n) = panic ("Registers above D6 are not supported (tried to use D" ++ show n ++ ")")
baseRegOffset dflags (XmmReg 1) = oFFSET_StgRegTable_rXMM1 dflags
baseRegOffset dflags (XmmReg 2) = oFFSET_StgRegTable_rXMM2 dflags
baseRegOffset dflags (XmmReg 3) = oFFSET_StgRegTable_rXMM3 dflags
baseRegOffset dflags (XmmReg 4) = oFFSET_StgRegTable_rXMM4 dflags
baseRegOffset dflags (XmmReg 5) = oFFSET_StgRegTable_rXMM5 dflags
baseRegOffset dflags (XmmReg 6) = oFFSET_StgRegTable_rXMM6 dflags
baseRegOffset _ (XmmReg n) = panic ("Registers above XMM6 are not supported (tried to use XMM" ++ show n ++ ")")
baseRegOffset dflags (YmmReg 1) = oFFSET_StgRegTable_rYMM1 dflags
baseRegOffset dflags (YmmReg 2) = oFFSET_StgRegTable_rYMM2 dflags
baseRegOffset dflags (YmmReg 3) = oFFSET_StgRegTable_rYMM3 dflags
baseRegOffset dflags (YmmReg 4) = oFFSET_StgRegTable_rYMM4 dflags
baseRegOffset dflags (YmmReg 5) = oFFSET_StgRegTable_rYMM5 dflags
baseRegOffset dflags (YmmReg 6) = oFFSET_StgRegTable_rYMM6 dflags
baseRegOffset _ (YmmReg n) = panic ("Registers above YMM6 are not supported (tried to use YMM" ++ show n ++ ")")
baseRegOffset dflags (ZmmReg 1) = oFFSET_StgRegTable_rZMM1 dflags
baseRegOffset dflags (ZmmReg 2) = oFFSET_StgRegTable_rZMM2 dflags
baseRegOffset dflags (ZmmReg 3) = oFFSET_StgRegTable_rZMM3 dflags
baseRegOffset dflags (ZmmReg 4) = oFFSET_StgRegTable_rZMM4 dflags
baseRegOffset dflags (ZmmReg 5) = oFFSET_StgRegTable_rZMM5 dflags
baseRegOffset dflags (ZmmReg 6) = oFFSET_StgRegTable_rZMM6 dflags
baseRegOffset _ (ZmmReg n) = panic ("Registers above ZMM6 are not supported (tried to use ZMM" ++ show n ++ ")")
baseRegOffset dflags (XmmReg 1 _ _ _) = oFFSET_StgRegTable_rXMM1 dflags
baseRegOffset dflags (XmmReg 2 _ _ _) = oFFSET_StgRegTable_rXMM2 dflags
baseRegOffset dflags (XmmReg 3 _ _ _) = oFFSET_StgRegTable_rXMM3 dflags
baseRegOffset dflags (XmmReg 4 _ _ _) = oFFSET_StgRegTable_rXMM4 dflags
baseRegOffset dflags (XmmReg 5 _ _ _) = oFFSET_StgRegTable_rXMM5 dflags
baseRegOffset dflags (XmmReg 6 _ _ _) = oFFSET_StgRegTable_rXMM6 dflags
baseRegOffset _ (XmmReg n _ _ _) = panic ("Registers above XMM6 are not supported (tried to use XMM" ++ show n ++ ")")
baseRegOffset dflags (YmmReg 1 _ _ _) = oFFSET_StgRegTable_rYMM1 dflags
baseRegOffset dflags (YmmReg 2 _ _ _) = oFFSET_StgRegTable_rYMM2 dflags
baseRegOffset dflags (YmmReg 3 _ _ _) = oFFSET_StgRegTable_rYMM3 dflags
baseRegOffset dflags (YmmReg 4 _ _ _) = oFFSET_StgRegTable_rYMM4 dflags
baseRegOffset dflags (YmmReg 5 _ _ _) = oFFSET_StgRegTable_rYMM5 dflags
baseRegOffset dflags (YmmReg 6 _ _ _) = oFFSET_StgRegTable_rYMM6 dflags
baseRegOffset _ (YmmReg n _ _ _) = panic ("Registers above YMM6 are not supported (tried to use YMM" ++ show n ++ ")")
baseRegOffset dflags (ZmmReg 1 _ _ _) = oFFSET_StgRegTable_rZMM1 dflags
baseRegOffset dflags (ZmmReg 2 _ _ _) = oFFSET_StgRegTable_rZMM2 dflags
baseRegOffset dflags (ZmmReg 3 _ _ _) = oFFSET_StgRegTable_rZMM3 dflags
baseRegOffset dflags (ZmmReg 4 _ _ _) = oFFSET_StgRegTable_rZMM4 dflags
baseRegOffset dflags (ZmmReg 5 _ _ _) = oFFSET_StgRegTable_rZMM5 dflags
baseRegOffset dflags (ZmmReg 6 _ _ _) = oFFSET_StgRegTable_rZMM6 dflags
baseRegOffset _ (ZmmReg n _ _ _) = panic ("Registers above ZMM6 are not supported (tried to use ZMM" ++ show n ++ ")")
baseRegOffset dflags Sp = oFFSET_StgRegTable_rSp dflags
baseRegOffset dflags SpLim = oFFSET_StgRegTable_rSpLim dflags
baseRegOffset dflags (LongReg 1) = oFFSET_StgRegTable_rL1 dflags
......
......@@ -669,7 +669,7 @@ emitPrimOp _ [res] Word2DoubleOp [w] = emitPrimCall [res]
-- SIMD primops
emitPrimOp dflags [res] (VecBroadcastOp vcat n w) [e] = do
checkVecCompatibility dflags vcat n w
doVecPackOp (vecElemInjectCast dflags vcat w) ty zeros (replicate n e) res
doVecBroadcastOp (vecElemInjectCast dflags vcat w) ty zeros e res
where
zeros :: CmmExpr
zeros = CmmLit $ CmmVec (replicate n zero)
......@@ -1765,9 +1765,8 @@ vecElemProjectCast _ _ _ = Nothing
checkVecCompatibility :: DynFlags -> PrimOpVecCat -> Length -> Width -> FCode ()
checkVecCompatibility dflags vcat l w = do
when (hscTarget dflags /= HscLlvm) $ do
sorry $ unlines ["SIMD vector instructions require the LLVM back-end."
,"Please use -fllvm."]
when (hscTarget dflags /= HscLlvm && hscTarget dflags /= HscAsm) $ do
sorry "SIMD vector instructions not supported for the C backend or GHCi"
check vecWidth vcat l w
where
check :: Width -> PrimOpVecCat -> Length -> Width -> FCode ()
......@@ -1792,6 +1791,38 @@ checkVecCompatibility dflags vcat l w = do
------------------------------------------------------------------------------
-- Helpers for translating vector packing and unpacking.
-- | Emit code for a vector broadcast primop.
--
-- A fresh temporary of type @ty@ is initialised with @z@; a second temporary
-- then receives the result of a single machine operation applied to that
-- temporary and the (optionally cast) scalar, and is finally copied to @res@.
--
-- For floating-point element types the operation is 'MO_VF_Broadcast'.
-- For all other element types 'MO_V_Insert' is emitted with only two
-- operands (source vector and element).
-- NOTE(review): doVecPackOp passes a third (index) operand to MO_V_Insert;
-- confirm that the two-operand form used here is accepted by the backends
-- until MO_V_Broadcast exists (see the TODO below).
doVecBroadcastOp :: Maybe MachOp -- Cast from element to vector component
-> CmmType -- Type of vector
-> CmmExpr -- Initial vector
-> CmmExpr -- Scalar element to broadcast
-> CmmFormal -- Destination for result
-> FCode ()
doVecBroadcastOp maybe_pre_write_cast ty z es res = do
dst <- newTemp ty
emitAssign (CmmLocal dst) z
vecBroadcast dst es 0
where
-- Emits the broadcast of @e@ using @src@ as the source vector operand.
-- The Int argument is ignored.
vecBroadcast :: CmmFormal -> CmmExpr -> Int -> FCode ()
vecBroadcast src e _ = do
dst <- newTemp ty
-- Choose the machine op by the element category of the vector type.
if isFloatType (vecElemType ty)
then emitAssign (CmmLocal dst) (CmmMachOp (MO_VF_Broadcast len wid)
[CmmReg (CmmLocal src), cast e])
--TODO : Add the MachOp MO_V_Broadcast
else emitAssign (CmmLocal dst) (CmmMachOp (MO_V_Insert len wid)
[CmmReg (CmmLocal src), cast e])
emitAssign (CmmLocal res) (CmmReg (CmmLocal dst))
-- Apply the optional element-to-component cast; identity when absent.
cast :: CmmExpr -> CmmExpr
cast val = case maybe_pre_write_cast of
Nothing -> val
Just cast -> CmmMachOp cast [val]
-- Number of lanes in the vector type.
len :: Length
len = vecLength ty
-- Width of a single vector element.
wid :: Width
wid = typeWidth (vecElemType ty)
doVecPackOp :: Maybe MachOp -- Cast from element to vector component
-> CmmType -- Type of vector
......@@ -1809,16 +1840,16 @@ doVecPackOp maybe_pre_write_cast ty z es res = do
emitAssign (CmmLocal res) (CmmReg (CmmLocal src))
vecPack src (e : es) i = do
dst <- newTemp ty
if isFloatType (vecElemType ty)
then emitAssign (CmmLocal dst) (CmmMachOp (MO_VF_Insert len wid)
[CmmReg (CmmLocal src), cast e, iLit])
else emitAssign (CmmLocal dst) (CmmMachOp (MO_V_Insert len wid)
[CmmReg (CmmLocal src), cast e, iLit])
vecPack dst es (i + 1)
dst <- newTemp ty
if isFloatType (vecElemType ty)
then emitAssign (CmmLocal dst) (CmmMachOp (MO_VF_Insert len wid)
[CmmReg (CmmLocal src), cast e, iLit])
else emitAssign (CmmLocal dst) (CmmMachOp (MO_V_Insert len wid)
[CmmReg (CmmLocal src), cast e, iLit])
vecPack dst es (i + 1)
where
-- vector indices are always 32-bits
iLit = CmmLit (CmmInt (toInteger i) W32)
iLit = CmmLit (CmmInt ((toInteger i) * 16) W32)
cast :: CmmExpr -> CmmExpr
cast val = case maybe_pre_write_cast of
......
......@@ -152,12 +152,12 @@ llvmFunArgs dflags live =
where platform = targetPlatform dflags
isLive r = not (isSSE r) || r `elem` alwaysLive || r `elem` live
isPassed r = not (isSSE r) || isLive r
isSSE (FloatReg _) = True
isSSE (DoubleReg _) = True
isSSE (XmmReg _) = True
isSSE (YmmReg _) = True
isSSE (ZmmReg _) = True
isSSE _ = False
isSSE (FloatReg _) = True
isSSE (DoubleReg _) = True
isSSE (XmmReg _ _ _ _ ) = True
isSSE (YmmReg _ _ _ _ ) = True
isSSE (ZmmReg _ _ _ _ ) = True
isSSE _ = False
-- | Llvm standard fun attributes
llvmStdFunAttrs :: [LlvmFuncAttr]
......
......@@ -1287,6 +1287,7 @@ genMachOp _ op [x] = case op of
MO_VU_Quot _ _ -> panicOp
MO_VU_Rem _ _ -> panicOp
MO_VF_Broadcast _ _ -> panicOp
MO_VF_Insert _ _ -> panicOp
MO_VF_Extract _ _ -> panicOp
......@@ -1483,6 +1484,7 @@ genMachOp_slow opt op [x, y] = case op of
MO_VS_Neg {} -> panicOp
MO_VF_Broadcast {} -> panicOp
MO_VF_Insert {} -> panicOp
MO_VF_Extract {} -> panicOp
......@@ -1844,9 +1846,9 @@ funEpilogue live = do
let liveRegs = alwaysLive ++ live
isSSE (FloatReg _) = True
isSSE (DoubleReg _) = True
isSSE (XmmReg _) = True
isSSE (YmmReg _) = True
isSSE (ZmmReg _) = True
isSSE (XmmReg _ _ _ _) = True
isSSE (YmmReg _ _ _ _) = True
isSSE (ZmmReg _ _ _ _) = True
isSSE _ = False
-- Set to value or "undef" depending on whether the register is
......
......@@ -60,24 +60,24 @@ lmGlobalReg dflags suf reg
DoubleReg 4 -> doubleGlobal $ "D4" ++ suf
DoubleReg 5 -> doubleGlobal $ "D5" ++ suf
DoubleReg 6 -> doubleGlobal $ "D6" ++ suf
XmmReg 1 -> xmmGlobal $ "XMM1" ++ suf
XmmReg 2 -> xmmGlobal $ "XMM2" ++ suf
XmmReg 3 -> xmmGlobal $ "XMM3" ++ suf
XmmReg 4 -> xmmGlobal $ "XMM4" ++ suf
XmmReg 5 -> xmmGlobal $ "XMM5" ++ suf
XmmReg 6 -> xmmGlobal $ "XMM6" ++ suf
YmmReg 1 -> ymmGlobal $ "YMM1" ++ suf
YmmReg 2 -> ymmGlobal $ "YMM2" ++ suf
YmmReg 3 -> ymmGlobal $ "YMM3" ++ suf
YmmReg 4 -> ymmGlobal $ "YMM4" ++ suf
YmmReg 5 -> ymmGlobal $ "YMM5" ++ suf
YmmReg 6 -> ymmGlobal $ "YMM6" ++ suf
ZmmReg 1 -> zmmGlobal $ "ZMM1" ++ suf
ZmmReg 2 -> zmmGlobal $ "ZMM2" ++ suf
ZmmReg 3 -> zmmGlobal $ "ZMM3" ++ suf
ZmmReg 4 -> zmmGlobal $ "ZMM4" ++ suf
ZmmReg 5 -> zmmGlobal $ "ZMM5" ++ suf
ZmmReg 6 -> zmmGlobal $ "ZMM6" ++ suf
XmmReg 1 _ _ _ -> xmmGlobal $ "XMM1" ++ suf
XmmReg 2 _ _ _ -> xmmGlobal $ "XMM2" ++ suf
XmmReg 3 _ _ _ -> xmmGlobal $ "XMM3" ++ suf
XmmReg 4 _ _ _ -> xmmGlobal $ "XMM4" ++ suf
XmmReg 5 _ _ _ -> xmmGlobal $ "XMM5" ++ suf
XmmReg 6 _ _ _ -> xmmGlobal $ "XMM6" ++ suf
YmmReg 1 _ _ _ -> ymmGlobal $ "YMM1" ++ suf
YmmReg 2 _ _ _ -> ymmGlobal $ "YMM2" ++ suf
YmmReg 3 _ _ _ -> ymmGlobal $ "YMM3" ++ suf
YmmReg 4 _ _ _ -> ymmGlobal $ "YMM4" ++ suf
YmmReg 5 _ _ _ -> ymmGlobal $ "YMM5" ++ suf
YmmReg 6 _ _ _ -> ymmGlobal $ "YMM6" ++ suf
ZmmReg 1 _ _ _ -> zmmGlobal $ "ZMM1" ++ suf
ZmmReg 2 _ _ _ -> zmmGlobal $ "ZMM2" ++ suf
ZmmReg 3 _ _ _ -> zmmGlobal $ "ZMM3" ++ suf
ZmmReg 4 _ _ _ -> zmmGlobal $ "ZMM4" ++ suf
ZmmReg 5 _ _ _ -> zmmGlobal $ "ZMM5" ++ suf
ZmmReg 6 _ _ _ -> zmmGlobal $ "ZMM6" ++ suf
MachSp -> wordGlobal $ "MachSp" ++ suf
_other -> panic $ "LlvmCodeGen.Reg: GlobalReg (" ++ (show reg)
++ ") not supported!"
......
......@@ -219,6 +219,7 @@ module DynFlags (
-- * SSE and AVX
isSseEnabled,
isSse2Enabled,
isSse4_1Enabled,
isSse4_2Enabled,
isBmiEnabled,
isBmi2Enabled,
......@@ -5908,6 +5909,8 @@ isSse2Enabled dflags = case platformArch (targetPlatform dflags) of
ArchX86 -> True
_ -> False
-- | True when 'sseVersion' for these flags is set and is at least 'SSE4'
-- (an unset version, 'Nothing', compares below every 'Just').
isSse4_1Enabled :: DynFlags -> Bool
isSse4_1Enabled dflags = sseVersion dflags >= Just SSE4
isSse4_2Enabled :: DynFlags -> Bool
isSse4_2Enabled dflags = sseVersion dflags >= Just SSE42
......
......@@ -10,9 +10,11 @@
--
module Format (
Format(..),
ScalarFormat(..),
intFormat,
floatFormat,
isFloatFormat,
isVecFormat,
cmmTypeFormat,
formatToWidth,
formatInBytes
......@@ -25,6 +27,29 @@ import GhcPrelude
import Cmm
import Outputable
-- Note [GHC's data format representations]