Commit 680441de authored by gmainlan@microsoft.com's avatar gmainlan@microsoft.com
Browse files

Merge branch 'wip/simd'

This merge revises and extends the current SIMD support in GHC. Notable
features:

 * Support for AVX, AVX2, and AVX-512. Support for AVX-512 is untested.

 * SIMD primops are currently LLVM-only and documented in
   compiler/prelude/primops.txt.pp.

 * By default only 128-bit wide SIMD vectors are passed in registers, and then
   only on the X86_64 architecture. There is a "hidden" flag,
   -fllvm-pass-vectors-in-regs, that causes GHC to generate LLVM code that
   assumes all vectors are passed in registers by LLVM. This can be used with a
   suitably patched version of LLVM, and if we get LLVM 3.4 patched, we can
   consider turning it on by default for LLVM 3.4+. This would mean that we
   couldn't mix LLVM <3.4-compiled object files with LLVM >=3.4-compiled object
   files, but I don't see that as much of a problem.

 * utils/genprimcode has been hacked up to allow us to write vector operations
   once and have them instantiated at multiple vector types. I'm not thrilled
   with this solution, but after discussing with Simon PJ, what I've implemented
   seems to be the minimal reasonable solution to the problem of exploding
   primop boilerplate. The changes are documented in
   compiler/prelude/primops.txt.pp.

 * Error handling is sub-optimal. My patch checks to make sure that vector
   primops can be compiled efficiently based on the current set of dynamic
   flags. For example, if -mavx is not specified and the user tries to use a
   primop that adds together two 256-bit wide vectors of double-precision
   elements, the user will see an error message like:

     ghc-stage2: sorry! (unimplemented feature or known bug)
       (GHC version 7.7.20130916 for x86_64-unknown-linux):
	 256-bit wide floating point SIMD vector instructions require at least -mavx.
parents 6e6e6f5b 25eeb678
......@@ -14,6 +14,7 @@ import Cmm (Convention(..))
import PprCmm ()
import DynFlags
import Platform
import Outputable
-- Calculate the 'GlobalReg' or stack locations for function call
......@@ -65,15 +66,22 @@ assignArgumentsPos dflags off conv arg_ty reps = (stk_off, assignments)
| isFloatType ty = float
| otherwise = int
where vec = case (w, regs) of
(W128, (vs, fs, ds, ls, s:ss)) -> k (RegisterParam (XmmReg s), (vs, fs, ds, ls, ss))
(W128, (vs, fs, ds, ls, s:ss))
| passVectorInReg W128 dflags -> k (RegisterParam (XmmReg s), (vs, fs, ds, ls, ss))
(W256, (vs, fs, ds, ls, s:ss))
| passVectorInReg W256 dflags -> k (RegisterParam (YmmReg s), (vs, fs, ds, ls, ss))
(W512, (vs, fs, ds, ls, s:ss))
| passVectorInReg W512 dflags -> k (RegisterParam (ZmmReg s), (vs, fs, ds, ls, ss))
_ -> (assts, (r:rs))
float = case (w, regs) of
(W32, (vs, fs, ds, ls, s:ss)) -> k (RegisterParam (FloatReg s), (vs, fs, ds, ls, ss))
(W32, (vs, fs, ds, ls, s:ss))
| passFloatInXmm -> k (RegisterParam (FloatReg s), (vs, fs, ds, ls, ss))
(W32, (vs, f:fs, ds, ls, ss))
| not hasXmmRegs -> k (RegisterParam f, (vs, fs, ds, ls, ss))
(W64, (vs, fs, ds, ls, s:ss)) -> k (RegisterParam (DoubleReg s), (vs, fs, ds, ls, ss))
| not passFloatInXmm -> k (RegisterParam f, (vs, fs, ds, ls, ss))
(W64, (vs, fs, ds, ls, s:ss))
| passFloatInXmm -> k (RegisterParam (DoubleReg s), (vs, fs, ds, ls, ss))
(W64, (vs, fs, d:ds, ls, ss))
| not hasXmmRegs -> k (RegisterParam d, (vs, fs, ds, ls, ss))
| not passFloatInXmm -> k (RegisterParam d, (vs, fs, ds, ls, ss))
(W80, _) -> panic "F80 unsupported register type"
_ -> (assts, (r:rs))
int = case (w, regs) of
......@@ -88,8 +96,26 @@ assignArgumentsPos dflags off conv arg_ty reps = (stk_off, assignments)
w = typeWidth ty
gcp | isGcPtrType ty = VGcPtr
| otherwise = VNonGcPtr
hasXmmRegs = mAX_Real_XMM_REG dflags /= 0
passFloatInXmm = passFloatArgsInXmm dflags
passFloatArgsInXmm :: DynFlags -> Bool
passFloatArgsInXmm dflags = case platformArch (targetPlatform dflags) of
ArchX86_64 -> True
_ -> False
-- On X86_64, we always pass 128-bit-wide vectors in registers. On 32-bit X86
-- and for all larger vector sizes on X86_64, LLVM's GHC calling convention
-- doesn't currently passing vectors in registers. The patch to update the GHC
-- calling convention to support passing SIMD vectors in registers is small and
-- well-contained, so it may make it into LLVM 3.4. The hidden
-- -fllvm-pass-vectors-in-regs flag will generate LLVM code that attempts to
-- pass vectors in registers, but it must only be used with a version of LLVM
-- that has an updated GHC calling convention.
passVectorInReg :: Width -> DynFlags -> Bool
passVectorInReg W128 dflags = case platformArch (targetPlatform dflags) of
ArchX86_64 -> True
_ -> gopt Opt_LlvmPassVectorsInRegisters dflags
passVectorInReg _ dflags = gopt Opt_LlvmPassVectorsInRegisters dflags
assignStack :: DynFlags -> ByteOff -> (a -> CmmType) -> [a]
-> (
......@@ -158,7 +184,10 @@ realVanillaRegs dflags = map VanillaReg $ regList (mAX_Real_Vanilla_REG dflags)
realFloatRegs dflags = map FloatReg $ regList (mAX_Real_Float_REG dflags)
realDoubleRegs dflags = map DoubleReg $ regList (mAX_Real_Double_REG dflags)
realLongRegs dflags = map LongReg $ regList (mAX_Real_Long_REG dflags)
realXmmRegNos dflags = regList (mAX_Real_XMM_REG dflags)
realXmmRegNos dflags
| isSse2Enabled dflags = regList (mAX_Real_XMM_REG dflags)
| otherwise = []
regList :: Int -> [Int]
regList n = [1 .. n]
......@@ -180,12 +209,11 @@ nodeOnly = ([VanillaReg 1], [], [], [], [])
-- only use this functionality in hand-written C-- code in the RTS.
realArgRegsCover :: DynFlags -> [GlobalReg]
realArgRegsCover dflags
| hasXmmRegs = map ($VGcPtr) (realVanillaRegs dflags) ++
realDoubleRegs dflags ++
realLongRegs dflags
| otherwise = map ($VGcPtr) (realVanillaRegs dflags) ++
realFloatRegs dflags ++
realDoubleRegs dflags ++
realLongRegs dflags
where
hasXmmRegs = mAX_Real_XMM_REG dflags /= 0
| passFloatArgsInXmm dflags = map ($VGcPtr) (realVanillaRegs dflags) ++
realLongRegs dflags ++
map XmmReg (realXmmRegNos dflags)
| otherwise = map ($VGcPtr) (realVanillaRegs dflags) ++
realFloatRegs dflags ++
realDoubleRegs dflags ++
realLongRegs dflags ++
map XmmReg (realXmmRegNos dflags)
......@@ -343,6 +343,12 @@ data GlobalReg
| XmmReg -- 128-bit SIMD vector register
{-# UNPACK #-} !Int -- its number
| YmmReg -- 256-bit SIMD vector register
{-# UNPACK #-} !Int -- its number
| ZmmReg -- 512-bit SIMD vector register
{-# UNPACK #-} !Int -- its number
-- STG registers
| Sp -- Stack ptr; points to last occupied stack location.
| SpLim -- Stack limit
......@@ -379,6 +385,8 @@ instance Eq GlobalReg where
DoubleReg i == DoubleReg j = i==j
LongReg i == LongReg j = i==j
XmmReg i == XmmReg j = i==j
YmmReg i == YmmReg j = i==j
ZmmReg i == ZmmReg j = i==j
Sp == Sp = True
SpLim == SpLim = True
Hp == Hp = True
......@@ -401,6 +409,8 @@ instance Ord GlobalReg where
compare (DoubleReg i) (DoubleReg j) = compare i j
compare (LongReg i) (LongReg j) = compare i j
compare (XmmReg i) (XmmReg j) = compare i j
compare (YmmReg i) (YmmReg j) = compare i j
compare (ZmmReg i) (ZmmReg j) = compare i j
compare Sp Sp = EQ
compare SpLim SpLim = EQ
compare Hp Hp = EQ
......@@ -424,6 +434,10 @@ instance Ord GlobalReg where
compare _ (LongReg _) = GT
compare (XmmReg _) _ = LT
compare _ (XmmReg _) = GT
compare (YmmReg _) _ = LT
compare _ (YmmReg _) = GT
compare (ZmmReg _) _ = LT
compare _ (ZmmReg _) = GT
compare Sp _ = LT
compare _ Sp = GT
compare SpLim _ = LT
......@@ -467,6 +481,8 @@ globalRegType _ (FloatReg _) = cmmFloat W32
globalRegType _ (DoubleReg _) = cmmFloat W64
globalRegType _ (LongReg _) = cmmBits W64
globalRegType _ (XmmReg _) = cmmVec 4 (cmmBits W32)
globalRegType _ (YmmReg _) = cmmVec 8 (cmmBits W32)
globalRegType _ (ZmmReg _) = cmmVec 16 (cmmBits W32)
globalRegType dflags Hp = gcWord dflags
-- The initialiser for all
......@@ -479,4 +495,6 @@ isArgReg (FloatReg {}) = True
isArgReg (DoubleReg {}) = True
isArgReg (LongReg {}) = True
isArgReg (XmmReg {}) = True
isArgReg (YmmReg {}) = True
isArgReg (ZmmReg {}) = True
isArgReg _ = False
......@@ -164,6 +164,8 @@ data CmmToken
| CmmT_bits32
| CmmT_bits64
| CmmT_bits128
| CmmT_bits256
| CmmT_bits512
| CmmT_float32
| CmmT_float64
| CmmT_gcptr
......@@ -244,6 +246,8 @@ reservedWordsFM = listToUFM $
( "bits32", CmmT_bits32 ),
( "bits64", CmmT_bits64 ),
( "bits128", CmmT_bits128 ),
( "bits256", CmmT_bits256 ),
( "bits512", CmmT_bits512 ),
( "float32", CmmT_float32 ),
( "float64", CmmT_float64 ),
-- New forms
......@@ -252,6 +256,8 @@ reservedWordsFM = listToUFM $
( "b32", CmmT_bits32 ),
( "b64", CmmT_bits64 ),
( "b128", CmmT_bits128 ),
( "b256", CmmT_bits256 ),
( "b512", CmmT_bits512 ),
( "f32", CmmT_float32 ),
( "f64", CmmT_float64 ),
( "gcptr", CmmT_gcptr )
......
......@@ -118,6 +118,10 @@ data MachOp
| MO_VS_Rem Length Width
| MO_VS_Neg Length Width
-- Unsigned vector multiply/divide
| MO_VU_Quot Length Width
| MO_VU_Rem Length Width
-- Floting point vector element insertion and extraction operations
| MO_VF_Insert Length Width -- Insert scalar into vector
| MO_VF_Extract Length Width -- Extract scalar from vector
......@@ -375,6 +379,9 @@ machOpResultType dflags mop tys =
MO_VS_Rem l w -> cmmVec l (cmmBits w)
MO_VS_Neg l w -> cmmVec l (cmmBits w)
MO_VU_Quot l w -> cmmVec l (cmmBits w)
MO_VU_Rem l w -> cmmVec l (cmmBits w)
MO_VF_Insert l w -> cmmVec l (cmmFloat w)
MO_VF_Extract _ w -> cmmFloat w
......@@ -461,6 +468,9 @@ machOpArgReps dflags op =
MO_VS_Rem _ r -> [r,r]
MO_VS_Neg _ r -> [r]
MO_VU_Quot _ r -> [r,r]
MO_VU_Rem _ r -> [r,r]
MO_VF_Insert l r -> [typeWidth (vec l (cmmFloat r)),r,wordWidth dflags]
MO_VF_Extract l r -> [typeWidth (vec l (cmmFloat r)),wordWidth dflags]
......
......@@ -289,6 +289,8 @@ import Data.Maybe
'bits32' { L _ (CmmT_bits32) }
'bits64' { L _ (CmmT_bits64) }
'bits128' { L _ (CmmT_bits128) }
'bits256' { L _ (CmmT_bits256) }
'bits512' { L _ (CmmT_bits512) }
'float32' { L _ (CmmT_float32) }
'float64' { L _ (CmmT_float64) }
'gcptr' { L _ (CmmT_gcptr) }
......@@ -777,6 +779,8 @@ typenot8 :: { CmmType }
| 'bits32' { b32 }
| 'bits64' { b64 }
| 'bits128' { b128 }
| 'bits256' { b256 }
| 'bits512' { b512 }
| 'float32' { f32 }
| 'float64' { f64 }
| 'gcptr' {% do dflags <- getDynFlags; return $ gcWord dflags }
......
module CmmType
( CmmType -- Abstract
, b8, b16, b32, b64, b128, f32, f64, bWord, bHalfWord, gcWord
, b8, b16, b32, b64, b128, b256, b512, f32, f64, bWord, bHalfWord, gcWord
, cInt, cLong
, cmmBits, cmmFloat
, typeWidth, cmmEqType, cmmEqType_ignoring_ptrhood
......@@ -107,12 +107,14 @@ cmmFloat = CmmType FloatCat
-------- Common CmmTypes ------------
-- Floats and words of specific widths
b8, b16, b32, b64, b128, f32, f64 :: CmmType
b8, b16, b32, b64, b128, b256, b512, f32, f64 :: CmmType
b8 = cmmBits W8
b16 = cmmBits W16
b32 = cmmBits W32
b64 = cmmBits W64
b128 = cmmBits W128
b256 = cmmBits W256
b512 = cmmBits W512
f32 = cmmFloat W32
f64 = cmmFloat W64
......@@ -166,6 +168,8 @@ data Width = W8 | W16 | W32 | W64
-- used in x86 native codegen only.
-- (we use Ord, so it'd better be in this order)
| W128
| W256
| W512
deriving (Eq, Ord, Show)
instance Outputable Width where
......@@ -177,6 +181,8 @@ mrStr W16 = sLit("W16")
mrStr W32 = sLit("W32")
mrStr W64 = sLit("W64")
mrStr W128 = sLit("W128")
mrStr W256 = sLit("W256")
mrStr W512 = sLit("W512")
mrStr W80 = sLit("W80")
......@@ -216,6 +222,8 @@ widthInBits W16 = 16
widthInBits W32 = 32
widthInBits W64 = 64
widthInBits W128 = 128
widthInBits W256 = 256
widthInBits W512 = 512
widthInBits W80 = 80
widthInBytes :: Width -> Int
......@@ -224,6 +232,8 @@ widthInBytes W16 = 2
widthInBytes W32 = 4
widthInBytes W64 = 8
widthInBytes W128 = 16
widthInBytes W256 = 32
widthInBytes W512 = 64
widthInBytes W80 = 10
widthFromBytes :: Int -> Width
......@@ -232,6 +242,8 @@ widthFromBytes 2 = W16
widthFromBytes 4 = W32
widthFromBytes 8 = W64
widthFromBytes 16 = W128
widthFromBytes 32 = W256
widthFromBytes 64 = W512
widthFromBytes 10 = W80
widthFromBytes n = pprPanic "no width for given number of bytes" (ppr n)
......@@ -242,6 +254,8 @@ widthInLog W16 = 1
widthInLog W32 = 2
widthInLog W64 = 3
widthInLog W128 = 4
widthInLog W256 = 5
widthInLog W512 = 6
widthInLog W80 = panic "widthInLog: F80"
-- widening / narrowing
......
......@@ -651,6 +651,15 @@ pprMachOp_for_C mop = case mop of
(panic $ "PprC.pprMachOp_for_C: MO_VS_Neg"
++ " should have been handled earlier!")
MO_VU_Quot {} -> pprTrace "offending mop:"
(ptext $ sLit "MO_VU_Quot")
(panic $ "PprC.pprMachOp_for_C: MO_VU_Quot"
++ " should have been handled earlier!")
MO_VU_Rem {} -> pprTrace "offending mop:"
(ptext $ sLit "MO_VU_Rem")
(panic $ "PprC.pprMachOp_for_C: MO_VU_Rem"
++ " should have been handled earlier!")
MO_VF_Insert {} -> pprTrace "offending mop:"
(ptext $ sLit "MO_VF_Insert")
(panic $ "PprC.pprMachOp_for_C: MO_VF_Insert"
......
......@@ -256,6 +256,8 @@ pprGlobalReg gr
DoubleReg n -> char 'D' <> int n
LongReg n -> char 'L' <> int n
XmmReg n -> ptext (sLit "XMM") <> int n
YmmReg n -> ptext (sLit "YMM") <> int n
ZmmReg n -> ptext (sLit "ZMM") <> int n
Sp -> ptext (sLit "Sp")
SpLim -> ptext (sLit "SpLim")
Hp -> ptext (sLit "Hp")
......
......@@ -56,6 +56,20 @@ baseRegOffset dflags (XmmReg 4) = oFFSET_StgRegTable_rXMM4 dflags
baseRegOffset dflags (XmmReg 5) = oFFSET_StgRegTable_rXMM5 dflags
baseRegOffset dflags (XmmReg 6) = oFFSET_StgRegTable_rXMM6 dflags
baseRegOffset _ (XmmReg n) = panic ("Registers above XMM6 are not supported (tried to use XMM" ++ show n ++ ")")
baseRegOffset dflags (YmmReg 1) = oFFSET_StgRegTable_rYMM1 dflags
baseRegOffset dflags (YmmReg 2) = oFFSET_StgRegTable_rYMM2 dflags
baseRegOffset dflags (YmmReg 3) = oFFSET_StgRegTable_rYMM3 dflags
baseRegOffset dflags (YmmReg 4) = oFFSET_StgRegTable_rYMM4 dflags
baseRegOffset dflags (YmmReg 5) = oFFSET_StgRegTable_rYMM5 dflags
baseRegOffset dflags (YmmReg 6) = oFFSET_StgRegTable_rYMM6 dflags
baseRegOffset _ (YmmReg n) = panic ("Registers above YMM6 are not supported (tried to use YMM" ++ show n ++ ")")
baseRegOffset dflags (ZmmReg 1) = oFFSET_StgRegTable_rZMM1 dflags
baseRegOffset dflags (ZmmReg 2) = oFFSET_StgRegTable_rZMM2 dflags
baseRegOffset dflags (ZmmReg 3) = oFFSET_StgRegTable_rZMM3 dflags
baseRegOffset dflags (ZmmReg 4) = oFFSET_StgRegTable_rZMM4 dflags
baseRegOffset dflags (ZmmReg 5) = oFFSET_StgRegTable_rZMM5 dflags
baseRegOffset dflags (ZmmReg 6) = oFFSET_StgRegTable_rZMM6 dflags
baseRegOffset _ (ZmmReg n) = panic ("Registers above ZMM6 are not supported (tried to use ZMM" ++ show n ++ ")")
baseRegOffset dflags Sp = oFFSET_StgRegTable_rSp dflags
baseRegOffset dflags SpLim = oFFSET_StgRegTable_rSpLim dflags
baseRegOffset dflags (LongReg 1) = oFFSET_StgRegTable_rL1 dflags
......
......@@ -47,6 +47,8 @@ data ArgRep = P -- GC Ptr
| F -- Float
| D -- Double
| V16 -- 16-byte (128-bit) vectors of Float/Double/Int8/Word32/etc.
| V32 -- 32-byte (256-bit) vectors of Float/Double/Int8/Word32/etc.
| V64 -- 64-byte (512-bit) vectors of Float/Double/Int8/Word32/etc.
instance Outputable ArgRep where ppr = text . argRepString
argRepString :: ArgRep -> String
......@@ -57,6 +59,8 @@ argRepString V = "V"
argRepString F = "F"
argRepString D = "D"
argRepString V16 = "V16"
argRepString V32 = "V32"
argRepString V64 = "V64"
toArgRep :: PrimRep -> ArgRep
toArgRep VoidRep = V
......@@ -68,9 +72,11 @@ toArgRep Int64Rep = L
toArgRep Word64Rep = L
toArgRep FloatRep = F
toArgRep DoubleRep = D
toArgRep (VecRep len elem)
| len*primElemRepSizeB elem == 16 = V16
| otherwise = error "toArgRep: bad vector primrep"
toArgRep (VecRep len elem) = case len*primElemRepSizeB elem of
16 -> V16
32 -> V32
64 -> V64
_ -> error "toArgRep: bad vector primrep"
isNonV :: ArgRep -> Bool
isNonV V = False
......@@ -84,6 +90,8 @@ argRepSizeW dflags L = wORD64_SIZE `quot` wORD_SIZE dflags
argRepSizeW dflags D = dOUBLE_SIZE dflags `quot` wORD_SIZE dflags
argRepSizeW _ V = 0
argRepSizeW dflags V16 = 16 `quot` wORD_SIZE dflags
argRepSizeW dflags V32 = 32 `quot` wORD_SIZE dflags
argRepSizeW dflags V64 = 64 `quot` wORD_SIZE dflags
idArgRep :: Id -> ArgRep
idArgRep = toArgRep . idPrimRep
......@@ -132,4 +140,6 @@ slowCallPattern (F: _) = (fsLit "stg_ap_f", 1)
slowCallPattern (D: _) = (fsLit "stg_ap_d", 1)
slowCallPattern (L: _) = (fsLit "stg_ap_l", 1)
slowCallPattern (V16: _) = (fsLit "stg_ap_v16", 1)
slowCallPattern (V32: _) = (fsLit "stg_ap_v32", 1)
slowCallPattern (V64: _) = (fsLit "stg_ap_v64", 1)
slowCallPattern [] = (fsLit "stg_ap_0", 0)
......@@ -385,6 +385,8 @@ stdPattern reps
[D] -> Just ARG_D
[L] -> Just ARG_L
[V16] -> Just ARG_V16
[V32] -> Just ARG_V32
[V64] -> Just ARG_V64
[N,N] -> Just ARG_NN
[N,P] -> Just ARG_NP
......
This diff is collapsed.
......@@ -250,8 +250,12 @@ PRIMOP_BITS_NAMES = primop-data-decl.hs-incl \
primop-code-size.hs-incl \
primop-can-fail.hs-incl \
primop-strictness.hs-incl \
primop-fixity.hs-incl \
primop-primop-info.hs-incl
primop-fixity.hs-incl \
primop-primop-info.hs-incl \
primop-vector-uniques.hs-incl \
primop-vector-tys.hs-incl \
primop-vector-tys-exports.hs-incl \
primop-vector-tycons.hs-incl
PRIMOP_BITS_STAGE1 = $(addprefix compiler/stage1/build/,$(PRIMOP_BITS_NAMES))
PRIMOP_BITS_STAGE2 = $(addprefix compiler/stage2/build/,$(PRIMOP_BITS_NAMES))
......@@ -290,6 +294,14 @@ compiler/stage$1/build/primop-fixity.hs-incl: compiler/stage$1/build/primops.txt
"$$(genprimopcode_INPLACE)" --fixity < $$< > $$@
compiler/stage$1/build/primop-primop-info.hs-incl: compiler/stage$1/build/primops.txt $$$$(genprimopcode_INPLACE)
"$$(genprimopcode_INPLACE)" --primop-primop-info < $$< > $$@
compiler/stage$1/build/primop-vector-uniques.hs-incl: compiler/stage$1/build/primops.txt $$$$(genprimopcode_INPLACE)
"$$(genprimopcode_INPLACE)" --primop-vector-uniques < $$< > $$@
compiler/stage$1/build/primop-vector-tys.hs-incl: compiler/stage$1/build/primops.txt $$$$(genprimopcode_INPLACE)
"$$(genprimopcode_INPLACE)" --primop-vector-tys < $$< > $$@
compiler/stage$1/build/primop-vector-tys-exports.hs-incl: compiler/stage$1/build/primops.txt $$$$(genprimopcode_INPLACE)
"$$(genprimopcode_INPLACE)" --primop-vector-tys-exports < $$< > $$@
compiler/stage$1/build/primop-vector-tycons.hs-incl: compiler/stage$1/build/primops.txt $$$$(genprimopcode_INPLACE)
"$$(genprimopcode_INPLACE)" --primop-vector-tycons < $$< > $$@
# Usages aren't used any more; but the generator
# can still generate them if we want them back
......
......@@ -462,6 +462,8 @@ push_alts L = bci_PUSH_ALTS_L
push_alts F = bci_PUSH_ALTS_F
push_alts D = bci_PUSH_ALTS_D
push_alts V16 = error "push_alts: vector"
push_alts V32 = error "push_alts: vector"
push_alts V64 = error "push_alts: vector"
return_ubx :: ArgRep -> Word16
return_ubx V = bci_RETURN_V
......@@ -471,6 +473,8 @@ return_ubx L = bci_RETURN_L
return_ubx F = bci_RETURN_F
return_ubx D = bci_RETURN_D
return_ubx V16 = error "return_ubx: vector"
return_ubx V32 = error "return_ubx: vector"
return_ubx V64 = error "return_ubx: vector"
-- Make lists of host-sized words for literals, so that when the
-- words are placed in memory at increasing addresses, the
......
......@@ -155,6 +155,8 @@ llvmFunArgs dflags live =
isSSE (FloatReg _) = True
isSSE (DoubleReg _) = True
isSSE (XmmReg _) = True
isSSE (YmmReg _) = True
isSSE (ZmmReg _) = True
isSSE _ = False
-- | Llvm standard fun attributes
......
......@@ -967,6 +967,9 @@ genMachOp _ op [x] = case op of
MO_VS_Quot _ _ -> panicOp
MO_VS_Rem _ _ -> panicOp
MO_VU_Quot _ _ -> panicOp
MO_VU_Rem _ _ -> panicOp
MO_VF_Insert _ _ -> panicOp
MO_VF_Extract _ _ -> panicOp
......@@ -1140,6 +1143,9 @@ genMachOp_slow opt op [x, y] = case op of
MO_VS_Quot l w -> genCastBinMach (LMVector l (widthToLlvmInt w)) LM_MO_SDiv
MO_VS_Rem l w -> genCastBinMach (LMVector l (widthToLlvmInt w)) LM_MO_SRem
MO_VU_Quot l w -> genCastBinMach (LMVector l (widthToLlvmInt w)) LM_MO_UDiv
MO_VU_Rem l w -> genCastBinMach (LMVector l (widthToLlvmInt w)) LM_MO_URem
MO_VF_Add l w -> genCastBinMach (LMVector l (widthToLlvmFloat w)) LM_MO_FAdd
MO_VF_Sub l w -> genCastBinMach (LMVector l (widthToLlvmFloat w)) LM_MO_FSub
......@@ -1527,6 +1533,8 @@ funEpilogue live = do
isSSE (FloatReg _) = True
isSSE (DoubleReg _) = True
isSSE (XmmReg _) = True
isSSE (YmmReg _) = True
isSSE (ZmmReg _) = True
isSSE _ = False
-- Set to value or "undef" depending on whether the register is
......
......@@ -62,6 +62,18 @@ lmGlobalReg dflags suf reg
XmmReg 4 -> xmmGlobal $ "XMM4" ++ suf
XmmReg 5 -> xmmGlobal $ "XMM5" ++ suf
XmmReg 6 -> xmmGlobal $ "XMM6" ++ suf
YmmReg 1 -> ymmGlobal $ "YMM1" ++ suf
YmmReg 2 -> ymmGlobal $ "YMM2" ++ suf
YmmReg 3 -> ymmGlobal $ "YMM3" ++ suf
YmmReg 4 -> ymmGlobal $ "YMM4" ++ suf
YmmReg 5 -> ymmGlobal $ "YMM5" ++ suf
YmmReg 6 -> ymmGlobal $ "YMM6" ++ suf
ZmmReg 1 -> zmmGlobal $ "ZMM1" ++ suf
ZmmReg 2 -> zmmGlobal $ "ZMM2" ++ suf
ZmmReg 3 -> zmmGlobal $ "ZMM3" ++ suf
ZmmReg 4 -> zmmGlobal $ "ZMM4" ++ suf
ZmmReg 5 -> zmmGlobal $ "ZMM5" ++ suf
ZmmReg 6 -> zmmGlobal $ "ZMM6" ++ suf
_other -> panic $ "LlvmCodeGen.Reg: GlobalReg (" ++ (show reg)
++ ") not supported!"
-- LongReg, HpLim, CCSS, CurrentTSO, CurrentNusery, HpAlloc
......@@ -72,6 +84,8 @@ lmGlobalReg dflags suf reg
floatGlobal name = LMNLocalVar (fsLit name) LMFloat
doubleGlobal name = LMNLocalVar (fsLit name) LMDouble
xmmGlobal name = LMNLocalVar (fsLit name) (LMVector 4 (LMInt 32))
ymmGlobal name = LMNLocalVar (fsLit name) (LMVector 8 (LMInt 32))
zmmGlobal name = LMNLocalVar (fsLit name) (LMVector 16 (LMInt 32))
-- | A list of STG Registers that should always be considered alive
alwaysLive :: [GlobalReg]
......
......@@ -20,6 +20,10 @@ import System.IO
import Data.List ( sortBy )
import Data.Function ( on )
#if x86_64_TARGET_ARCH
#define REWRITE_AVX
#endif
-- Magic Strings
secStmt, infoSec, newLine, textStmt, dataStmt, syntaxUnified :: B.ByteString
secStmt = B.pack "\t.section\t"
......@@ -47,7 +51,7 @@ llvmFixupAsm dflags f1 f2 = {-# SCC "llvm_mangler" #-} do
w <- openBinaryFile f2 WriteMode
ss <- readSections r w
hClose r
let fixed = fixTables ss
let fixed = (map rewriteAVX . fixTables) ss
mapM_ (writeSection w) fixed
hClose w
return ()
......@@ -90,6 +94,39 @@ writeSection w (hdr, cts) = do
B.hPutStrLn w hdr
B.hPutStrLn w cts
#if REWRITE_AVX
rewriteAVX :: Section -> Section
rewriteAVX = rewriteVmovaps . rewriteVmovdqa
rewriteVmovdqa :: Section -> Section
rewriteVmovdqa = rewriteInstructions vmovdqa vmovdqu
where
vmovdqa, vmovdqu :: B.ByteString
vmovdqa = B.pack "vmovdqa"
vmovdqu = B.pack "vmovdqu"
rewriteVmovap :: Section -> Section
rewriteVmovap = rewriteInstructions vmovap vmovup
where
vmovap, vmovup :: B.ByteString
vmovap = B.pack "vmovap"
vmovup = B.pack "vmovup"
rewriteInstructions :: B.ByteString -> B.ByteString -> Section -> Section
rewriteInstructions matchBS replaceBS (hdr, cts) =
(hdr, loop cts)
where
loop :: B.ByteString -> B.ByteString
loop cts =
case B.breakSubstring cts matchBS of
(hd,tl) | B.null tl -> hd
| otherwise -> hd `B.append` replaceBS `B.append`
loop (B.drop (B.length matchBS) tl)
#else /* !REWRITE_AVX */
rewriteAVX :: Section -> Section
rewriteAVX = id
#endif /* !REWRITE_SSE */
-- | Reorder and convert sections so info tables end up next to the
-- code. Also does stack fixups.
fixTables :: [Section] -> [Section]
......
......@@ -220,7 +220,7 @@ processOneArg opt_kind rest arg args
Just min <- parseInt min_s -> Right (f maj min, args)
| [maj_s] <- split '.' rest_no_eq,
Just maj <- parseInt maj_s -> Right (f maj 0, args)
| otherwise -> Left ("malformed version argument in " ++ dash_arg)
| otherwise -> Right (f 1 0, args)
findArg :: [Flag m] -> String -> Maybe (String, OptKind m)
......
......@@ -1380,7 +1380,10 @@ runPhase (RealPhase LlvmLlc) input_fn dflags
++ [SysTools.Option tbaa]
++ map SysTools.Option fpOpts
++ map SysTools.Option abiOpts
++ map SysTools.Option sseOpts)