Commit fd740140 authored by Austin Seipp

Add support for prefetch with locality levels.

This patch adds support for several new primitive operations which
support using processor-specific instructions to help guide data and
cache locality decisions. The locality levels range from 0 to 3.

For LLVM, we generate llvm.prefetch intrinsics at the proper locality
level (similar to GCC.)

For x86 we generate prefetch{NTA, t2, t1, t0} instructions. On SPARC and
PowerPC, the locality levels are ignored.

This closes #8256.
Authored-by: Carter Tazio Schonwald <carter.schonwald@gmail.com>
Signed-off-by: Austin Seipp <austin@well-typed.com>
parent 627d1e00
......@@ -107,10 +107,10 @@ data MachOp
-- Vector element insertion and extraction operations
| MO_V_Insert Length Width -- Insert scalar into vector
| MO_V_Extract Length Width -- Extract scalar from vector
-- Integer vector operations
| MO_V_Add Length Width
| MO_V_Sub Length Width
| MO_V_Add Length Width
| MO_V_Sub Length Width
| MO_V_Mul Length Width
-- Signed vector multiply/divide
......@@ -127,8 +127,8 @@ data MachOp
| MO_VF_Extract Length Width -- Extract scalar from vector
-- Floating point vector operations
| MO_VF_Add Length Width
| MO_VF_Sub Length Width
| MO_VF_Add Length Width
| MO_VF_Sub Length Width
| MO_VF_Neg Length Width -- unary -
| MO_VF_Mul Length Width
| MO_VF_Quot Length Width
......@@ -528,8 +528,14 @@ data CallishMachOp
| MO_Touch -- Keep variables live (when using interior pointers)
-- Prefetch
| MO_Prefetch_Data -- Prefetch hint. May change program performance but not
| MO_Prefetch_Data Int -- Prefetch hint. May change program performance but not
-- program behavior.
-- the Int can be 0-3. Needs to be known at compile time
-- to interact with code generation correctly.
-- TODO: add support for prefetch WRITES;
-- currently this only exposes prefetch reads, which
-- would be the majority of use cases in GHC anyway.
-- Note that these three MachOps all take 1 extra parameter than the
-- standard C lib versions. The extra (last) parameter contains
......
......@@ -952,8 +952,16 @@ callishMachOps = listToUFM $
( "write_barrier", MO_WriteBarrier ),
( "memcpy", MO_Memcpy ),
( "memset", MO_Memset ),
( "memmove", MO_Memmove )
( "memmove", MO_Memmove ),
("prefetch0",MO_Prefetch_Data 0),
("prefetch1",MO_Prefetch_Data 1),
("prefetch2",MO_Prefetch_Data 2),
("prefetch3",MO_Prefetch_Data 3)
-- ToDo: the rest, maybe
-- edit: which rest?
-- also: how do we tell CMM Lint how to type check callish MachOps?
]
parseSafety :: String -> P Safety
......
......@@ -759,7 +759,9 @@ pprCallishMachOp_for_C mop
MO_Add2 {} -> unsupported
MO_U_Mul2 {} -> unsupported
MO_Touch -> unsupported
MO_Prefetch_Data -> unsupported
(MO_Prefetch_Data _ ) -> unsupported
--- we could support prefetch via "__builtin_prefetch"
--- Not adding it for now
where unsupported = panic ("pprCallishMachOp_for_C: " ++ show mop
++ " not supported!")
......
......@@ -255,15 +255,6 @@ emitPrimOp dflags [res] SizeofMutableByteArrayOp [arg]
emitPrimOp _ res@[] TouchOp args@[_arg]
= do emitPrimCall res MO_Touch args
emitPrimOp _ res@[] PrefetchByteArrayOp args@[_arg]
= do emitPrimCall res MO_Prefetch_Data args
emitPrimOp _ res@[] PrefetchMutableByteArrayOp args@[_arg]
= do emitPrimCall res MO_Prefetch_Data args
emitPrimOp _ res@[] PrefetchAddrOp args@[_arg]
= do emitPrimCall res MO_Prefetch_Data args
-- #define byteArrayContentszh(r,a) r = BYTE_ARR_CTS(a)
emitPrimOp dflags [res] ByteArrayContents_Char [arg]
= emitAssign (CmmLocal res) (cmmOffsetB dflags arg (arrWordsHdrSize dflags))
......@@ -656,9 +647,22 @@ emitPrimOp dflags res (VecWriteScalarOffAddrOp vcat n w) args = do
ty = vecCmmCat vcat w
-- Prefetch
emitPrimOp _ res PrefetchByteArrayOp args = doPrefetchByteArrayOp res args
emitPrimOp _ res PrefetchMutableByteArrayOp args = doPrefetchByteArrayOp res args
emitPrimOp _ res PrefetchAddrOp args = doPrefetchAddrOp res args
emitPrimOp _ res PrefetchByteArrayOp3 args = doPrefetchByteArrayOp 3 res args
emitPrimOp _ res PrefetchMutableByteArrayOp3 args = doPrefetchByteArrayOp 3 res args
emitPrimOp _ res PrefetchAddrOp3 args = doPrefetchAddrOp 3 res args
emitPrimOp _ res PrefetchByteArrayOp2 args = doPrefetchByteArrayOp 2 res args
emitPrimOp _ res PrefetchMutableByteArrayOp2 args = doPrefetchByteArrayOp 2 res args
emitPrimOp _ res PrefetchAddrOp2 args = doPrefetchAddrOp 2 res args
emitPrimOp _ res PrefetchByteArrayOp1 args = doPrefetchByteArrayOp 1 res args
emitPrimOp _ res PrefetchMutableByteArrayOp1 args = doPrefetchByteArrayOp 1 res args
emitPrimOp _ res PrefetchAddrOp1 args = doPrefetchAddrOp 1 res args
emitPrimOp _ res PrefetchByteArrayOp0 args = doPrefetchByteArrayOp 0 res args
emitPrimOp _ res PrefetchMutableByteArrayOp0 args = doPrefetchByteArrayOp 0 res args
emitPrimOp _ res PrefetchAddrOp0 args = doPrefetchAddrOp 0 res args
-- The rest just translate straightforwardly
emitPrimOp dflags [res] op [arg]
......@@ -1370,31 +1374,34 @@ doVecInsertOp maybe_pre_write_cast ty src e idx res = do
------------------------------------------------------------------------------
-- Helpers for translating prefetching.
doPrefetchByteArrayOp :: [LocalReg]
doPrefetchByteArrayOp :: Int
-> [LocalReg]
-> [CmmExpr]
-> FCode ()
doPrefetchByteArrayOp res [addr,idx]
doPrefetchByteArrayOp locality res [addr,idx]
= do dflags <- getDynFlags
mkBasicPrefetch (arrWordsHdrSize dflags) res addr idx
doPrefetchByteArrayOp _ _
mkBasicPrefetch locality (arrWordsHdrSize dflags) res addr idx
doPrefetchByteArrayOp _ _ _
= panic "StgCmmPrim: doPrefetchByteArrayOp"
doPrefetchAddrOp :: [LocalReg]
doPrefetchAddrOp ::Int
-> [LocalReg]
-> [CmmExpr]
-> FCode ()
doPrefetchAddrOp res [addr,idx]
= mkBasicPrefetch 0 res addr idx
doPrefetchAddrOp _ _
doPrefetchAddrOp locality res [addr,idx]
= mkBasicPrefetch locality 0 res addr idx
doPrefetchAddrOp _ _ _
= panic "StgCmmPrim: doPrefetchAddrOp"
mkBasicPrefetch :: ByteOff -- Initial offset in bytes
mkBasicPrefetch :: Int -- Locality level 0-3
-> ByteOff -- Initial offset in bytes
-> [LocalReg] -- Destination
-> CmmExpr -- Base address
-> CmmExpr -- Index
-> FCode ()
mkBasicPrefetch off res base idx
mkBasicPrefetch locality off res base idx
= do dflags <- getDynFlags
emitPrimCall [] MO_Prefetch_Data [cmmIndexExpr dflags W8 (cmmOffsetB dflags base off) idx]
emitPrimCall [] (MO_Prefetch_Data locality) [cmmIndexExpr dflags W8 (cmmOffsetB dflags base off) idx]
case res of
[] -> return ()
[reg] -> emitAssign (CmmLocal reg) base
......
......@@ -200,7 +200,8 @@ genCall (PrimTarget (MO_UF_Conv _)) [_] args =
"Can only handle 1, given" ++ show (length args) ++ "."
-- Handle prefetching data
genCall t@(PrimTarget MO_Prefetch_Data) [] args = do
genCall t@(PrimTarget (MO_Prefetch_Data localityInt)) [] args
| 0 <= localityInt && localityInt <= 3 = do
ver <- getLlvmVer
let argTy | ver <= 29 = [i8Ptr, i32, i32]
| otherwise = [i8Ptr, i32, i32, i32]
......@@ -214,12 +215,13 @@ genCall t@(PrimTarget MO_Prefetch_Data) [] args = do
(argVars', stmts3) <- castVars $ zip argVars argTy
trash <- getTrashStmts
let argSuffix | ver <= 29 = [mkIntLit i32 0, mkIntLit i32 3]
| otherwise = [mkIntLit i32 0, mkIntLit i32 3, mkIntLit i32 1]
let argSuffix | ver <= 29 = [mkIntLit i32 0, mkIntLit i32 localityInt]
| otherwise = [mkIntLit i32 0, mkIntLit i32 localityInt, mkIntLit i32 1]
call = Expr $ Call StdCall fptr (argVars' ++ argSuffix) []
stmts = stmts1 `appOL` stmts2 `appOL` stmts3
`appOL` trash `snocOL` call
return (stmts, top1 ++ top2)
| otherwise = panic $ "prefetch locality level integer must be between 0 and 3, given: " ++ (show localityInt)
-- Handle PopCnt and BSwap that need to only convert arg and return types
genCall t@(PrimTarget (MO_PopCnt w)) dsts args =
......@@ -545,7 +547,8 @@ cmmPrimOpFunctions mop = do
(MO_PopCnt w) -> fsLit $ "llvm.ctpop." ++ showSDoc dflags (ppr $ widthToLlvmInt w)
(MO_BSwap w) -> fsLit $ "llvm.bswap." ++ showSDoc dflags (ppr $ widthToLlvmInt w)
MO_Prefetch_Data -> fsLit "llvm.prefetch"
(MO_Prefetch_Data _ )-> fsLit "llvm.prefetch"
MO_S_QuotRem {} -> unsupported
MO_U_QuotRem {} -> unsupported
......
......@@ -912,6 +912,9 @@ genCCall' _ _ (PrimTarget MO_WriteBarrier) _ _
genCCall' _ _ (PrimTarget MO_Touch) _ _
= return $ nilOL
genCCall' _ _ (PrimTarget (MO_Prefetch_Data _)) _ _
= return $ nilOL
genCCall' dflags gcp target dest_regs args0
= ASSERT(not $ any (`elem` [II16]) $ map cmmTypeSize argReps)
-- we rely on argument promotion in the codeGen
......@@ -1165,7 +1168,7 @@ genCCall' dflags gcp target dest_regs args0
MO_U_Mul2 {} -> unsupported
MO_WriteBarrier -> unsupported
MO_Touch -> unsupported
MO_Prefetch_Data -> unsupported
(MO_Prefetch_Data _ ) -> unsupported
unsupported = panic ("outOfLineCmmOp: " ++ show mop
++ " not supported")
......
......@@ -392,7 +392,10 @@ genCCall
-- In the SPARC case we don't need a barrier.
--
genCCall (PrimTarget MO_WriteBarrier) _ _
= do return nilOL
= return $ nilOL
genCCall (PrimTarget (MO_Prefetch_Data _)) _ _
= return $ nilOL
genCCall target dest_regs args0
= do
......@@ -657,7 +660,7 @@ outOfLineMachOp_table mop
MO_U_Mul2 {} -> unsupported
MO_WriteBarrier -> unsupported
MO_Touch -> unsupported
MO_Prefetch_Data -> unsupported
(MO_Prefetch_Data _) -> unsupported
where unsupported = panic ("outOfLineCmmOp: " ++ show mop
++ " not supported here")
......@@ -1658,7 +1658,26 @@ genCCall _ (PrimTarget MO_WriteBarrier) _ _ = return nilOL
genCCall _ (PrimTarget MO_Touch) _ _ = return nilOL
genCCall _ (PrimTarget MO_Prefetch_Data) _ _ = return nilOL
genCCall is32bit (PrimTarget (MO_Prefetch_Data n )) _ [src] =
case n of
0 -> genPrefetch src $ PREFETCH NTA size
1 -> genPrefetch src $ PREFETCH Lvl2 size
2 -> genPrefetch src $ PREFETCH Lvl1 size
3 -> genPrefetch src $ PREFETCH Lvl0 size
l -> panic $ "unexpected prefetch level in genCCall MO_Prefetch_Data: " ++ (show l)
-- the C / LLVM prefetch locality levels are 0, 1, 2, and 3;
-- the corresponding x86 variants are NTA, t2, t1, and t0, respectively
where
size = archWordSize is32bit
-- need to know what register width for pointers!
genPrefetch inRegSrc prefetchCTor =
do
code_src <- getAnyReg inRegSrc
src_r <- getNewRegNat size
return $ code_src src_r `appOL`
(unitOL (prefetchCTor (OpAddr
((AddrBaseIndex (EABaseReg src_r ) EAIndexNone (ImmInt 0)))) ))
-- prefetch always takes an address
genCCall is32Bit (PrimTarget (MO_BSwap width)) [dst] [src] = do
dflags <- getDynFlags
......@@ -2361,7 +2380,7 @@ outOfLineCmmOp mop res args
MO_U_Mul2 {} -> unsupported
MO_WriteBarrier -> unsupported
MO_Touch -> unsupported
MO_Prefetch_Data -> unsupported
(MO_Prefetch_Data _ ) -> unsupported
unsupported = panic ("outOfLineCmmOp: " ++ show mop
++ " not supported here")
......
......@@ -9,7 +9,7 @@
#include "HsVersions.h"
#include "nativeGen/NCG.h"
module X86.Instr (Instr(..), Operand(..), JumpDest,
module X86.Instr (Instr(..), Operand(..), PrefetchVariant(..), JumpDest,
getJumpDestBlockId, canShortcut, shortcutStatics,
shortcutJump, i386_insert_ffrees, allocMoreStack,
maxSpillSlots, archWordSize)
......@@ -319,7 +319,14 @@ data Instr
-- 1: popl %reg
-- SSE4.2
| POPCNT Size Operand Reg -- src, dst
| POPCNT Size Operand Reg -- src, dst
-- prefetch
| PREFETCH PrefetchVariant Size Operand -- prefetch Variant, addr size, address to prefetch
-- variant can be NTA, Lvl0, Lvl1, or Lvl2
data PrefetchVariant = NTA | Lvl0 | Lvl1 | Lvl2
data Operand
= OpReg Reg -- register
......@@ -417,6 +424,9 @@ x86_regUsageOfInstr platform instr
POPCNT _ src dst -> mkRU (use_R src []) [dst]
-- note: might be a better way to do this
PREFETCH _ _ src -> mkRU (use_R src []) []
_other -> panic "regUsage: unrecognised instr"
where
......@@ -557,6 +567,8 @@ x86_patchRegsOfInstr instr env
POPCNT sz src dst -> POPCNT sz (patchOp src) (env dst)
PREFETCH lvl size src -> PREFETCH lvl size (patchOp src)
_other -> panic "patchRegs: unrecognised instr"
where
......
......@@ -577,6 +577,11 @@ pprInstr (XOR size src dst) = pprSizeOpOp (sLit "xor") size src dst
pprInstr (POPCNT size src dst) = pprOpOp (sLit "popcnt") size src (OpReg dst)
pprInstr (PREFETCH NTA size src ) = pprSizeOp_ (sLit "prefetchnta") size src
pprInstr (PREFETCH Lvl0 size src) = pprSizeOp_ (sLit "prefetcht0") size src
pprInstr (PREFETCH Lvl1 size src) = pprSizeOp_ (sLit "prefetcht1") size src
pprInstr (PREFETCH Lvl2 size src) = pprSizeOp_ (sLit "prefetcht2") size src
pprInstr (NOT size op) = pprSizeOp (sLit "not") size op
pprInstr (BSWAP size op) = pprSizeOp (sLit "bswap") size (OpReg op)
pprInstr (NEGI size op) = pprSizeOp (sLit "neg") size op
......@@ -1025,6 +1030,13 @@ pprSizeImmOp name size imm op1
]
pprSizeOp_ :: LitString -> Size -> Operand -> SDoc
pprSizeOp_ name size op1
= hcat [
pprMnemonic_ name ,
pprOperand size op1
]
pprSizeOp :: LitString -> Size -> Operand -> SDoc
pprSizeOp name size op1
= hcat [
......
......@@ -2596,22 +2596,91 @@ primop VecWriteScalarOffAddrOp "writeOffAddrAs#" GenPrimOp
vector = ALL_VECTOR_TYPES
------------------------------------------------------------------------
section "Prefetch"
{Prefetch operations}
{Prefetch operations: Note how every prefetch operation has a name
with the pattern prefetch*N#, where N is either 0,1,2, or 3.
This suffix number, N, is the "locality level" of the prefetch, following the
convention in GCC and other compilers.
Higher locality numbers correspond to the memory being loaded in more
levels of the cpu cache, and being retained after initial use.
On the LLVM backend, prefetch*N# uses the LLVM prefetch intrinsic
with locality level N. The code generated by LLVM is target architecture
dependent, but should agree with the GHC NCG on x86 systems.
On the Sparc and PPC native backends, prefetch*N is a No-Op.
On the x86 NCG, N=0 will generate prefetchNTA,
N=1 generates prefetcht2, N=2 generates prefetcht1, and
N=3 generates prefetcht0.
For streaming workloads, the prefetch*0 operations are recommended.
For workloads which do many reads or writes to a memory location in a short period of time,
prefetch*3 operations are recommended.
}
------------------------------------------------------------------------
primop PrefetchByteArrayOp "prefetchByteArray#" GenPrimOp
--- the Int# argument for prefetch is the byte offset on the byteArray or Addr#
---
primop PrefetchByteArrayOp3 "prefetchByteArray3#" GenPrimOp
ByteArray# -> Int# -> ByteArray#
with llvm_only = True
with can_fail = True
primop PrefetchMutableByteArrayOp "prefetchMutableByteArray#" GenPrimOp
primop PrefetchMutableByteArrayOp3 "prefetchMutableByteArray3#" GenPrimOp
MutableByteArray# s -> Int# -> State# s -> State# s
with has_side_effects = True
llvm_only = True
with can_fail = True
primop PrefetchAddrOp3 "prefetchAddr3#" GenPrimOp
Addr# -> Int# -> Addr#
with can_fail = True
primop PrefetchAddrOp "prefetchAddr#" GenPrimOp
----
primop PrefetchByteArrayOp2 "prefetchByteArray2#" GenPrimOp
ByteArray# -> Int# -> ByteArray#
with can_fail = True
primop PrefetchMutableByteArrayOp2 "prefetchMutableByteArray2#" GenPrimOp
MutableByteArray# s -> Int# -> State# s -> State# s
with can_fail = True
primop PrefetchAddrOp2 "prefetchAddr2#" GenPrimOp
Addr# -> Int# -> Addr#
with llvm_only = True
with can_fail = True
----
primop PrefetchByteArrayOp1 "prefetchByteArray1#" GenPrimOp
ByteArray# -> Int# -> ByteArray#
with can_fail = True
primop PrefetchMutableByteArrayOp1 "prefetchMutableByteArray1#" GenPrimOp
MutableByteArray# s -> Int# -> State# s -> State# s
with can_fail = True
primop PrefetchAddrOp1 "prefetchAddr1#" GenPrimOp
Addr# -> Int# -> Addr#
with can_fail = True
----
primop PrefetchByteArrayOp0 "prefetchByteArray0#" GenPrimOp
ByteArray# -> Int# -> ByteArray#
with can_fail = True
primop PrefetchMutableByteArrayOp0 "prefetchMutableByteArray0#" GenPrimOp
MutableByteArray# s -> Int# -> State# s -> State# s
with can_fail = True
primop PrefetchAddrOp0 "prefetchAddr0#" GenPrimOp
Addr# -> Int# -> Addr#
with can_fail = True
------------------------------------------------------------------------
--- ---
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment