Commit 5b105389 authored by Edward Z. Yang's avatar Edward Z. Yang
Browse files

Port "Make array copy primops inline" and related patches to new codegen.

The following patches were ported:

    d0faaa6f Fix segfault in array copy primops on 32-bit
    18691d44 Make assignTemp_ less pessimistic
    9c23f06f

 Make array copy primops inline
Signed-off-by: Edward Z. Yang's avatarEdward Z. Yang <ezyang@mit.edu>
parent f1fc7698
......@@ -109,6 +109,9 @@ emitForeignCall results (CCall (CCallSpec target cconv safety)) args live
-- alternative entry point, used by CmmParse
-- the new code generator has utility function emitCCall and emitPrimCall
-- which should be used instead of this (the equivalent emitForeignCall
-- is not presently exported.)
emitForeignCall'
:: Safety
-> HintedCmmFormals -- where to put the results
......
......@@ -701,6 +701,8 @@ whenC :: Bool -> Code -> Code
whenC True code = code
whenC False _ = nopC
-- Corresponds to 'emit' in new code generator with a smart constructor
-- from cmm/MkGraph.hs
stmtC :: CmmStmt -> Code
stmtC stmt = emitCgStmt (CgStmt stmt)
......
......@@ -638,6 +638,13 @@ setInfo closure_ptr info_ptr = CmmStore closure_ptr info_ptr
-- ----------------------------------------------------------------------------
-- Copying pointer arrays
-- EZY: This code has an unusually high amount of assignTemp calls, seen
-- nowhere else in the code generator. This is mostly because these
-- "primitive" ops result in a surprisingly large amount of code. It
-- will likely be worthwhile to optimize what is emitted here, so that
-- our optimization passes don't waste time repeatedly optimizing the
-- same bits of code.
-- | Takes a source 'Array#', an offset in the source array, a
-- destination 'MutableArray#', an offset into the destination array,
-- and the number of elements to copy. Copies the given number of
......
......@@ -17,7 +17,11 @@ import StgCmmForeign
import StgCmmEnv
import StgCmmMonad
import StgCmmUtils
import StgCmmTicky
import StgCmmHeap
import StgCmmProf
import BasicTypes
import MkGraph
import StgSyn
import CmmDecl
......@@ -281,6 +285,21 @@ emitPrimOp [res] UnsafeFreezeArrayOp [arg]
emitPrimOp [res] UnsafeFreezeByteArrayOp [arg]
= emit (mkAssign (CmmLocal res) arg)
-- Copying pointer arrays
emitPrimOp [] CopyArrayOp [src,src_off,dst,dst_off,n] =
doCopyArrayOp src src_off dst dst_off n
emitPrimOp [] CopyMutableArrayOp [src,src_off,dst,dst_off,n] =
doCopyMutableArrayOp src src_off dst dst_off n
emitPrimOp [res] CloneArrayOp [src,src_off,n] =
emitCloneArray mkMAP_FROZEN_infoLabel res src src_off n
emitPrimOp [res] CloneMutableArrayOp [src,src_off,n] =
emitCloneArray mkMAP_DIRTY_infoLabel res src src_off n
emitPrimOp [res] FreezeArrayOp [src,src_off,n] =
emitCloneArray mkMAP_FROZEN_infoLabel res src src_off n
emitPrimOp [res] ThawArrayOp [src,src_off,n] =
emitCloneArray mkMAP_DIRTY_infoLabel res src src_off n
-- Reading/writing pointer arrays
emitPrimOp [r] ReadArrayOp [obj,ix] = doReadPtrArrayOp r obj ix
......@@ -684,3 +703,193 @@ cmmLoadIndexOffExpr off ty base idx
setInfo :: CmmExpr -> CmmExpr -> CmmAGraph
setInfo closure_ptr info_ptr = mkStore closure_ptr info_ptr
-- ----------------------------------------------------------------------------
-- Copying pointer arrays
-- EZY: This code has an unusually high amount of assignTemp calls, seen
-- nowhere else in the code generator. This is mostly because these
-- "primitive" ops result in a surprisingly large amount of code. It
-- will likely be worthwhile to optimize what is emitted here, so that
-- our optimization passes don't waste time repeatedly optimizing the
-- same bits of code.
-- More closely imitates 'assignTemp' from the old code generator, which
-- returns a CmmExpr rather than a LocalReg.
assignTempE :: CmmExpr -> FCode CmmExpr
assignTempE e = do
t <- assignTemp e
return (CmmReg (CmmLocal t))
-- | Takes a source 'Array#', an offset in the source array, a
-- destination 'MutableArray#', an offset into the destination array,
-- and the number of elements to copy. Copies the given number of
-- elements from the source array to the destination array.
doCopyArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
-> FCode ()
doCopyArrayOp = emitCopyArray copy
where
-- Copy data (we assume the arrays aren't overlapping since
-- they're of different types)
copy _src _dst = emitMemcpyCall
-- | Takes a source 'MutableArray#', an offset in the source array, a
-- destination 'MutableArray#', an offset into the destination array,
-- and the number of elements to copy. Copies the given number of
-- elements from the source array to the destination array.
doCopyMutableArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
-> FCode ()
doCopyMutableArrayOp = emitCopyArray copy
where
-- The only time the memory might overlap is when the two arrays
-- we were provided are the same array!
-- TODO: Optimize branch for common case of no aliasing.
copy src dst dst_p src_p bytes = do
[moveCall, cpyCall] <- forkAlts [
getCode $ emitMemmoveCall dst_p src_p bytes,
getCode $ emitMemcpyCall dst_p src_p bytes
]
emit $ mkCmmIfThenElse (cmmEqWord src dst) moveCall cpyCall
emitCopyArray :: (CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
-> FCode ())
-> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr
-> FCode ()
emitCopyArray copy src0 src_off0 dst0 dst_off0 n0 = do
-- Passed as arguments (be careful)
src <- assignTempE src0
src_off <- assignTempE src_off0
dst <- assignTempE dst0
dst_off <- assignTempE dst_off0
n <- assignTempE n0
-- Set the dirty bit in the header.
emit (setInfo dst (CmmLit (CmmLabel mkMAP_DIRTY_infoLabel)))
dst_elems_p <- assignTempE $ cmmOffsetB dst arrPtrsHdrSize
dst_p <- assignTempE $ cmmOffsetExprW dst_elems_p dst_off
src_p <- assignTempE $ cmmOffsetExprW (cmmOffsetB src arrPtrsHdrSize) src_off
bytes <- assignTempE $ cmmMulWord n (CmmLit (mkIntCLit wORD_SIZE))
copy src dst dst_p src_p bytes
-- The base address of the destination card table
dst_cards_p <- assignTempE $ cmmOffsetExprW dst_elems_p (loadArrPtrsSize dst)
emitSetCards dst_off dst_cards_p n
-- | Takes an info table label, a register to return the newly
-- allocated array in, a source array, an offset in the source array,
-- and the number of elements to copy. Allocates a new array and
-- initializes it form the source array.
emitCloneArray :: CLabel -> CmmFormal -> CmmExpr -> CmmExpr -> CmmExpr
-> FCode ()
emitCloneArray info_p res_r src0 src_off0 n0 = do
-- Passed as arguments (be careful)
src <- assignTempE src0
src_off <- assignTempE src_off0
n <- assignTempE n0
card_words <- assignTempE $ (n `cmmUShrWord`
(CmmLit (mkIntCLit mUT_ARR_PTRS_CARD_BITS)))
`cmmAddWord` CmmLit (mkIntCLit 1)
size <- assignTempE $ n `cmmAddWord` card_words
words <- assignTempE $ arrPtrsHdrSizeW `cmmAddWord` size
arr_r <- newTemp bWord
emitAllocateCall arr_r myCapability words
tickyAllocPrim (CmmLit (mkIntCLit arrPtrsHdrSize)) (n `cmmMulWord` wordSize)
(CmmLit $ mkIntCLit 0)
let arr = CmmReg (CmmLocal arr_r)
emitSetDynHdr arr (CmmLit (CmmLabel info_p)) curCCSAddr
emit $ mkStore (cmmOffsetB arr (fixedHdrSize * wORD_SIZE +
oFFSET_StgMutArrPtrs_ptrs)) n
emit $ mkStore (cmmOffsetB arr (fixedHdrSize * wORD_SIZE +
oFFSET_StgMutArrPtrs_size)) size
dst_p <- assignTempE $ cmmOffsetB arr arrPtrsHdrSize
src_p <- assignTempE $ cmmOffsetExprW (cmmOffsetB src arrPtrsHdrSize)
src_off
emitMemcpyCall dst_p src_p (n `cmmMulWord` wordSize)
emitMemsetCall (cmmOffsetExprW dst_p n)
(CmmLit (mkIntCLit 1))
(card_words `cmmMulWord` wordSize)
emit $ mkAssign (CmmLocal res_r) arr
where
arrPtrsHdrSizeW = CmmLit $ mkIntCLit $ fixedHdrSize +
(sIZEOF_StgMutArrPtrs_NoHdr `div` wORD_SIZE)
wordSize = CmmLit (mkIntCLit wORD_SIZE)
myCapability = CmmReg baseReg `cmmSubWord`
CmmLit (mkIntCLit oFFSET_Capability_r)
-- | Takes and offset in the destination array, the base address of
-- the card table, and the number of elements affected (*not* the
-- number of cards). Marks the relevant cards as dirty.
emitSetCards :: CmmExpr -> CmmExpr -> CmmExpr -> FCode ()
emitSetCards dst_start dst_cards_start n = do
start_card <- assignTempE $ card dst_start
emitMemsetCall (dst_cards_start `cmmAddWord` start_card)
(CmmLit (mkIntCLit 1))
((card (dst_start `cmmAddWord` n) `cmmSubWord` start_card)
`cmmAddWord` CmmLit (mkIntCLit 1))
where
-- Convert an element index to a card index
card i = i `cmmUShrWord` (CmmLit (mkIntCLit mUT_ARR_PTRS_CARD_BITS))
-- | Emit a call to @memcpy@.
emitMemcpyCall :: CmmExpr -> CmmExpr -> CmmExpr -> FCode ()
emitMemcpyCall dst src n = do
emitCCall
[ {-no results-} ]
memcpy
[ (dst, AddrHint)
, (src, AddrHint)
, (n, NoHint)
]
where
memcpy = CmmLit (CmmLabel (mkForeignLabel (fsLit "memcpy") Nothing
ForeignLabelInExternalPackage IsFunction))
-- | Emit a call to @memmove@.
emitMemmoveCall :: CmmExpr -> CmmExpr -> CmmExpr -> FCode ()
emitMemmoveCall dst src n = do
emitCCall
[ {- no results -} ]
memmove
[ (dst, AddrHint)
, (src, AddrHint)
, (n, NoHint)
]
where
memmove = CmmLit (CmmLabel (mkForeignLabel (fsLit "memmove") Nothing
ForeignLabelInExternalPackage IsFunction))
-- | Emit a call to @memset@. The second argument must fit inside an
-- unsigned char.
emitMemsetCall :: CmmExpr -> CmmExpr -> CmmExpr -> FCode ()
emitMemsetCall dst c n = do
emitCCall
[ {- no results -} ]
memset
[ (dst, AddrHint)
, (c, NoHint)
, (n, NoHint)
]
where
memset = CmmLit (CmmLabel (mkForeignLabel (fsLit "memset") Nothing
ForeignLabelInExternalPackage IsFunction))
-- | Emit a call to @allocate@.
emitAllocateCall :: LocalReg -> CmmExpr -> CmmExpr -> FCode ()
emitAllocateCall res cap n = do
emitCCall
[ (res, AddrHint) ]
allocate
[ (cap, AddrHint)
, (n, NoHint)
]
where
allocate = CmmLit (CmmLabel (mkForeignLabel (fsLit "allocate") Nothing
ForeignLabelInExternalPackage IsFunction))
......@@ -23,7 +23,7 @@ module StgCmmUtils (
callerSaves, callerSaveVolatileRegs, get_GlobalReg_addr,
cmmAndWord, cmmOrWord, cmmNegate, cmmEqWord, cmmNeWord,
cmmUGtWord,
cmmUGtWord, cmmSubWord, cmmMulWord, cmmAddWord, cmmUShrWord,
cmmOffsetExprW, cmmOffsetExprB,
cmmRegOffW, cmmRegOffB,
cmmLabelOffW, cmmLabelOffB,
......@@ -160,7 +160,8 @@ cmmLoadIndexW base off ty = CmmLoad (cmmOffsetW base off) ty
-----------------------
cmmULtWord, cmmUGeWord, cmmUGtWord, cmmSubWord,
cmmNeWord, cmmEqWord, cmmOrWord, cmmAndWord
cmmNeWord, cmmEqWord, cmmOrWord, cmmAndWord,
cmmUShrWord, cmmAddWord, cmmMulWord
:: CmmExpr -> CmmExpr -> CmmExpr
cmmOrWord e1 e2 = CmmMachOp mo_wordOr [e1, e2]
cmmAndWord e1 e2 = CmmMachOp mo_wordAnd [e1, e2]
......@@ -170,8 +171,10 @@ cmmULtWord e1 e2 = CmmMachOp mo_wordULt [e1, e2]
cmmUGeWord e1 e2 = CmmMachOp mo_wordUGe [e1, e2]
cmmUGtWord e1 e2 = CmmMachOp mo_wordUGt [e1, e2]
--cmmShlWord e1 e2 = CmmMachOp mo_wordShl [e1, e2]
--cmmUShrWord e1 e2 = CmmMachOp mo_wordUShr [e1, e2]
cmmUShrWord e1 e2 = CmmMachOp mo_wordUShr [e1, e2]
cmmAddWord e1 e2 = CmmMachOp mo_wordAdd [e1, e2]
cmmSubWord e1 e2 = CmmMachOp mo_wordSub [e1, e2]
cmmMulWord e1 e2 = CmmMachOp mo_wordMul [e1, e2]
cmmNegate :: CmmExpr -> CmmExpr
cmmNegate (CmmLit (CmmInt n rep)) = CmmLit (CmmInt (-n) rep)
......@@ -550,7 +553,13 @@ mkByteStringCLit bytes
-------------------------------------------------------------------------
assignTemp :: CmmExpr -> FCode LocalReg
-- Make sure the argument is in a local register
-- Make sure the argument is in a local register.
-- We don't bother being particularly aggressive with avoiding
-- unnecessary local registers, since we can rely on a later
-- optimization pass to inline as necessary (and skipping out
-- on things like global registers can be a little dangerous
-- due to them being trashed on foreign calls--though it means
-- the optimization pass doesn't have to do as much work)
assignTemp (CmmReg (CmmLocal reg)) = return reg
assignTemp e = do { uniq <- newUnique
; let reg = LocalReg uniq (cmmExprType e)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment