Commit 0b0a41f9 authored by Simon Marlow's avatar Simon Marlow

Teach the linear register allocator how to allocate more stack if necessary

This squashes the "out of spill slots" panic that occasionally happens
on x86, by adding instructions to bump and retreat the C stack pointer
as necessary.  The panic has become more common since the new codegen,
because we lump code into larger blocks, and the register allocator
isn't very good at reusing stack slots for spilling (see Note [extra
spill slots]).
parent 7bff9fa8
......@@ -133,16 +133,17 @@ The machine-dependent bits break down as follows:
data NcgImpl statics instr jumpDest = NcgImpl {
cmmTopCodeGen :: RawCmmDecl -> NatM [NatCmmDecl statics instr],
generateJumpTableForInstr :: DynFlags -> instr -> Maybe (NatCmmDecl statics instr),
generateJumpTableForInstr :: instr -> Maybe (NatCmmDecl statics instr),
getJumpDestBlockId :: jumpDest -> Maybe BlockId,
canShortcut :: instr -> Maybe jumpDest,
shortcutStatics :: (BlockId -> Maybe jumpDest) -> statics -> statics,
shortcutJump :: (BlockId -> Maybe jumpDest) -> instr -> instr,
pprNatCmmDecl :: NatCmmDecl statics instr -> SDoc,
maxSpillSlots :: DynFlags -> Int,
allocatableRegs :: Platform -> [RealReg],
maxSpillSlots :: Int,
allocatableRegs :: [RealReg],
ncg_x86fp_kludge :: [NatCmmDecl statics instr] -> [NatCmmDecl statics instr],
ncgExpandTop :: [NatCmmDecl statics instr] -> [NatCmmDecl statics instr],
ncgAllocMoreStack :: Int -> NatCmmDecl statics instr -> NatCmmDecl statics instr,
ncgMakeFarBranches :: [NatBasicBlock instr] -> [NatBasicBlock instr]
}
......@@ -154,15 +155,16 @@ nativeCodeGen dflags h us cmms
nCG' ncgImpl = nativeCodeGen' dflags ncgImpl h us cmms
x86NcgImpl = NcgImpl {
cmmTopCodeGen = X86.CodeGen.cmmTopCodeGen
,generateJumpTableForInstr = X86.CodeGen.generateJumpTableForInstr
,generateJumpTableForInstr = X86.CodeGen.generateJumpTableForInstr dflags
,getJumpDestBlockId = X86.Instr.getJumpDestBlockId
,canShortcut = X86.Instr.canShortcut
,shortcutStatics = X86.Instr.shortcutStatics
,shortcutJump = X86.Instr.shortcutJump
,pprNatCmmDecl = X86.Ppr.pprNatCmmDecl
,maxSpillSlots = X86.Instr.maxSpillSlots
,allocatableRegs = X86.Regs.allocatableRegs
,maxSpillSlots = X86.Instr.maxSpillSlots dflags
,allocatableRegs = X86.Regs.allocatableRegs platform
,ncg_x86fp_kludge = id
,ncgAllocMoreStack = X86.Instr.allocMoreStack platform
,ncgExpandTop = id
,ncgMakeFarBranches = id
}
......@@ -172,30 +174,32 @@ nativeCodeGen dflags h us cmms
ArchPPC ->
nCG' $ NcgImpl {
cmmTopCodeGen = PPC.CodeGen.cmmTopCodeGen
,generateJumpTableForInstr = PPC.CodeGen.generateJumpTableForInstr
,generateJumpTableForInstr = PPC.CodeGen.generateJumpTableForInstr dflags
,getJumpDestBlockId = PPC.RegInfo.getJumpDestBlockId
,canShortcut = PPC.RegInfo.canShortcut
,shortcutStatics = PPC.RegInfo.shortcutStatics
,shortcutJump = PPC.RegInfo.shortcutJump
,pprNatCmmDecl = PPC.Ppr.pprNatCmmDecl
,maxSpillSlots = PPC.Instr.maxSpillSlots
,allocatableRegs = PPC.Regs.allocatableRegs
,maxSpillSlots = PPC.Instr.maxSpillSlots dflags
,allocatableRegs = PPC.Regs.allocatableRegs platform
,ncg_x86fp_kludge = id
,ncgAllocMoreStack = noAllocMoreStack
,ncgExpandTop = id
,ncgMakeFarBranches = makeFarBranches
}
ArchSPARC ->
nCG' $ NcgImpl {
cmmTopCodeGen = SPARC.CodeGen.cmmTopCodeGen
,generateJumpTableForInstr = SPARC.CodeGen.generateJumpTableForInstr
,generateJumpTableForInstr = SPARC.CodeGen.generateJumpTableForInstr dflags
,getJumpDestBlockId = SPARC.ShortcutJump.getJumpDestBlockId
,canShortcut = SPARC.ShortcutJump.canShortcut
,shortcutStatics = SPARC.ShortcutJump.shortcutStatics
,shortcutJump = SPARC.ShortcutJump.shortcutJump
,pprNatCmmDecl = SPARC.Ppr.pprNatCmmDecl
,maxSpillSlots = SPARC.Instr.maxSpillSlots
,allocatableRegs = \_ -> SPARC.Regs.allocatableRegs
,maxSpillSlots = SPARC.Instr.maxSpillSlots dflags
,allocatableRegs = SPARC.Regs.allocatableRegs
,ncg_x86fp_kludge = id
,ncgAllocMoreStack = noAllocMoreStack
,ncgExpandTop = map SPARC.CodeGen.Expand.expandTop
,ncgMakeFarBranches = id
}
......@@ -206,6 +210,23 @@ nativeCodeGen dflags h us cmms
ArchUnknown ->
panic "nativeCodeGen: No NCG for unknown arch"
--
-- Allocating more stack space for spilling is currently only
-- supported for the linear register allocator on x86/x86_64, the rest
-- default to the panic below. To support allocating extra stack on
-- more platforms provide a definition of ncgAllocMoreStack.
--
noAllocMoreStack :: Int -> NatCmmDecl statics instr -> NatCmmDecl statics instr
noAllocMoreStack amount _
= panic $ "Register allocator: out of stack slots (need " ++ show amount ++ ")\n"
++ " If you are trying to compile SHA1.hs from the crypto library then this\n"
++ " is a known limitation in the linear allocator.\n"
++ "\n"
++ " Try enabling the graph colouring allocator with -fregs-graph instead."
++ " You can still file a bug report if you like.\n"
nativeCodeGen' :: (Outputable statics, Outputable instr, Instruction instr)
=> DynFlags
-> NcgImpl statics instr jumpDest
......@@ -419,7 +440,7 @@ cmmNativeGen dflags ncgImpl us cmm count
= foldr (\r -> plusUFM_C unionUniqSets
$ unitUFM (targetClassOfRealReg platform r) (unitUniqSet r))
emptyUFM
$ allocatableRegs ncgImpl platform
$ allocatableRegs ncgImpl
-- do the graph coloring register allocation
let ((alloced, regAllocStats), usAlloc)
......@@ -428,7 +449,7 @@ cmmNativeGen dflags ncgImpl us cmm count
$ Color.regAlloc
dflags
alloc_regs
(mkUniqSet [0 .. maxSpillSlots ncgImpl dflags])
(mkUniqSet [0 .. maxSpillSlots ncgImpl])
withLiveness
-- dump out what happened during register allocation
......@@ -457,11 +478,20 @@ cmmNativeGen dflags ncgImpl us cmm count
else do
-- do linear register allocation
let reg_alloc proc = do
(alloced, maybe_more_stack, ra_stats) <-
Linear.regAlloc dflags proc
case maybe_more_stack of
Nothing -> return ( alloced, ra_stats )
Just amount ->
return ( ncgAllocMoreStack ncgImpl amount alloced
, ra_stats )
let ((alloced, regAllocStats), usAlloc)
= {-# SCC "RegAlloc" #-}
initUs usLive
$ liftM unzip
$ mapM (Linear.regAlloc dflags) withLiveness
$ mapM reg_alloc withLiveness
dumpIfSet_dyn dflags
Opt_D_dump_asm_regalloc "Registers allocated"
......@@ -490,7 +520,7 @@ cmmNativeGen dflags ncgImpl us cmm count
---- generate jump tables
let tabled =
{-# SCC "generateJumpTables" #-}
generateJumpTables dflags ncgImpl kludged
generateJumpTables ncgImpl kludged
---- shortcut branches
let shorted =
......@@ -711,12 +741,12 @@ makeFarBranches blocks
-- Analyzes all native code and generates data sections for all jump
-- table instructions.
generateJumpTables
:: DynFlags -> NcgImpl statics instr jumpDest
-> [NatCmmDecl statics instr] -> [NatCmmDecl statics instr]
generateJumpTables dflags ncgImpl xs = concatMap f xs
:: NcgImpl statics instr jumpDest
-> [NatCmmDecl statics instr] -> [NatCmmDecl statics instr]
generateJumpTables ncgImpl xs = concatMap f xs
where f p@(CmmProc _ _ (ListGraph xs)) = p : concatMap g xs
f p = [p]
g (BasicBlock _ xs) = catMaybes (map (generateJumpTableForInstr ncgImpl dflags) xs)
g (BasicBlock _ xs) = catMaybes (map (generateJumpTableForInstr ncgImpl) xs)
-- -----------------------------------------------------------------------------
-- Shortcut branches
......
......@@ -163,3 +163,16 @@ class Instruction instr where
-> [instr]
-- Subtract an amount from the C stack pointer
mkStackAllocInstr
:: Platform -- TODO: remove (needed by x86/x86_64
-- because they share an Instr type)
-> Int
-> instr
-- Add an amount to the C stack pointer
mkStackDeallocInstr
:: Platform -- TODO: remove (needed by x86/x86_64
-- because they share an Instr type)
-> Int
-> instr
......@@ -64,6 +64,8 @@ instance Instruction Instr where
mkRegRegMoveInstr _ = ppc_mkRegRegMoveInstr
takeRegRegMoveInstr = ppc_takeRegRegMoveInstr
mkJumpInstr = ppc_mkJumpInstr
mkStackAllocInstr = panic "no ppc_mkStackAllocInstr"
mkStackDeallocInstr = panic "no ppc_mkStackDeallocInstr"
-- -----------------------------------------------------------------------------
......
......@@ -139,22 +139,27 @@ regAlloc
:: (Outputable instr, Instruction instr)
=> DynFlags
-> LiveCmmDecl statics instr
-> UniqSM (NatCmmDecl statics instr, Maybe RegAllocStats)
-> UniqSM ( NatCmmDecl statics instr
, Maybe Int -- number of extra stack slots required,
-- beyond maxSpillSlots
, Maybe RegAllocStats)
regAlloc _ (CmmData sec d)
= return
( CmmData sec d
, Nothing
, Nothing )
regAlloc _ (CmmProc (LiveInfo info _ _ _) lbl [])
= return ( CmmProc info lbl (ListGraph [])
, Nothing
, Nothing )
regAlloc dflags (CmmProc static lbl sccs)
| LiveInfo info (Just first_id) (Just block_live) _ <- static
= do
-- do register allocation on each component.
(final_blocks, stats)
(final_blocks, stats, stack_use)
<- linearRegAlloc dflags first_id block_live sccs
-- make sure the block that was first in the input list
......@@ -162,7 +167,15 @@ regAlloc dflags (CmmProc static lbl sccs)
let ((first':_), rest')
= partition ((== first_id) . blockId) final_blocks
let max_spill_slots = maxSpillSlots dflags
extra_stack
| stack_use > max_spill_slots
= Just (stack_use - max_spill_slots)
| otherwise
= Nothing
return ( CmmProc info lbl (ListGraph (first' : rest'))
, extra_stack
, Just stats)
-- bogus. to make non-exhaustive match warning go away.
......@@ -184,7 +197,7 @@ linearRegAlloc
-> BlockId -- ^ the first block
-> BlockMap RegSet -- ^ live regs on entry to each basic block
-> [SCC (LiveBasicBlock instr)] -- ^ instructions annotated with "deaths"
-> UniqSM ([NatBasicBlock instr], RegAllocStats)
-> UniqSM ([NatBasicBlock instr], RegAllocStats, Int)
linearRegAlloc dflags first_id block_live sccs
= let platform = targetPlatform dflags
......@@ -204,14 +217,14 @@ linearRegAlloc'
-> BlockId -- ^ the first block
-> BlockMap RegSet -- ^ live regs on entry to each basic block
-> [SCC (LiveBasicBlock instr)] -- ^ instructions annotated with "deaths"
-> UniqSM ([NatBasicBlock instr], RegAllocStats)
-> UniqSM ([NatBasicBlock instr], RegAllocStats, Int)
linearRegAlloc' dflags initFreeRegs first_id block_live sccs
= do us <- getUs
let (_, _, stats, blocks) =
let (_, stack, stats, blocks) =
runR dflags emptyBlockMap initFreeRegs emptyRegMap (emptyStackMap dflags) us
$ linearRA_SCCs first_id block_live [] sccs
return (blocks, stats)
return (blocks, stats, getStackUse stack)
linearRA_SCCs :: (FR freeRegs, Instruction instr, Outputable instr)
......
......@@ -21,7 +21,8 @@ module RegAlloc.Linear.StackMap (
StackSlot,
StackMap(..),
emptyStackMap,
getStackSlotFor
getStackSlotFor,
getStackUse
)
where
......@@ -40,7 +41,7 @@ type StackSlot = Int
data StackMap
= StackMap
{ -- | The slots that are still available to be allocated.
stackMapFreeSlots :: [StackSlot]
stackMapNextFreeSlot :: !Int
-- | Assignment of vregs to stack slots.
, stackMapAssignment :: UniqFM StackSlot }
......@@ -48,7 +49,7 @@ data StackMap
-- | An empty stack map, with all slots available.
emptyStackMap :: DynFlags -> StackMap
emptyStackMap dflags = StackMap [0 .. maxSpillSlots dflags] emptyUFM
emptyStackMap dflags = StackMap 0 emptyUFM
-- | If this vreg unique already has a stack assignment then return the slot number,
......@@ -56,24 +57,13 @@ emptyStackMap dflags = StackMap [0 .. maxSpillSlots dflags] emptyUFM
--
getStackSlotFor :: StackMap -> Unique -> (StackMap, Int)
getStackSlotFor (StackMap [] _) _
-- This happens all the time when trying to compile darcs' SHA1.hs, see Track #1993
-- SHA1.lhs has also been added to the Crypto library on Hackage,
-- so we see this all the time.
--
-- It would be better to automatically invoke the graph allocator, or do something
-- else besides panicing, but that's a job for a different day. -- BL 2009/02
--
= panic $ "RegAllocLinear.getStackSlotFor: out of stack slots\n"
++ " If you are trying to compile SHA1.hs from the crypto library then this\n"
++ " is a known limitation in the linear allocator.\n"
++ "\n"
++ " Try enabling the graph colouring allocator with -fregs-graph instead."
++ " You can still file a bug report if you like.\n"
getStackSlotFor fs@(StackMap (freeSlot:stack') reserved) reg =
case lookupUFM reserved reg of
Just slot -> (fs, slot)
Nothing -> (StackMap stack' (addToUFM reserved reg freeSlot), freeSlot)
getStackSlotFor fs@(StackMap _ reserved) reg
| Just slot <- lookupUFM reserved reg = (fs, slot)
getStackSlotFor (StackMap freeSlot reserved) reg =
(StackMap (freeSlot+1) (addToUFM reserved reg freeSlot), freeSlot)
-- | Return the number of stack slots that were allocated
getStackUse :: StackMap -> Int
getStackUse (StackMap freeSlot _) = freeSlot
......@@ -108,6 +108,8 @@ instance Instruction Instr where
mkRegRegMoveInstr = sparc_mkRegRegMoveInstr
takeRegRegMoveInstr = sparc_takeRegRegMoveInstr
mkJumpInstr = sparc_mkJumpInstr
mkStackAllocInstr = panic "no sparc_mkStackAllocInstr"
mkStackDeallocInstr = panic "no sparc_mkStackDeallocInstr"
-- | SPARC instruction set.
......
......@@ -11,7 +11,7 @@
module X86.Instr (Instr(..), Operand(..),
getJumpDestBlockId, canShortcut, shortcutStatics,
shortcutJump, i386_insert_ffrees,
shortcutJump, i386_insert_ffrees, allocMoreStack,
maxSpillSlots, archWordSize)
where
......@@ -58,6 +58,8 @@ instance Instruction Instr where
mkRegRegMoveInstr = x86_mkRegRegMoveInstr
takeRegRegMoveInstr = x86_takeRegRegMoveInstr
mkJumpInstr = x86_mkJumpInstr
mkStackAllocInstr = x86_mkStackAllocInstr
mkStackDeallocInstr = x86_mkStackDeallocInstr
-- -----------------------------------------------------------------------------
......@@ -620,14 +622,13 @@ x86_mkSpillInstr
-> Instr
x86_mkSpillInstr dflags reg delta slot
= let off = spillSlotToOffset dflags slot
= let off = spillSlotToOffset dflags slot - delta
in
let off_w = (off - delta) `div` (if is32Bit then 4 else 8)
in case targetClassOfReg platform reg of
case targetClassOfReg platform reg of
RcInteger -> MOV (archWordSize is32Bit)
(OpReg reg) (OpAddr (spRel dflags off_w))
RcDouble -> GST FF80 reg (spRel dflags off_w) {- RcFloat/RcDouble -}
RcDoubleSSE -> MOV FF64 (OpReg reg) (OpAddr (spRel dflags off_w))
(OpReg reg) (OpAddr (spRel dflags off))
RcDouble -> GST FF80 reg (spRel dflags off) {- RcFloat/RcDouble -}
RcDoubleSSE -> MOV FF64 (OpReg reg) (OpAddr (spRel dflags off))
_ -> panic "X86.mkSpillInstr: no match"
where platform = targetPlatform dflags
is32Bit = target32Bit platform
......@@ -641,14 +642,13 @@ x86_mkLoadInstr
-> Instr
x86_mkLoadInstr dflags reg delta slot
= let off = spillSlotToOffset dflags slot
= let off = spillSlotToOffset dflags slot - delta
in
let off_w = (off-delta) `div` (if is32Bit then 4 else 8)
in case targetClassOfReg platform reg of
case targetClassOfReg platform reg of
RcInteger -> MOV (archWordSize is32Bit)
(OpAddr (spRel dflags off_w)) (OpReg reg)
RcDouble -> GLD FF80 (spRel dflags off_w) reg {- RcFloat/RcDouble -}
RcDoubleSSE -> MOV FF64 (OpAddr (spRel dflags off_w)) (OpReg reg)
(OpAddr (spRel dflags off)) (OpReg reg)
RcDouble -> GLD FF80 (spRel dflags off) reg {- RcFloat/RcDouble -}
RcDoubleSSE -> MOV FF64 (OpAddr (spRel dflags off)) (OpReg reg)
_ -> panic "X86.x86_mkLoadInstr"
where platform = targetPlatform dflags
is32Bit = target32Bit platform
......@@ -666,12 +666,7 @@ maxSpillSlots dflags
-- the C stack pointer.
spillSlotToOffset :: DynFlags -> Int -> Int
spillSlotToOffset dflags slot
| slot >= 0 && slot < maxSpillSlots dflags
= 64 + spillSlotSize dflags * slot
| otherwise
= pprPanic "spillSlotToOffset:"
( text "invalid spill location: " <> int slot
$$ text "maxSpillSlots: " <> int (maxSpillSlots dflags))
--------------------------------------------------------------------------------
......@@ -744,8 +739,25 @@ x86_mkJumpInstr id
= [JXX ALWAYS id]
x86_mkStackAllocInstr
:: Platform
-> Int
-> Instr
x86_mkStackAllocInstr platform amount
= case platformArch platform of
ArchX86 -> SUB II32 (OpImm (ImmInt amount)) (OpReg esp)
ArchX86_64 -> SUB II64 (OpImm (ImmInt amount)) (OpReg rsp)
_ -> panic "x86_mkStackAllocInstr"
x86_mkStackDeallocInstr
:: Platform
-> Int
-> Instr
x86_mkStackDeallocInstr platform amount
= case platformArch platform of
ArchX86 -> ADD II32 (OpImm (ImmInt amount)) (OpReg esp)
ArchX86_64 -> ADD II64 (OpImm (ImmInt amount)) (OpReg rsp)
_ -> panic "x86_mkStackDeallocInstr"
i386_insert_ffrees
:: [GenBasicBlock Instr]
......@@ -753,18 +765,12 @@ i386_insert_ffrees
i386_insert_ffrees blocks
| or (map (any is_G_instr) [ instrs | BasicBlock _ instrs <- blocks ])
= map ffree_before_nonlocal_transfers blocks
= map insertGFREEs blocks
| otherwise
= blocks
where
ffree_before_nonlocal_transfers (BasicBlock id insns)
= BasicBlock id (foldr p [] insns)
where p insn r = case insn of
CALL _ _ -> GFREE : insn : r
JMP _ _ -> GFREE : insn : r
JXX_GBL _ _ -> panic "i386_insert_ffrees: cannot handle JXX_GBL"
_ -> insn : r
where
insertGFREEs (BasicBlock id insns)
= BasicBlock id (insertBeforeNonlocalTransfers GFREE insns)
-- if you ever add a new FP insn to the fake x86 FP insn set,
-- you must update this too
......@@ -796,6 +802,57 @@ is_G_instr instr
_ -> False
--
-- Note [extra spill slots]
--
-- If the register allocator used more spill slots than we have
-- pre-allocated (rESERVED_C_STACK_BYTES), then we must allocate more
-- C stack space on entry and exit from this proc. Therefore we
-- insert a "sub $N, %rsp" at every entry point, and an "add $N, %rsp"
-- before every non-local jump.
--
-- This became necessary when the new codegen started bundling entire
-- functions together into one proc, because the register allocator
-- assigns a different stack slot to each virtual reg within a proc.
-- To avoid using so many slots we could also:
--
-- - split up the proc into connected components before code generator
--
-- - rename the virtual regs, so that we re-use vreg names and hence
-- stack slots for non-overlapping vregs.
--
allocMoreStack
:: Platform
-> Int
-> NatCmmDecl statics X86.Instr.Instr
-> NatCmmDecl statics X86.Instr.Instr
allocMoreStack _ _ top@(CmmData _ _) = top
allocMoreStack platform amount (CmmProc info lbl (ListGraph code)) =
CmmProc info lbl (ListGraph (map insert_stack_insns code))
where
alloc = mkStackAllocInstr platform amount
dealloc = mkStackDeallocInstr platform amount
is_entry_point id = id `mapMember` info
insert_stack_insns (BasicBlock id insns)
| is_entry_point id = BasicBlock id (alloc : block')
| otherwise = BasicBlock id block'
where
block' = insertBeforeNonlocalTransfers dealloc insns
insertBeforeNonlocalTransfers :: Instr -> [Instr] -> [Instr]
insertBeforeNonlocalTransfers insert insns
= foldr p [] insns
where p insn r = case insn of
CALL _ _ -> insert : insn : r
JMP _ _ -> insert : insn : r
JXX_GBL _ _ -> panic "insertBeforeNonlocalTransfers: cannot handle JXX_GBL"
_ -> insn : r
data JumpDest = DestBlockId BlockId | DestImm Imm
getJumpDestBlockId :: JumpDest -> Maybe BlockId
......
......@@ -196,13 +196,13 @@ addrModeRegs _ = []
spRel :: DynFlags
-> Int -- ^ desired stack offset in words, positive or negative
-> Int -- ^ desired stack offset in bytes, positive or negative
-> AddrMode
spRel dflags n
| target32Bit (targetPlatform dflags)
= AddrBaseIndex (EABaseReg esp) EAIndexNone (ImmInt (n * wORD_SIZE dflags))
= AddrBaseIndex (EABaseReg esp) EAIndexNone (ImmInt n)
| otherwise
= AddrBaseIndex (EABaseReg rsp) EAIndexNone (ImmInt (n * wORD_SIZE dflags))
= AddrBaseIndex (EABaseReg rsp) EAIndexNone (ImmInt n)
-- The register numbers must fit into 32 bits on x86, so that we can
-- use a Word32 to represent the set of free registers in the register
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment