Refactor MachRegs.trivColorable to do unboxed accumulation

trivColorable was soaking up total 31% time, 41% alloc when
compiling SHA1.lhs with -O2 -fregs-graph on x86.

Refactoring to use unboxed accumulators and walk directly
over the UniqFM holding the set of conflicts reduces this 
to 17% time, 6% alloc.
parent 16dc208a
......@@ -278,7 +278,7 @@ cmmNativeGen dflags us cmm
-- graph coloring register allocation
let ((alloced, regAllocStats), usAlloc)
= {-# SCC "regAlloc(color)" #-}
= {-# SCC "RegAlloc(color)" #-}
initUs usLive
$ Color.regAlloc
generateRegAllocStats
......@@ -312,7 +312,7 @@ cmmNativeGen dflags us cmm
else do
-- do linear register allocation
let ((alloced, regAllocStats), usAlloc)
= {-# SCC "regAlloc(linear)" #-}
= {-# SCC "RegAlloc(linear)" #-}
initUs usLive
$ liftM unzip
$ mapUs Linear.regAlloc withLiveness
......
......@@ -82,7 +82,7 @@ data Node k cls color
, nodeConflicts :: UniqSet k
-- | Colors that cannot be used by this node.
, nodeExclusions :: UniqSet color
, nodeExclusions :: UniqSet color
-- | Colors that this node would prefer to be, in decending order.
, nodePreference :: [color]
......
......@@ -28,7 +28,6 @@ import UniqFM
import Data.List hiding (union)
import Data.Maybe
-- | Lookup a node from the graph.
lookupNode
:: Uniquable k
......@@ -447,6 +446,7 @@ setColor u color
u
{-# INLINE adjustWithDefaultUFM #-}
adjustWithDefaultUFM
:: Uniquable k
=> (a -> a) -> a -> k
......@@ -458,7 +458,7 @@ adjustWithDefaultUFM f def k map
map
k def
{-# INLINE adjustUFM #-}
adjustUFM
:: Uniquable k
=> (a -> a)
......
......@@ -103,6 +103,9 @@ import Unique
import UniqSet
import Constants
import FastTypes
import UniqFM
import GHC.Exts
#if powerpc_TARGET_ARCH
import Data.Word ( Word8, Word16, Word32 )
......@@ -444,24 +447,30 @@ instance Outputable Reg where
-- NOTE: This only works for arcitectures with just RcInteger and RcDouble
-- (which are disjoint) ie. x86, x86_64 and ppc
--
-- BL 2007/09
-- Doing a nice fold over the UniqSet makes trivColorable use
-- 32% of total compile time and 42% of total alloc when compiling SHA1.lhs from darcs.
{-
trivColorable :: RegClass -> UniqSet Reg -> UniqSet Reg -> Bool
trivColorable classN conflicts exclusions
= let
= let
acc :: Reg -> (Int, Int) -> (Int, Int)
acc r (cd, cf)
= case regClass r of
RcInteger -> (cd+1, cf)
RcDouble -> (cd, cf+1)
_ -> panic "MachRegs.trivColorable: reg class not handled"
tmp = foldUniqSet acc (0, 0) conflicts
(rsD, rsFP) = foldUniqSet acc tmp exclusions
tmp = foldUniqSet acc (0, 0) conflicts
(countInt, countFloat) = foldUniqSet acc tmp exclusions
squeese = worst rsD classN RcInteger
+ worst rsFP classN RcDouble
squeese = worst countInt classN RcInteger
+ worst countFloat classN RcDouble
in squeese < allocatableRegsInClass classN
-- | Worst case displacement
-- node N of classN has n neighbors of class C.
--
......@@ -480,6 +489,69 @@ worst n classN classC
-> case classC of
RcDouble -> min n (allocatableRegsInClass RcDouble)
RcInteger -> 0
-}
-- The number of allocatable regs is hard coded here so we can do a fast comparision
-- in trivColorable. It's ok if these numbers are _less_ than the actual number of
-- free regs, but they can't be more or the register conflict graph won't color.
--
-- There is an allocatableRegsInClass :: RegClass -> Int, but doing the unboxing
-- is too slow for us here.
--
-- Compare MachRegs.freeRegs and MachRegs.h to get these numbers.
--
#if i386_TARGET_ARCH
#define ALLOCATABLE_REGS_INTEGER 3#
#define ALLOCATABLE_REGS_DOUBLE 6#
#endif
#if x86_64_TARGET_ARCH
#define ALLOCATABLE_REGS_INTEGER 5#
#define ALLOCATABLE_REGS_DOUBLE 2#
#endif
#if powerpc_TARGET_ARCH
#define ALLOCATABLE_REGS_INTEGER 16#
#define ALLOCATABLE_REGS_DOUBLE 26#
#endif
{-# INLINE regClass #-}
trivColorable :: RegClass -> UniqSet Reg -> UniqSet Reg -> Bool
trivColorable classN conflicts exclusions
= {-# SCC "trivColorable" #-}
let
{-# INLINE isSqueesed #-}
isSqueesed cI cF ufm
= case ufm of
NodeUFM _ _ left right
-> case isSqueesed cI cF right of
(# s, cI', cF' #)
-> case s of
False -> isSqueesed cI' cF' left
True -> (# True, cI', cF' #)
LeafUFM _ reg
-> case regClass reg of
RcInteger
-> case cI +# 1# of
cI' -> (# cI' >=# ALLOCATABLE_REGS_INTEGER, cI', cF #)
RcDouble
-> case cF +# 1# of
cF' -> (# cF' >=# ALLOCATABLE_REGS_DOUBLE, cI, cF' #)
EmptyUFM
-> (# False, cI, cF #)
in case isSqueesed 0# 0# conflicts of
(# False, cI', cF' #)
-> case isSqueesed cI' cF' exclusions of
(# s, _, _ #) -> not s
(# True, _, _ #)
-> False
-- -----------------------------------------------------------------------------
......
......@@ -79,11 +79,12 @@ regAlloc_spin dump (spinCount :: Int) triv regsFree slotsFree debug_codeGraphs c
$$ text "slotsFree = " <> ppr (sizeUniqSet slotsFree))
-- build a conflict graph from the code.
graph <- buildGraph code
graph <- {-# SCC "BuildGraph" #-} buildGraph code
-- build a map of how many instructions each reg lives for.
-- this is lazy, it won't be computed unless we need to spill
let fmLife = plusUFMs_C (\(r1, l1) (_, l2) -> (r1, l1 + l2))
let fmLife = {-# SCC "LifetimeCount" #-} plusUFMs_C (\(r1, l1) (_, l2) -> (r1, l1 + l2))
$ map lifetimeCount code
-- record startup state
......@@ -101,7 +102,7 @@ regAlloc_spin dump (spinCount :: Int) triv regsFree slotsFree debug_codeGraphs c
-- try and color the graph
let (graph_colored, rsSpill, rmCoalesce)
= Color.colorGraph regsFree triv spill graph
= {-# SCC "ColorGraph" #-} Color.colorGraph regsFree triv spill graph
-- rewrite regs in the code that have been coalesced
let patchF reg = case lookupUFM rmCoalesce reg of
......@@ -147,7 +148,7 @@ regAlloc_spin dump (spinCount :: Int) triv regsFree slotsFree debug_codeGraphs c
-- spill the uncolored regs
(code_spilled, slotsFree', spillStats)
<- regSpill code_coalesced slotsFree rsSpill
-- recalculate liveness
let code_nat = map stripLive code_spilled
code_relive <- mapM regLiveness code_nat
......
......@@ -224,6 +224,7 @@ emptyStackMap = StackMap [0..maxSpillSlots] emptyUFM
getStackSlotFor :: StackMap -> Unique -> (StackMap,Int)
getStackSlotFor (StackMap [] _) _
= panic "RegAllocLinear.getStackSlotFor: out of stack slots"
getStackSlotFor fs@(StackMap (freeSlot:stack') reserved) reg =
case lookupUFM reserved reg of
Just slot -> (fs,slot)
......
......@@ -21,7 +21,8 @@ Basically, the things need to be in class @Uniquable@, and we use the
-- for details
module UniqFM (
UniqFM, -- abstract type
UniqFM(..), -- abstract type
-- (de-abstracted for MachRegs.trivColorable optimisation BL 2007/09)
emptyUFM,
unitUFM,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment