Commit d61c3ac1 authored by Jan Stolarek's avatar Jan Stolarek

Optimize self-recursive tail calls

This patch implements the loopification optimization. It was described
in "Low-level code optimisations in the Glasgow Haskell Compiler" by
Krzysztof Woś, but we use a different approach here. Krzysztof's
approach was to perform the optimization as a Cmm-to-Cmm pass. Our
approach is to generate properly optimized tail calls in the code
generator, which saves us the trouble of processing Cmm. This idea
was proposed by Simon Marlow. Implementation details are explained
in Note [Self-recursive tail calls].
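
As a rough illustration (this example is not part of the patch), the
optimization targets saturated self-recursive tail calls such as the
recursive call in the hypothetical function below:

    -- 'count' calls itself in a tail position with exactly as many
    -- arguments as its arity, so the call can be compiled as a jump
    -- back to the start of the function's body instead of a full call.
    count :: Int -> Int -> Int
    count acc 0 = acc
    count acc n = count (acc + 1) (n - 1)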

Performance of most nofib benchmarks is not affected. There are
some benchmarks that show 5-7% improvement, with an average improvement
of 2.6%. Further investigation would be needed to determine whether
this is benchmarking noise or whether this optimization really makes
some class of programs faster.

As a minor cleanup, this patch renames forkProc to forkLneBody.
It also moves some data declarations from StgCmmMonad to
StgCmmClosure, because they are needed there and it seems that
StgCmmClosure is on top of the whole StgCmm* hierarchy.
parent 1d1ab12d
......@@ -30,6 +30,7 @@ import StgCmmForeign (emitPrimCall)
import MkGraph
import CoreSyn ( AltCon(..) )
import SMRep
import BlockId
import Cmm
import CmmInfo
import CmmUtils
......@@ -476,7 +477,17 @@ closureCodeBody top_lvl bndr cl_info cc args arity body fv_details
; let node_points = nodeMustPointToIt dflags lf_info
node' = if node_points then Just node else Nothing
; when node_points (ldvEnterClosure cl_info)
-- Emit a new label that might be the header
-- of a self-recursive tail call. See Note
-- [Self-recursive tail calls] in StgCmmExpr
; u <- newUnique
; let loop_header_id = mkBlockId u
; emitLabel loop_header_id
-- Extend reader monad with information that
-- self-recursive tail calls can be optimized into local
-- jumps
; withSelfLoop (bndr, loop_header_id, arg_regs) $ do
{
-- Main payload
; entryHeapCheck cl_info node' arity arg_regs $ do
{ -- ticky after heap check to avoid double counting
......@@ -490,7 +501,8 @@ closureCodeBody top_lvl bndr cl_info cc args arity body fv_details
-- heap check, to reduce live vars over check
; when node_points $ load_fvs node lf_info fv_bindings
; void $ cgExpr body
}}
}}}
}
-- A function closure pointer may be tagged, so we
......
......@@ -27,10 +27,9 @@ module StgCmmClosure (
lfDynTag,
maybeIsLFCon, isLFThunk, isLFReEntrant, lfUpdatable,
nodeMustPointToIt,
CallMethod(..), getCallMethod,
isKnownFun, funTag, tagForArity,
-- * Used by other modules
CgLoc(..), SelfLoopInfo, CallMethod(..),
nodeMustPointToIt, isKnownFun, funTag, tagForArity, getCallMethod,
-- * ClosureInfo
ClosureInfo,
......@@ -69,11 +68,14 @@ module StgCmmClosure (
import StgSyn
import SMRep
import Cmm
import PprCmmExpr()
import BlockId
import CLabel
import Id
import IdInfo
import DataCon
import FastString
import Name
import Type
import TypeRep
......@@ -84,6 +86,37 @@ import Outputable
import DynFlags
import Util
-----------------------------------------------------------------------------
-- Data types and synonyms
-----------------------------------------------------------------------------
-- These data types are mostly used by other modules, especially StgCmmMonad,
-- but we define them here because some functions in this module need to
-- have access to them as well
data CgLoc
= CmmLoc CmmExpr -- A stable CmmExpr; that is, one not mentioning
-- Hp, so that it remains valid across calls
| LneLoc BlockId [LocalReg] -- A join point
-- A join point (= let-no-escape) should only
-- be tail-called, and in a saturated way.
-- To tail-call it, assign to these locals,
-- and branch to the block id
instance Outputable CgLoc where
ppr (CmmLoc e) = ptext (sLit "cmm") <+> ppr e
ppr (LneLoc b rs) = ptext (sLit "lne") <+> ppr b <+> ppr rs
type SelfLoopInfo = (Id, BlockId, [LocalReg])
-- used by ticky profiling
isKnownFun :: LambdaFormInfo -> Bool
isKnownFun (LFReEntrant _ _ _ _) = True
isKnownFun LFLetNoEscape = True
isKnownFun _ = False
-----------------------------------------------------------------------------
-- Representations
-----------------------------------------------------------------------------
......@@ -465,49 +498,65 @@ When black-holing, single-entry closures could also be entered via node
(rather than directly) to catch double-entry. -}
data CallMethod
= EnterIt -- No args, not a function
= EnterIt -- No args, not a function
| JumpToIt -- A join point
| JumpToIt BlockId [LocalReg] -- A join point or a header of a local loop
| ReturnIt -- It's a value (function, unboxed value,
| ReturnIt -- It's a value (function, unboxed value,
-- or constructor), so just return it.
| SlowCall -- Unknown fun, or known fun with
-- too few args.
| DirectEntry -- Jump directly, with args in regs
CLabel -- The code label
RepArity -- Its arity
CLabel -- The code label
RepArity -- Its arity
getCallMethod :: DynFlags
-> Name -- Function being applied
-> CafInfo -- Can it refer to CAF's?
-> LambdaFormInfo -- Its info
-> RepArity -- Number of available arguments
-> Id -- Function Id used to check if it can refer to
-- CAF's and whether the function is tail-calling
-- itself
-> LambdaFormInfo -- Its info
-> RepArity -- Number of available arguments
-> CgLoc -- Passed in from cgIdApp so that we can
-- handle let-no-escape bindings and self-recursive
-- tail calls using the same data constructor,
-- JumpToIt. This saves us one case branch in
-- cgIdApp
-> Maybe SelfLoopInfo -- can we perform a self-recursive tail call?
-> CallMethod
getCallMethod dflags _name _ lf_info _n_args
getCallMethod _ _ id _ n_args _cg_loc (Just (self_loop_id, block_id, args))
| id == self_loop_id, n_args == length args
-- If these patterns match then we know that:
-- * the function is performing a self-recursive call in a tail position
-- * the number of passed arguments is equal to the function's arity.
-- See Note [Self-recursive tail calls] in StgCmmExpr for more details
= JumpToIt block_id args
getCallMethod dflags _name _ lf_info _n_args _cg_loc _self_loop_info
| nodeMustPointToIt dflags lf_info && gopt Opt_Parallel dflags
= -- If we're parallel, then we must always enter via node.
= -- If we're parallel, then we must always enter via node.
-- The reason is that the closure may have been
-- fetched since we allocated it.
EnterIt
getCallMethod dflags name caf (LFReEntrant _ arity _ _) n_args
getCallMethod dflags name id (LFReEntrant _ arity _ _) n_args _cg_loc _self_loop_info
| n_args == 0 = ASSERT( arity /= 0 )
ReturnIt -- No args at all
| n_args < arity = SlowCall -- Not enough args
| otherwise = DirectEntry (enterIdLabel dflags name caf) arity
| otherwise = DirectEntry (enterIdLabel dflags name (idCafInfo id)) arity
getCallMethod _ _name _ LFUnLifted n_args
getCallMethod _ _name _ LFUnLifted n_args _cg_loc _self_loop_info
= ASSERT( n_args == 0 ) ReturnIt
getCallMethod _ _name _ (LFCon _) n_args
getCallMethod _ _name _ (LFCon _) n_args _cg_loc _self_loop_info
= ASSERT( n_args == 0 ) ReturnIt
getCallMethod dflags name caf (LFThunk _ _ updatable std_form_info is_fun) n_args
| is_fun -- it *might* be a function, so we must "call" it (which is always safe)
= SlowCall -- We cannot just enter it [in eval/apply, the entry code
getCallMethod dflags name id (LFThunk _ _ updatable std_form_info is_fun) n_args _cg_loc _self_loop_info
| is_fun -- it *might* be a function, so we must "call" it (which is always safe)
= SlowCall -- We cannot just enter it [in eval/apply, the entry code
-- is the fast-entry code]
-- Since is_fun is False, we are *definitely* looking at a data value
......@@ -527,27 +576,24 @@ getCallMethod dflags name caf (LFThunk _ _ updatable std_form_info is_fun) n_arg
| otherwise -- Jump direct to code for single-entry thunks
= ASSERT( n_args == 0 )
DirectEntry (thunkEntryLabel dflags name caf std_form_info updatable) 0
DirectEntry (thunkEntryLabel dflags name (idCafInfo id) std_form_info updatable) 0
getCallMethod _ _name _ (LFUnknown True) _n_args
getCallMethod _ _name _ (LFUnknown True) _n_args _cg_loc _self_loop_info
= SlowCall -- might be a function
getCallMethod _ name _ (LFUnknown False) n_args
getCallMethod _ name _ (LFUnknown False) n_args _cg_loc _self_loop_info
= ASSERT2( n_args == 0, ppr name <+> ppr n_args )
EnterIt -- Not a function
getCallMethod _ _name _ LFBlackHole _n_args
= SlowCall -- Presumably the black hole has by now
getCallMethod _ _name _ LFBlackHole _n_args _cg_loc _self_loop_info
= SlowCall -- Presumably the black hole has by now
-- been updated, but we don't know with
-- what, so we slow call it
getCallMethod _ _name _ LFLetNoEscape _n_args
= JumpToIt
getCallMethod _ _name _ LFLetNoEscape _n_args (LneLoc blk_id lne_regs) _self_loop_info
= JumpToIt blk_id lne_regs
isKnownFun :: LambdaFormInfo -> Bool
isKnownFun (LFReEntrant _ _ _ _) = True
isKnownFun LFLetNoEscape = True
isKnownFun _ = False
getCallMethod _ _ _ _ _ _ _ = panic "Unknown call method"
-----------------------------------------------------------------------------
-- staticClosureRequired
......
......@@ -160,7 +160,7 @@ cgLetNoEscapeClosure bndr cc_slot _unused_cc args body
return ( lneIdInfo dflags bndr args
, code )
where
code = forkProc $ do {
code = forkLneBody $ do {
; withNewTickyCounterLNE (idName bndr) args $ do
; restoreCurrentCostCentre cc_slot
; arg_regs <- bindArgsToRegs args
......@@ -632,19 +632,20 @@ cgConApp con stg_args
cgIdApp :: Id -> [StgArg] -> FCode ReturnKind
cgIdApp fun_id [] | isVoidId fun_id = emitReturn []
cgIdApp fun_id args = do
dflags <- getDynFlags
fun_info <- getCgIdInfo fun_id
dflags <- getDynFlags
fun_info <- getCgIdInfo fun_id
self_loop_info <- getSelfLoop
let cg_fun_id = cg_id fun_info
-- NB: use (cg_id fun_info) instead of fun_id, because
-- the former may be externalised for -split-objs.
-- See Note [Externalise when splitting] in StgCmmMonad
fun_arg = StgVarArg cg_fun_id
fun_name = idName cg_fun_id
fun = idInfoToAmode fun_info
lf_info = cg_lf fun_info
fun_name = idName cg_fun_id
fun = idInfoToAmode fun_info
lf_info = cg_lf fun_info
node_points dflags = nodeMustPointToIt dflags lf_info
case (getCallMethod dflags fun_name (idCafInfo cg_fun_id) lf_info (length args)) of
case (getCallMethod dflags fun_name cg_fun_id lf_info (length args) (cg_loc fun_info) self_loop_info) of
-- A value in WHNF, so we can just return it.
ReturnIt -> emitReturn [fun] -- ToDo: does ReturnIt guarantee tagged?
......@@ -664,14 +665,87 @@ cgIdApp fun_id args = do
then directCall NativeNodeCall lbl arity (fun_arg:args)
else directCall NativeDirectCall lbl arity args }
-- Let-no-escape call
JumpToIt -> let (LneLoc blk_id lne_regs) = cg_loc fun_info
in do
{ adjustHpBackwards -- always do this before a tail-call
; cmm_args <- getNonVoidArgAmodes args
; emitMultiAssign lne_regs cmm_args
; emit (mkBranch blk_id)
; return AssignedDirectly }
-- Let-no-escape call or self-recursive tail-call
JumpToIt blk_id lne_regs -> do
{ adjustHpBackwards -- always do this before a tail-call
; cmm_args <- getNonVoidArgAmodes args
; emitMultiAssign lne_regs cmm_args
; emit (mkBranch blk_id)
; return AssignedDirectly }
-- Note [Self-recursive tail calls]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- Self-recursive tail calls can be optimized into a local jump in the same
-- way as let-no-escape bindings (see Note [What is a non-escaping let] in
-- stgSyn/CoreToStg.lhs). Consider this:
--
-- foo.info:
-- a = R1 // calling convention
-- b = R2
-- goto L1
-- L1: ...
-- ...
-- ...
-- L2: R1 = x
-- R2 = y
-- call foo(R1,R2)
--
-- Instead of putting x and y into registers (or other locations required by the
-- calling convention) and performing a call, we can put them into local
-- variables a and b and perform a jump to L1:
--
-- foo.info:
-- a = R1
-- b = R2
-- goto L1
-- L1: ...
-- ...
-- ...
-- L2: a = x
-- b = y
-- goto L1
--
-- This can be done only when a function is calling itself in a tail position
-- and only if the call passes a number of arguments equal to the function's
-- arity. Note that this cannot be performed if a function calls itself with a
-- continuation.
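--
-- As a concrete (hypothetical) source-level sketch of the qualifying case:
--
--   sumAcc :: Int -> [Int] -> Int
--   sumAcc acc []     = acc
--   sumAcc acc (x:xs) = sumAcc (acc + x) xs
--
-- The recursive call to sumAcc is in a tail position and passes exactly as
-- many arguments as the function's arity, so it can be compiled as a local
-- jump. A call such as 'x + sumAcc acc xs' is not in a tail position and
-- keeps the ordinary calling convention.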
--
-- This in fact implements the optimization known as "loopification". It was
-- described in "Low-level code optimizations in the Glasgow Haskell Compiler"
-- by Krzysztof Woś, though we use a different approach. Krzysztof performed his
-- optimization at the Cmm level, whereas we perform ours during code generation
-- (the Stg-to-Cmm pass), essentially making sure that optimized Cmm code is
-- generated in the first place.
--
-- The implementation is spread across a couple of places in the code:
--
-- * The FCode monad stores additional information in its reader environment
-- (the cgd_self_loop field). This information tells us which function can
-- tail call itself in an optimized way (it is the function currently
-- being compiled), what the label of the loop header is (L1 in the example
-- above), and in which local registers we should place the arguments
-- before making the call (a and b in the example above).
--
-- * Whenever we are compiling a function, we set that information to reflect
-- the fact that the function currently being compiled can be jumped to, instead
-- of called. We also have to emit a label to which we will be jumping. Both
-- things are done in closureCodeBody in StgCmmBind.
--
-- * When we begin compiling another closure we remove the additional
-- information from the environment. This is done by forkClosureBody
-- in StgCmmMonad. Other functions that duplicate the environment -
-- forkLneBody, forkAlts, codeOnly - duplicate that information. In other
-- words, we only need to clean the environment of the self-loop information
-- when compiling the right-hand side of a closure (binding).
--
-- * When compiling a call (cgIdApp) we use getCallMethod to decide what kind
-- of call will be generated. getCallMethod decides to generate a self-recursive
-- tail call when (a) the environment stores information about a possible self
-- tail-call; (b) that tail call is to the function currently being compiled;
-- (c) the number of passed arguments is equal to the function's arity.
emitEnter :: CmmExpr -> FCode ReturnKind
emitEnter fun = do
......
......@@ -26,7 +26,7 @@ module StgCmmMonad (
mkCmmIfThenElse, mkCmmIfThen, mkCmmIfGoto,
mkCall, mkCmmCall,
forkClosureBody, forkAlts, forkProc, codeOnly,
forkClosureBody, forkLneBody, forkAlts, codeOnly,
ConTagZ,
......@@ -44,10 +44,10 @@ module StgCmmMonad (
getModuleName,
-- ideally we wouldn't export these, but some other modules access internal state
getState, setState, getInfoDown, getDynFlags, getThisPackage,
getState, setState, getSelfLoop, withSelfLoop, getInfoDown, getDynFlags, getThisPackage,
-- more localised access to monad state
CgIdInfo(..), CgLoc(..),
CgIdInfo(..),
getBinds, setBinds,
-- out of general friendliness, we also export ...
......@@ -60,6 +60,7 @@ import Cmm
import StgCmmClosure
import DynFlags
import Hoopl
import Maybes
import MkGraph
import BlockId
import CLabel
......@@ -100,11 +101,10 @@ infixr 9 `thenFC`
-- - A reader monad, for CgInfoDownwards, containing
-- - DynFlags,
-- - the current Module
-- - the static top-level environment
-- - the update-frame offset
-- - the ticky counter label
-- - the Sequel (the continuation to return to)
-- - the self-recursive tail call information
--------------------------------------------------------
......@@ -169,11 +169,15 @@ fixC fcode = FCode (
data CgInfoDownwards -- information only passed *downwards* by the monad
= MkCgInfoDown {
cgd_dflags :: DynFlags,
cgd_mod :: Module, -- Module being compiled
cgd_updfr_off :: UpdFrameOffset, -- Size of current update frame
cgd_ticky :: CLabel, -- Current destination for ticky counts
cgd_sequel :: Sequel -- What to do at end of basic block
cgd_dflags :: DynFlags,
cgd_mod :: Module, -- Module being compiled
cgd_updfr_off :: UpdFrameOffset, -- Size of current update frame
cgd_ticky :: CLabel, -- Current destination for ticky counts
cgd_sequel :: Sequel, -- What to do at end of basic block
cgd_self_loop :: Maybe SelfLoopInfo -- Which tail calls can be compiled
-- as local jumps? See Note
-- [Self-recursive tail calls] in
-- StgCmmExpr
}
type CgBindings = IdEnv CgIdInfo
......@@ -195,25 +199,10 @@ data CgIdInfo
-- use the externalised one in any C label we use which refers to this
-- name.
data CgLoc
= CmmLoc CmmExpr -- A stable CmmExpr; that is, one not mentioning
-- Hp, so that it remains valid across calls
| LneLoc BlockId [LocalReg] -- A join point
-- A join point (= let-no-escape) should only
-- be tail-called, and in a saturated way.
-- To tail-call it, assign to these locals,
-- and branch to the block id
instance Outputable CgIdInfo where
ppr (CgIdInfo { cg_id = id, cg_loc = loc })
= ppr id <+> ptext (sLit "-->") <+> ppr loc
instance Outputable CgLoc where
ppr (CmmLoc e) = ptext (sLit "cmm") <+> ppr e
ppr (LneLoc b rs) = ptext (sLit "lne") <+> ppr b <+> ppr rs
-- Sequel tells what to do with the result of this expression
data Sequel
= Return Bool -- Return result(s) to continuation found on the stack.
......@@ -308,7 +297,8 @@ initCgInfoDown dflags mod
, cgd_mod = mod
, cgd_updfr_off = initUpdFrameOff dflags
, cgd_ticky = mkTopTickyCtrLabel
, cgd_sequel = initSequel }
, cgd_sequel = initSequel
, cgd_self_loop = Nothing }
initSequel :: Sequel
initSequel = Return False
......@@ -455,6 +445,16 @@ newUnique = do
getInfoDown :: FCode CgInfoDownwards
getInfoDown = FCode $ \info_down state -> (# info_down,state #)
getSelfLoop :: FCode (Maybe SelfLoopInfo)
getSelfLoop = do
info_down <- getInfoDown
return $ cgd_self_loop info_down
withSelfLoop :: SelfLoopInfo -> FCode a -> FCode a
withSelfLoop self_loop code = do
info_down <- getInfoDown
withInfoDown code (info_down {cgd_self_loop = Just self_loop})
instance HasDynFlags FCode where
getDynFlags = liftM cgd_dflags getInfoDown
......@@ -481,7 +481,7 @@ getModuleName = do { info <- getInfoDown; return (cgd_mod info) }
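-- Note (added for clarity, not in the original patch): withSequel also
-- forgets the self-loop information, since a self-recursive call made under
-- a different continuation is not a direct tail call of the function entry
-- and so must not be compiled as a local jump. See Note
-- [Self-recursive tail calls] in StgCmmExpr.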
withSequel :: Sequel -> FCode a -> FCode a
withSequel sequel code
= do { info <- getInfoDown
; withInfoDown code (info {cgd_sequel = sequel }) }
; withInfoDown code (info {cgd_sequel = sequel, cgd_self_loop = Nothing }) }
getSequel :: FCode Sequel
getSequel = do { info <- getInfoDown
......@@ -526,15 +526,12 @@ setTickyCtrLabel ticky code = do
--------------------------------------------------------
forkClosureBody :: FCode () -> FCode ()
-- forkClosureBody takes a code, $c$, and compiles it in a
-- fresh environment, except that:
-- - compilation info and statics are passed in unchanged.
-- - local bindings are passed in unchanged
-- (it's up to the enclosed code to re-bind the
-- free variables to a field of the closure)
--
-- The current state is passed on completely unaltered, except that
-- C-- from the fork is incorporated.
-- forkClosureBody compiles body_code in an environment where:
-- - the sequel, update frame offset and self-loop info are
-- set to fresh values
-- - state is set to a fresh value, except for local bindings
-- that are passed in unchanged. It's up to the enclosed code to
-- re-bind the free variables to a field of the closure.
forkClosureBody body_code
= do { dflags <- getDynFlags
......@@ -542,26 +539,25 @@ forkClosureBody body_code
; us <- newUniqSupply
; state <- getState
; let body_info_down = info { cgd_sequel = initSequel
, cgd_updfr_off = initUpdFrameOff dflags }
, cgd_updfr_off = initUpdFrameOff dflags
, cgd_self_loop = Nothing }
fork_state_in = (initCgState us) { cgs_binds = cgs_binds state }
((),fork_state_out) = doFCode body_code body_info_down fork_state_in
; setState $ state `addCodeBlocksFrom` fork_state_out }
forkProc :: FCode a -> FCode a
-- 'forkProc' takes a code and compiles it in the *current* environment,
-- returning the graph thus constructed.
forkLneBody :: FCode a -> FCode a
-- 'forkLneBody' takes the body of a let-no-escape binding and compiles
-- it in the *current* environment, returning the graph thus constructed.
--
-- The current environment is passed on completely unchanged to
-- the successor. In particular, any heap usage from the enclosed
-- code is discarded; it should deal with its own heap consumption.
-- forkProc is used to compile let-no-escape bindings.
forkProc body_code
forkLneBody body_code
= do { info_down <- getInfoDown
; us <- newUniqSupply
; state <- getState
; let info_down' = info_down -- { cgd_sequel = initSequel }
fork_state_in = (initCgState us) { cgs_binds = cgs_binds state }
(result, fork_state_out) = doFCode body_code info_down' fork_state_in
; let fork_state_in = (initCgState us) { cgs_binds = cgs_binds state }
(result, fork_state_out) = doFCode body_code info_down fork_state_in
; setState $ state `addCodeBlocksFrom` fork_state_out
; return result }
......