CoreUnfold.hs 59.3 KB
Newer Older
Austin Seipp's avatar
Austin Seipp committed
1 2 3 4
{-
(c) The University of Glasgow 2006
(c) The AQUA Project, Glasgow University, 1994-1998

Simon Marlow's avatar
Simon Marlow committed
5 6

Core-syntax unfoldings
7 8 9 10 11 12 13 14 15

Unfoldings (which can travel across module boundaries) are in Core
syntax (namely @CoreExpr@s).

The type @Unfolding@ sits ``above'' simply-Core-expressions
unfoldings, capturing ``higher-level'' things we know about a binding,
usually things that the simplifier found out (e.g., ``it's a
literal'').  In the corner of a @CoreUnfolding@ unfolding, you will
find, unsurprisingly, a Core expression.
Austin Seipp's avatar
Austin Seipp committed
16
-}
17

18
{-# LANGUAGE CPP #-}
Ian Lynagh's avatar
Ian Lynagh committed
19

20
module CoreUnfold (
21
        Unfolding, UnfoldingGuidance,   -- Abstract types
22

23
        noUnfolding, mkImplicitUnfolding,
24
        mkUnfolding, mkCoreUnfolding,
25 26 27
        mkTopUnfolding, mkSimpleUnfolding, mkWorkerUnfolding,
        mkInlineUnfolding, mkInlinableUnfolding, mkWwInlineRule,
        mkCompulsoryUnfolding, mkDFunUnfolding,
Simon Peyton Jones's avatar
Simon Peyton Jones committed
28
        specUnfolding,
29

30
        interestingArg, ArgSummary(..),
31

32 33
        couldBeSmallEnoughToInline, inlineBoringOk,
        certainlyWillInline, smallEnoughToInline,
34

35
        callSiteInline, CallCtxt(..),
36

37 38
        -- Reexport from CoreSubst (it only live there so it can be used
        -- by the Very Simple Optimiser)
39
        exprIsConApp_maybe, exprIsLiteral_maybe
40 41
    ) where

42 43
#include "HsVersions.h"

Simon Marlow's avatar
Simon Marlow committed
44
import DynFlags
45
import CoreSyn
46
import PprCore          ()      -- Instances
47
import OccurAnal        ( occurAnalyseExpr )
48
import CoreSubst hiding( substTy )
49
import CoreArity       ( manifestArity, exprBotStrictness_maybe )
Simon Marlow's avatar
Simon Marlow committed
50 51 52 53 54 55
import CoreUtils
import Id
import DataCon
import Literal
import PrimOp
import IdInfo
56
import BasicTypes       ( Arity )
57
import Type
Simon Marlow's avatar
Simon Marlow committed
58
import PrelNames
59
import TysPrim          ( realWorldStatePrimTy )
60
import Bag
61
import Util
62
import FastTypes
63
import FastString
64
import Outputable
65 66
import ForeignCall

67
import qualified Data.ByteString as BS
68
import Data.Maybe
69

Austin Seipp's avatar
Austin Seipp committed
70 71 72
{-
************************************************************************
*                                                                      *
73
\subsection{Making unfoldings}
Austin Seipp's avatar
Austin Seipp committed
74 75 76
*                                                                      *
************************************************************************
-}
77

78 79
mkTopUnfolding :: DynFlags -> Bool -> CoreExpr -> Unfolding
-- Ordinary (InlineRhs) unfolding with the top-level flag set.
-- The Bool is passed on to mkUnfolding as its is_bottoming flag.
mkTopUnfolding dflags is_bottoming rhs
  = mkUnfolding dflags InlineRhs True {- Top level -} is_bottoming rhs
80

81
mkImplicitUnfolding :: DynFlags -> CoreExpr -> Unfolding
-- For implicit Ids, do a tiny bit of optimising first (simpleOptExpr),
-- then build a top-level unfolding with the bottoming flag off
mkImplicitUnfolding dflags expr
    = mkTopUnfolding dflags not_bottoming optimised_rhs
  where
    not_bottoming = False
    optimised_rhs = simpleOptExpr expr
Simon Marlow's avatar
Simon Marlow committed
85

86 87 88 89 90
-- Note [Top-level flag on inline rules]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- Slight hack: note that mkInlineUnfolding (like the other builders of
-- stable unfoldings in this module) conservatively sets the top-level
-- flag to True.  It gets set more accurately by the simplifier; see
-- Simplify.simplUnfolding.
91

92 93
mkSimpleUnfolding :: DynFlags -> CoreExpr -> Unfolding
-- Ordinary (InlineRhs) unfolding with both the top-level flag and the
-- bottoming flag switched off
mkSimpleUnfolding dflags rhs
  = mkUnfolding dflags InlineRhs not_top_lvl not_bottoming rhs
  where
    not_top_lvl   = False
    not_bottoming = False
94

95
mkDFunUnfolding :: [Var] -> DataCon -> [CoreExpr] -> Unfolding
-- Build a DFunUnfolding, occurrence-analysing each argument expression
-- See Note [Occurrence analysis of unfoldings]
mkDFunUnfolding bndrs con ops
  = DFunUnfolding { df_bndrs = bndrs
                  , df_con   = con
                  , df_args  = analysed_ops }
  where
    analysed_ops = [ occurAnalyseExpr op | op <- ops ]
Simon Marlow's avatar
Simon Marlow committed
101

102 103
mkWwInlineRule :: CoreExpr -> Arity -> Unfolding
-- Stable (InlineStable) unfolding with the given arity.  The guidance
-- allows unsaturated inlining but not inlining in a boring context.
-- The template is passed through simpleOptExpr first.
mkWwInlineRule expr arity
  = mkCoreUnfolding InlineStable True tmpl guide
  where
    tmpl  = simpleOptExpr expr
    guide = UnfWhen { ug_arity     = arity
                    , ug_unsat_ok  = unSaturatedOk
                    , ug_boring_ok = boringCxtNotOk }
108

twanvl's avatar
twanvl committed
109
mkCompulsoryUnfolding :: CoreExpr -> Unfolding
-- Used for things that absolutely must be unfolded
mkCompulsoryUnfolding expr
  = mkCoreUnfolding InlineCompulsory True tmpl always_inline
  where
    tmpl          = simpleOptExpr expr
    always_inline = UnfWhen { ug_arity     = 0    -- Arity of unfolding doesn't matter
                            , ug_unsat_ok  = unSaturatedOk
                            , ug_boring_ok = boringCxtOk }
115

116 117 118 119 120 121 122 123 124 125 126 127 128
mkWorkerUnfolding :: DynFlags -> (CoreExpr -> CoreExpr) -> Unfolding -> Unfolding
-- See Note [Worker-wrapper for INLINABLE functions] in WorkWrap
-- A stable CoreUnfolding is carried over: its template is transformed by
-- work_fn, tidied by simpleOptExpr, and given freshly calculated guidance.
-- Any other unfolding yields noUnfolding.
mkWorkerUnfolding dflags work_fn unf
  = case unf of
      CoreUnfolding { uf_src = src, uf_tmpl = tmpl, uf_is_top = top_lvl }
        | isStableSource src
        -> let worker_tmpl = simpleOptExpr (work_fn tmpl)
               worker_guid = calcUnfoldingGuidance dflags worker_tmpl
           in mkCoreUnfolding src top_lvl worker_tmpl worker_guid
      _ -> noUnfolding

129
mkInlineUnfolding :: Maybe Arity -> CoreExpr -> Unfolding
-- Stable (InlineStable) unfolding with UnfWhen guidance:
--   Nothing    => arity is the manifest arity of the optimised rhs,
--                 and unsaturated inlining is allowed
--   Just arity => the given arity is used, and saturation is required
mkInlineUnfolding mb_arity expr
  = mkCoreUnfolding InlineStable
                    True         -- Note [Top-level flag on inline rules]
                    tmpl guide
  where
    tmpl = simpleOptExpr expr

    guide = UnfWhen { ug_arity     = arity
                    , ug_unsat_ok  = unsat_ok
                    , ug_boring_ok = inlineBoringOk tmpl }

    (arity, unsat_ok) = case mb_arity of
                          Nothing -> (manifestArity tmpl, unSaturatedOk)
                          Just n  -> (n,                  needSaturated)
144

145 146 147
mkInlinableUnfolding :: DynFlags -> CoreExpr -> Unfolding
-- Stable (InlineStable) unfolding whose guidance is calculated by
-- mkUnfolding.  The rhs is optimised with simpleOptExpr first; it counts
-- as bottoming when exprBotStrictness_maybe returns a Just.
mkInlinableUnfolding dflags expr
  = let tmpl         = simpleOptExpr expr
        is_bottoming = isJust (exprBotStrictness_maybe tmpl)
    in mkUnfolding dflags InlineStable True is_bottoming tmpl
Simon Peyton Jones's avatar
Simon Peyton Jones committed
151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185

specUnfolding :: DynFlags -> Subst -> [Var] -> [CoreExpr] -> Unfolding -> Unfolding
-- See Note [Specialising unfoldings]
-- Specialise an unfolding by applying it to spec_args and abstracting
-- over new_bndrs.  Three cases:
--   * a DFunUnfolding stays a DFunUnfolding
--   * a stable CoreUnfolding with UnfWhen guidance is kept, with its
--     arity adjusted
--   * anything else becomes noUnfolding
specUnfolding _ subst new_bndrs spec_args
              df@(DFunUnfolding { df_bndrs = bndrs, df_con = con , df_args = args })
  = ASSERT2( length bndrs >= length spec_args, ppr df $$ ppr spec_args $$ ppr new_bndrs )
    mkDFunUnfolding (new_bndrs ++ extra_bndrs) con
                    (map (substExpr spec_doc subst2) args)
  where
    -- Map the leading binders to the specialising arguments ...
    subst1 = extendSubstList subst (bndrs `zip` spec_args)
    -- ... and push the substitution through the binders that are left
    -- over once spec_args have been consumed
    (subst2, extra_bndrs) = substBndrs subst1 (dropList spec_args bndrs)

specUnfolding _dflags subst new_bndrs spec_args
              (CoreUnfolding { uf_src = src, uf_tmpl = tmpl
                             , uf_is_top = top_lvl
                             , uf_guidance = old_guidance })
 | isStableSource src  -- See Note [Specialising unfoldings]
 , UnfWhen { ug_arity = old_arity
           , ug_unsat_ok = unsat_ok
           , ug_boring_ok = boring_ok } <- old_guidance
 = let -- New arity: value arguments in spec_args are consumed,
       -- Id binders in new_bndrs are added
       guidance = UnfWhen { ug_arity = old_arity - count isValArg spec_args
                                     + count isId new_bndrs
                          , ug_unsat_ok = unsat_ok
                          , ug_boring_ok = boring_ok }
       new_tmpl = simpleOptExpr $ mkLams new_bndrs $
                  mkApps (substExpr spec_doc subst tmpl) spec_args
                   -- The beta-redexes created here are simplified
                   -- away by the simpleOptExpr call just above

   in mkCoreUnfolding src top_lvl new_tmpl guidance

-- Everything else (including stable unfoldings whose guidance is not UnfWhen)
specUnfolding _ _ _ _ _ = noUnfolding

spec_doc :: SDoc
-- Label handed to substExpr by specUnfolding
spec_doc = ptext $ sLit "specUnfolding"
186

Austin Seipp's avatar
Austin Seipp committed
187
{-
Simon Peyton Jones's avatar
Simon Peyton Jones committed
188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
Note [Specialising unfoldings]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
When we specialise a function for some given type-class arguments, we use
specUnfolding to specialise its unfolding.  Some important points:

* If the original function has a DFunUnfolding, the specialised one
  must do so too!  Otherwise we lose the magic rules that make it
  interact with ClassOps

* There is a bit of a hack for INLINABLE functions:
     f :: Ord a => ....
     f = <big-rhs>
     {-# INLINABLE f #-}
  Now if we specialise f, should the specialised version still have
  an INLINABLE pragma?  If it does, we'll capture a specialised copy
  of <big-rhs> as its unfolding, and that probably won't inline.  But
  if we don't, the specialised version of <big-rhs> might be small
  enough to inline at a call site. This happens with Control.Monad.liftM3,
  and can cause a lot more allocation as a result (nofib n-body shows this).

  Moreover, keeping the INLINABLE thing isn't much help, because
  the specialised function (probably) isn't overloaded any more.

  Conclusion: drop the INLINABLE pragma.  In practice what this means is:
     if a stable unfolding has UnfoldingGuidance of UnfWhen,
        we keep it (so the specialised thing too will always inline)
     if a stable unfolding has UnfoldingGuidance of UnfIfGoodArgs
        (which arises from INLINABLE), we discard it
Austin Seipp's avatar
Austin Seipp committed
216
-}
Simon Peyton Jones's avatar
Simon Peyton Jones committed
217

218
mkCoreUnfolding :: UnfoldingSource -> Bool -> CoreExpr
                -> UnfoldingGuidance -> Unfolding
-- Occurrence-analyses the expression before capturing it
-- The cached flags (uf_is_value and friends) are computed from the
-- expression as given, not from the occurrence-analysed one
mkCoreUnfolding src top_lvl expr guidance
  = CoreUnfolding
      { uf_src          = src
      , uf_is_top       = top_lvl
      , uf_tmpl         = occurAnalyseExpr expr
          -- See Note [Occurrence analysis of unfoldings]
      , uf_guidance     = guidance
      , uf_is_value     = exprIsHNF        expr
      , uf_is_conlike   = exprIsConLike    expr
      , uf_is_work_free = exprIsWorkFree   expr
      , uf_expandable   = exprIsExpandable expr
      }
231

232 233
mkUnfolding :: DynFlags -> UnfoldingSource -> Bool -> Bool -> CoreExpr
            -> Unfolding
-- Calculates unfolding guidance
-- Occurrence-analyses the expression before capturing it
--
-- Arguments, in order: flags, where the unfolding came from, whether the
-- binding is top-level, whether its rhs is bottoming, and the rhs itself.
-- A non-trivial, top-level, bottoming rhs gets NoUnfolding.
mkUnfolding dflags src top_lvl is_bottoming expr
  | top_lvl && is_bottoming
  , not (exprIsTrivial expr)
  = NoUnfolding    -- See Note [Do not inline top-level bottoming functions]
  | otherwise
  = mkCoreUnfolding src top_lvl expr guidance
        -- mkCoreUnfolding does the occurrence analysis and fills in the
        -- cached flags; building the record in one place only keeps the
        -- two constructors of CoreUnfolding values from drifting apart
  where
    guidance = calcUnfoldingGuidance dflags expr
        -- NB: *not* (calcUnfoldingGuidance (occurAnalyseExpr expr))!
        -- See Note [Calculate unfolding guidance on the non-occ-anal'd expression]
254

Austin Seipp's avatar
Austin Seipp committed
255
{-
256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273
Note [Occurrence analysis of unfoldings]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
We do occurrence-analysis of unfoldings once and for all, when the
unfolding is built, rather than each time we inline them.

But given this decision it's vital that we *always* do it.  Consider
this unfolding
    \x -> letrec { f = ...g...; g* = f } in body
where g* is (for some strange reason) the loop breaker.  If we don't
occ-anal it when reading it in, we won't mark g as a loop breaker, and
we may inline g entirely in body, dropping its binding, and leaving
the occurrence in f out of scope. This happened in Trac #8892, where
the unfolding in question was a DFun unfolding.

But more generally, the simplifier is designed on the
basis that it is looking at occurrence-analysed expressions, so better
ensure that they actually are.

274 275 276 277 278 279 280 281 282 283
Note [Calculate unfolding guidance on the non-occ-anal'd expression]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Notice that we give the non-occur-analysed expression to
calcUnfoldingGuidance.  In some ways it'd be better to occur-analyse
first; for example, sometimes during simplification, there's a large
let-bound thing which has been substituted, and so is now dead; so
'expr' contains two copies of the thing while the occurrence-analysed
expression doesn't.

Nevertheless, we *don't* and *must not* occ-analyse before computing
284
the size because
285 286 287 288

a) The size computation bales out after a while, whereas occurrence
   analysis does not.

289 290
b) Residency increases sharply if you occ-anal first.  I'm not
   100% sure why, but it's a large effect.  Compiling Cabal went
291 292 293 294 295 296 297
   from residency of 534M to over 800M with this one change.

This can occasionally mean that the guidance is very pessimistic;
it gets fixed up next round.  And it should be rare, because large
let-bound things that are dead are usually caught by preInlineUnconditionally


Austin Seipp's avatar
Austin Seipp committed
298 299
************************************************************************
*                                                                      *
300
\subsection{The UnfoldingGuidance type}
Austin Seipp's avatar
Austin Seipp committed
301 302 303
*                                                                      *
************************************************************************
-}
304

305 306
inlineBoringOk :: CoreExpr -> Bool
-- See Note [INLINE for small functions]
-- True => the result of inlining the expression is
--         no bigger than the expression itself
--     eg      (\x y -> f y x)
-- This is a quick and dirty version. It doesn't attempt
-- to deal with  (\x y z -> x (y z))
-- The really important one is (x `cast` c)
inlineBoringOk e
  = go 0 e
  where
    -- The Int is a credit: one per value lambda seen so far, spent
    -- one at a time on trivial value arguments
    go :: Int -> CoreExpr -> Bool
    go credit expr
      = case expr of
          Lam bndr body
            | isId bndr              -> go (credit+1) body
            | otherwise              -> go credit body
          App fun (Type {})          -> go credit fun
          App fun arg
            | credit > 0
            , exprIsTrivial arg      -> go (credit-1) fun
            -- when the guards fail we fall through to the catch-all
          Tick _ inner               -> go credit inner -- dubious
          Cast inner _               -> go credit inner
          Var {}                     -> boringCxtOk
          _                          -> boringCxtNotOk
326

327
calcUnfoldingGuidance
        :: DynFlags
        -> CoreExpr    -- Expression to look at
        -> UnfoldingGuidance
-- Sizes up the body of the expression (beneath its leading binders)
-- and picks the guidance:
--   TooBig                      => UnfNever
--   uncondInline says yes       => UnfWhen
--   otherwise                   => UnfIfGoodArgs, one discount per value binder
calcUnfoldingGuidance dflags expr
  = case sizeExpr dflags (iUnbox size_limit) val_bndrs body of
      TooBig -> UnfNever
      SizeIs size cased_bndrs scrut_discount
        | uncondInline expr n_val_bndrs (iBox size)
        -> UnfWhen { ug_arity     = n_val_bndrs   -- Note [INLINE for small functions]
                   , ug_unsat_ok  = unSaturatedOk
                   , ug_boring_ok = boringCxtOk }
        | otherwise
        -> UnfIfGoodArgs { ug_args = [ mk_discount cased_bndrs b | b <- val_bndrs ]
                         , ug_size = iBox size
                         , ug_res  = iBox scrut_discount }

  where
    (bndrs, body) = collectBinders expr
    val_bndrs     = filter isId bndrs
    n_val_bndrs   = length val_bndrs

    size_limit = ufCreationThreshold dflags
           -- Bomb out if size gets bigger than this

    -- Fold together all the discounts recorded against one binder
    mk_discount :: Bag (Id,Int) -> Id -> Int
    mk_discount cbs bndr = foldlBag step 0 cbs
           where
             step acc (other, disc)
               = if bndr == other then acc `merge` disc else acc

             -- See Note [Function and non-function discounts]
             merge :: Int -> Int -> Int
             merge = if isFunTy (idType bndr) then max else (+)
362

Austin Seipp's avatar
Austin Seipp committed
363
{-
364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381
Note [Computing the size of an expression]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The basic idea of sizeExpr is obvious enough: count nodes.  But getting the
heuristics right has taken a long time.  Here's the basic strategy:

    * Variables, literals: 0
      (Exception for string literals, see litSize.)

    * Function applications (f e1 .. en): 1 + #value args

    * Constructor applications: 1, regardless of #args

    * Let(rec): 1 + size of components

    * Note, cast: 0

Examples

382
  Size  Term
383
  --------------
384 385
    0     42#
    0     x
386
    0     True
387 388 389
    2     f x
    1     Just x
    4     f (g x)
390 391

Notice that 'x' counts 0, while (f x) counts 2.  That's deliberate: there's
392
a function call to account for.  Notice also that constructor applications
393 394
are very cheap, because exposing them to a caller is so valuable.

395 396 397 398
[25/5/11] All sizes are now multiplied by 10, except for primops
(which have sizes like 1 or 4).  This makes primops look fantastically
cheap, and seems to be almost universally beneficial.  Done partly as a
result of #4978.
399 400 401

Note [Do not inline top-level bottoming functions]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
402
The FloatOut pass has gone to some trouble to float out calls to 'error'
403 404 405 406
and similar friends.  See Note [Bottoming floats] in SetLevels.
Do not re-inline them!  But we *do* still inline if they are very small
(the uncondInline stuff).

407 408
Note [INLINE for small functions]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
409
Consider        {-# INLINE f #-}
410 411 412 413 414 415 416 417 418
                f x = Just x
                g y = f y
Then f's RHS is no larger than its LHS, so we should inline it into
even the most boring context.  In general, if the function is
sufficiently small that its body is as small as the call itself, then
inline unconditionally, regardless of how boring the context is.

Things to note:

419 420
(1) We inline *unconditionally* if inlined thing is smaller (using sizeExpr)
    than the thing it's replacing.  Notice that
421 422 423 424 425
      (f x) --> (g 3)             -- YES, unconditionally
      (f x) --> x : []            -- YES, *even though* there are two
                                  --      arguments to the cons
      x     --> g 3               -- NO
      x     --> Just v            -- NO
426

427 428 429 430 431 432 433 434 435 436 437
    It's very important not to unconditionally replace a variable by
    a non-atomic term.

(2) We do this even if the thing isn't saturated, else we end up with the
    silly situation that
       f x y = x
       ...map (f 3)...
    doesn't inline.  Even in a boring context, inlining without being
    saturated will give a lambda instead of a PAP, and will be more
    efficient at runtime.

Simon Peyton Jones's avatar
Simon Peyton Jones committed
438
(3) However, when the function's arity > 0, we do insist that it
439 440 441 442 443 444 445 446 447 448 449 450 451 452 453
    has at least one value argument at the call site.  (This check is
    made in the UnfWhen case of callSiteInline.) Otherwise we find this:
         f = /\a \x:a. x
         d = /\b. MkD (f b)
    If we inline f here we get
         d = /\b. MkD (\x:b. x)
    and then prepareRhs floats out the argument, abstracting the type
    variables, so we end up with the original again!

(4) We must be much more cautious about arity-zero things. Consider
       let x = y +# z in ...
    In *size* terms primops look very small, because they generate a
    single instruction, but we do not want to unconditionally replace
    every occurrence of x with (y +# z).  So we only do the
    unconditional-inline thing for *trivial* expressions.
Simon Peyton Jones's avatar
Simon Peyton Jones committed
454

455 456 457
    NB: you might think that PostInlineUnconditionally would do this
    but it doesn't fire for top-level things; see SimplUtils
    Note [Top level and postInlineUnconditionally]
Austin Seipp's avatar
Austin Seipp committed
458
-}
459

460
uncondInline :: CoreExpr -> Arity -> Int -> Bool
-- Inline unconditionally if there is no size increase
-- Size of call is arity (+1 for the function), in units of 10
-- See Note [INLINE for small functions]
uncondInline rhs arity size
  = if arity > 0
    then size <= call_size      -- See Note [INLINE for small functions] (1)
    else exprIsTrivial rhs      -- See Note [INLINE for small functions] (4)
  where
    call_size = 10 * (arity + 1)
467

468
sizeExpr :: DynFlags
         -> FastInt         -- Bomb out if it gets bigger than this
         -> [Id]            -- Arguments; we're interested in which of these
                            -- get case'd
         -> CoreExpr
         -> ExprSize

-- Note [Computing the size of an expression]
-- The result is either TooBig, or a SizeIs carrying the size, the bag of
-- per-argument discounts, and the result discount.  Sizes are in units
-- of 10 (primops excepted); every sum goes through mkSizeIs together
-- with bOMB_OUT_SIZE.

sizeExpr dflags bOMB_OUT_SIZE top_args expr
  = size_up expr
  where
    size_up (Cast e _) = size_up e
    size_up (Tick _ e) = size_up e
    size_up (Type _)   = sizeZero           -- Types cost nothing
    size_up (Coercion _) = sizeZero
    size_up (Lit lit)  = sizeN (litSize lit)
    size_up (Var f) | isRealWorldId f = sizeZero
                      -- Make sure we get constructor discounts even
                      -- on nullary constructors
                    | otherwise       = size_up_call f [] 0

    size_up (App fun arg)
      | isTyCoArg arg = size_up fun
      | otherwise     = size_up arg  `addSizeNSD`
                        size_up_app fun [arg] (if isRealWorldExpr arg then 1 else 0)

    size_up (Lam b e)
      | isId b && not (isRealWorldId b) = lamScrutDiscount dflags (size_up e `addSizeN` 10)
      | otherwise = size_up e

    size_up (Let (NonRec binder rhs) body)
      = size_up rhs             `addSizeNSD`
        size_up body            `addSizeN`
        (if isUnLiftedType (idType binder) then 0 else 10)
                -- For the allocation
                -- If the binder has an unlifted type there is no allocation

    size_up (Let (Rec pairs) body)
      = foldr (addSizeNSD . size_up . snd)
              (size_up body `addSizeN` (10 * length pairs))     -- (length pairs) for the allocation
              pairs

    size_up (Case (Var v) _ _ alts)
        | v `elem` top_args             -- We are scrutinising an argument variable
        = alts_size (foldr addAltSize sizeZero alt_sizes)
                    (foldr maxSize    sizeZero alt_sizes)
                -- Good to inline if an arg is scrutinised, because
                -- that may eliminate allocation in the caller
                -- And it eliminates the case itself
        where
          alt_sizes = map size_up_alt alts

                -- alts_size tries to compute a good discount for
                -- the case when we are scrutinising an argument variable
          alts_size (SizeIs tot tot_disc tot_scrut)  -- Size of all alternatives
                    (SizeIs max _        _)          -- Size of biggest alternative
                = SizeIs tot (unitBag (v, iBox (_ILIT(20) +# tot -# max)) `unionBags` tot_disc) tot_scrut
                        -- If the variable is known, we produce a discount that
                        -- will take us back to 'max', the size of the largest alternative
                        -- The extra 20 is a little discount for reduced allocation in the caller
                        --
                        -- Notice though, that we return tot_disc, the total discount from
                        -- all branches.  I think that's right.

          alts_size tot_size _ = tot_size

    size_up (Case e _ _ alts) = size_up e  `addSizeNSD`
                                foldr (addAltSize . size_up_alt) case_size alts
      where
          case_size
           | is_inline_scrut e, not (lengthExceeds alts 1)  = sizeN (-10)
           | otherwise = sizeZero
                -- Normally we don't charge for the case itself, but
                -- we charge 10 per alternative (see size_up_alt,
                -- below) to account for the cost of the info table
                -- and comparisons.
                --
                -- However, in certain cases (see is_inline_scrut
                -- below), no code is generated for the case unless
                -- there are multiple alts.  In these cases we
                -- subtract 10 (the charge for one alternative),
                -- making the first alt free.
                -- e.g. case x# +# y# of _ -> ...   should cost 1
                --      case touch# x# of _ -> ...  should cost 0
                -- (see #4978)
                --
                -- I would like to not have the "not (lengthExceeds alts 1)"
                -- condition above, but without that some programs got worse
                -- (spectral/hartel/event and spectral/para).  I don't fully
                -- understand why. (SDM 24/5/11)

                -- unboxed variables, inline primops and unsafe foreign calls
                -- are all "inline" things:
          is_inline_scrut (Var v) = isUnLiftedType (idType v)
          is_inline_scrut scrut
              | (Var f, _) <- collectArgs scrut
                = case idDetails f of
                    FCallId fc  -> not (isSafeForeignCall fc)
                    PrimOpId op -> not (primOpOutOfLine op)
                    _other      -> False
              | otherwise
                = False

    ------------
    -- size_up_app is used when there's ONE OR MORE value args
    size_up_app (App fun arg) args voids
        | isTyCoArg arg                  = size_up_app fun args voids
        | isRealWorldExpr arg            = size_up_app fun (arg:args) (voids + 1)
        | otherwise                      = size_up arg  `addSizeNSD`
                                           size_up_app fun (arg:args) voids
    size_up_app (Var fun)     args voids = size_up_call fun args voids
    size_up_app other         args voids = size_up other `addSizeN`
                                           (10 * (1 + length args - voids))
        -- The head is neither an App nor a Var, so charge for the head
        -- plus a complete call, in the same units of 10 that funSize
        -- uses: 10 for the call itself and 10 per non-void value arg.
        -- (Charging just (length args - voids) made such calls look
        -- about ten times cheaper than a call to a named function.)

    ------------
    size_up_call :: Id -> [CoreExpr] -> Int -> ExprSize
    size_up_call fun val_args voids
       = case idDetails fun of
           FCallId _        -> sizeN (10 * (1 + length val_args))
           DataConWorkId dc -> conSize    dc (length val_args)
           PrimOpId op      -> primOpSize op (length val_args)
           ClassOpId _      -> classOpSize dflags top_args val_args
           _                -> funSize dflags top_args fun (length val_args) voids

    ------------
    size_up_alt (_con, _bndrs, rhs) = size_up rhs `addSizeN` 10
        -- Don't charge for args, so that wrappers look cheap
        -- (See comments about wrappers with Case)
        --
        -- IMPORTANT: *do* charge 10 for the alternative, else we
        -- find that giant case nests are treated as practically free
        -- A good example is Foreign.C.Error.errnoToIOError

    ------------
        -- These addSize things have to be here because
        -- I don't want to give them bOMB_OUT_SIZE as an argument
    addSizeN TooBig          _  = TooBig
    addSizeN (SizeIs n xs d) m  = mkSizeIs bOMB_OUT_SIZE (n +# iUnbox m) xs d

        -- addAltSize is used to add the sizes of case alternatives
    addAltSize TooBig            _      = TooBig
    addAltSize _                 TooBig = TooBig
    addAltSize (SizeIs n1 xs d1) (SizeIs n2 ys d2)
        = mkSizeIs bOMB_OUT_SIZE (n1 +# n2)
                                 (xs `unionBags` ys)
                                 (d1 +# d2)   -- Note [addAltSize result discounts]

        -- This variant ignores the result discount from its LEFT argument
        -- It's used when the second argument isn't part of the result
    addSizeNSD TooBig            _      = TooBig
    addSizeNSD _                 TooBig = TooBig
    addSizeNSD (SizeIs n1 xs _) (SizeIs n2 ys d2)
        = mkSizeIs bOMB_OUT_SIZE (n1 +# n2)
                                 (xs `unionBags` ys)
                                 d2  -- Ignore d1

    isRealWorldId id = idType id `eqType` realWorldStatePrimTy

    -- an expression of type State# RealWorld must be a variable
    isRealWorldExpr (Var id) = isRealWorldId id
    isRealWorldExpr _        = False
628

629 630 631
-- | Finds a nominal size of a literal.
litSize :: Literal -> Int
-- Used by CoreUnfold.sizeExpr
litSize lit
  = case lit of
      LitInteger {} -> 100   -- Note [Size of literal integers]
      MachStr str   -> 10 + 10 * ((BS.length str + 3) `div` 4)
        -- If size could be 0 then @f "x"@ might be too small
        -- [Sept03: make literal strings a bit bigger to avoid fruitless
        --  duplication of little strings]
      _other        -> 0    -- Must match size of nullary constructors
                            -- Key point: if  x |-> 4, then x must inline unconditionally
                            --            (eg via case binding)
640

641
classOpSize :: DynFlags -> [Id] -> [CoreExpr] -> ExprSize
-- See Note [Conlike is interesting]
-- With no value arguments the size is zero; otherwise it is 20 plus
-- 10 for each argument after the first
classOpSize dflags top_args val_args
  = case val_args of
      []                -> sizeZero
      (dict_arg : rest) -> SizeIs (iUnbox (20 + (10 * length rest)))
                                  (dict_discount dict_arg)
                                  (_ILIT(0))
  where
    -- If the class op is scrutinising a lambda bound dictionary then
    -- give it a discount, to encourage the inlining of this function
    -- The actual discount is rather arbitrarily chosen
    dict_discount (Var dict)
      | dict `elem` top_args = unitBag (dict, ufDictDiscount dflags)
    dict_discount _          = emptyBag

657
funSize :: DynFlags -> [Id] -> Id -> Int -> Int -> ExprSize
-- Size for functions that are not constructors or primops
-- Note [Function applications]
funSize dflags top_args fun n_val_args voids
  | fun `hasKey` buildIdKey   = buildSize
  | fun `hasKey` augmentIdKey = augmentSize
  | otherwise
  = SizeIs (iUnbox app_size) fun_discount (iUnbox pap_discount)
  where
    has_val_args = n_val_args > 0

    -- The 1+ is for the function itself
    -- Add 1 for each non-trivial arg;
    -- the allocation cost, as in let(rec)
    app_size = if has_val_args
                 then 10 * (1 + n_val_args - voids)
                 else 0

    --                  DISCOUNTS
    --  See Note [Function and non-function discounts]

    -- If the function is an argument and is applied
    -- to some values, give it an arg-discount
    fun_discount = if has_val_args && fun `elem` top_args
                     then unitBag (fun, ufFunAppDiscount dflags)
                     else emptyBag

    -- If the function is partially applied, show a result discount
    pap_discount = if idArity fun > n_val_args
                     then ufFunAppDiscount dflags
                     else 0

685 686
-- | Size of a data constructor applied to @n_val_args@ value arguments.
conSize :: DataCon -> Int -> ExprSize
conSize dc n_val_args
  -- Nullary constructor: like variables
  | n_val_args == 0      = SizeIs (_ILIT(0))  emptyBag (_ILIT(10))

  -- See Note [Unboxed tuple size and result discount]
  | isUnboxedTupleCon dc = SizeIs (_ILIT(0))  emptyBag (iUnbox scrut_discount)

  -- See Note [Constructor size and result discount]
  | otherwise            = SizeIs (_ILIT(10)) emptyBag (iUnbox scrut_discount)
  where
    -- Result discount shared by the two non-nullary cases
    scrut_discount = 10 * (1 + n_val_args)
simonpj@microsoft.com's avatar
simonpj@microsoft.com committed
694

Austin Seipp's avatar
Austin Seipp committed
695
{-
696 697 698 699 700 701 702 703 704 705 706 707 708 709 710
Note [Constructor size and result discount]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Treat a constructor application as size 10, regardless of how many
arguments it has; we are keen to expose them (and we charge separately
for their args).  We can't treat them as size zero, else we find that
(Just x) has size 0, which is the same as a lone variable; and hence
'v' will always be replaced by (Just x), where v is bound to Just x.

The "result discount" is applied if the result of the call is
scrutinised (say by a case).  For a constructor application that will
mean the constructor application will disappear, so we don't need to
charge it to the function.  So the discount should at least match the
cost of the constructor application, namely 10.  But to give a bit
of extra incentive we give a discount of 10*(1 + n_val_args).

711
Simon M tried a MUCH bigger discount: (10 * (10 + n_val_args)),
712 713 714 715 716 717 718
and said it was an "unambiguous win", but it's terribly dangerous
because a function with many many case branches, each finishing with
a constructor, can have an arbitrarily large discount.  This led to
terrible code bloat: see Trac #6099.

Note [Unboxed tuple size and result discount]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
719 720
However, unboxed tuples count as size zero. I found occasions where we had
        f x y z = case op# x y z of { s -> (# s, () #) }
721 722 723 724 725 726 727 728 729 730 731 732 733
and f wasn't getting inlined.

I tried giving unboxed tuples a *result discount* of zero (see the
commented-out line).  Why?  When returned as a result they do not
allocate, so maybe we don't want to charge so much for them.  If you
have a non-zero discount here, we find that workers often get inlined
back into wrappers, because it looks like
    f x = case $wf x of (# a,b #) -> (a,b)
and we are keener because of the case.  However while this change
shrank binary sizes by 0.5% it also made spectral/boyer allocate 5%
more. All other changes were very small. So it's not a big deal but I
didn't adopt the idea.

734 735
Note [Function and non-function discounts]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
736 737 738 739 740
We want a discount if the function is applied. A good example is
monadic combinators with continuation arguments, where inlining is
quite important.

But we don't want a big discount when a function is called many times
741
(see the detailed comments with Trac #6048) because if the function is
742 743 744 745
big it won't be inlined at its many call sites and no benefit results.
Indeed, we can get exponentially big inlinings this way; that is what
Trac #6048 is about.

746 747 748 749 750 751 752 753 754
On the other hand, for data-valued arguments, if there are lots of
case expressions in the body, each one will get smaller if we apply
the function to a constructor application, so we *want* a big discount
if the argument is scrutinised by many case expressions.

Conclusion:
  - For functions, take the max of the discounts
  - For data values, take the sum of the discounts

755

756 757 758 759 760 761
Note [Literal integer size]
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Literal integers *can* be big (mkInteger [...coefficients...]), but
need not be (S# n).  We just use an arbitrary big-ish constant here
so that, in particular, we don't inline top-level defns like
   n = S# 5
Gabor Greif's avatar
Gabor Greif committed
762
There's no point in doing so -- any optimisations will see the S#
763 764 765
through n's unfolding.  Nor will a big size inhibit unfolding of functions
that mention a literal Integer, because the float-out pass will float
all those constants to top level.
Austin Seipp's avatar
Austin Seipp committed
766
-}
767

twanvl's avatar
twanvl committed
768
-- | Size of a primop application.  Out-of-line primops are charged
-- their code size plus one per value argument; all other primops are
-- charged the code size alone.
primOpSize :: PrimOp -> Int -> ExprSize
primOpSize op n_val_args
  | primOpOutOfLine op = sizeN (code_size + n_val_args)
  | otherwise          = sizeN code_size
  where
    code_size = primOpCodeSize op
775

776

twanvl's avatar
twanvl committed
777
-- | Size charged for an application of 'build': zero size, no
-- argument discounts, and a result discount of 40.
buildSize :: ExprSize
buildSize = SizeIs (_ILIT(0)) emptyBag (_ILIT(40))
        -- We really want to inline applications of build
        -- build t (\cn -> e) should cost only the cost of e (because build will be inlined later)
        -- Indeed, we should add a result_discount because build is
        -- very like a constructor.  We don't bother to check that the
        -- build is saturated (it usually is).
        -- NOTE(review): an older version of this comment referred to a "-2"
        -- (for the \c n) and a "4"; neither appears in the code any more.
        -- The result discount of 40 is presumably the arbitrary constant
        -- the "4" referred to -- confirm.
785

twanvl's avatar
twanvl committed
786
-- | Size charged for an application of 'augment': zero size, no
-- argument discounts, and a result discount of 40.
augmentSize :: ExprSize
augmentSize = SizeIs (_ILIT(0)) emptyBag (_ILIT(40))
        -- Like build: (augment t (\cn -> e) ys) should cost only the
        -- cost of e plus ys.
        -- NOTE(review): an older version of this comment mentioned a "-2"
        -- accounting for the \cn; no such term appears in the code.
twanvl's avatar
twanvl committed
790

791
-- | When we return a lambda, give a discount if it's used (applied):
-- the result discount is replaced by 'ufFunAppDiscount'.  'TooBig' is
-- left alone.
lamScrutDiscount :: DynFlags -> ExprSize -> ExprSize
lamScrutDiscount dflags size = case size of
  TooBig        -> TooBig
  SizeIs n vs _ -> SizeIs n vs (iUnbox (ufFunAppDiscount dflags))
795

Austin Seipp's avatar
Austin Seipp committed
796
{-
797 798 799 800 801
Note [addAltSize result discounts]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
When adding the size of alternatives, we *add* the result discounts
too, rather than take the *maximum*.  For a multi-branch case, this
gives a discount for each branch that returns a constructor, making us
802
keener to inline.  I did try using 'max' instead, but it makes nofib
803 804 805
'rewrite' and 'puzzle' allocate significantly more, and didn't make
binary sizes shrink significantly either.

806 807
Note [Discounts and thresholds]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
808 809
Constants for discounts and thresholds are defined in main/DynFlags,
all of form ufXxxx.   They are:
810

811
ufCreationThreshold
812 813 814
     At a definition site, if the unfolding is bigger than this, we
     may discard it altogether

815
ufUseThreshold
816 817 818
     At a call site, if the unfolding, less discounts, is smaller than
     this, then it's small enough to inline

819
ufKeenessFactor
820
     Factor by which the discounts are multiplied before
821 822
     subtracting from size

823
ufDictDiscount
824 825 826 827
     The discount for each occurrence of a dictionary argument
     as an argument of a class method.  Should be pretty small
     else big functions may get inlined

828
ufFunAppDiscount
829 830 831
     Discount for a function argument that is applied.  Quite
     large, because if we inline we avoid the higher-order call.

832
ufDearOp
833 834
     The size of a foreign call or not-dupable PrimOp

835

836 837 838 839
Note [Function applications]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In a function application (f a b)

840
  - If 'f' is an argument to the function being analysed,
841 842 843 844
    and there's at least one value arg, record a FunAppDiscount for f

  - If the application is a PAP (arity > 2 in this example)
    record a *result* discount (because inlining
845
    with "extra" args in the call may mean that we now
846 847 848
    get a saturated application)

Code for manipulating sizes
Austin Seipp's avatar
Austin Seipp committed
849
-}
850 851

-- | The result of sizing an expression: either it is too big to be
-- worth considering, or we have a size together with its discounts.
data ExprSize
  = TooBig
  | SizeIs
      FastInt          -- Size found
      !(Bag (Id,Int))  -- Arguments cased herein, and discount for each such
      FastInt          -- Size to subtract if result is scrutinised
                       -- by a case expression
856 857 858 859 860 861 862

-- Prints "TooBig", or the size and result discount in brackets; the
-- per-argument discounts are not shown.
instance Outputable ExprSize where
  ppr size = case size of
    TooBig               -> ptext (sLit "TooBig")
    SizeIs n _ res_disc  -> brackets (int (iBox n) <+> int (iBox res_disc))

-- | Smart constructor for 'SizeIs' that bales out to 'TooBig' when the
-- size, less the result discount, exceeds the given limit.
--
-- The discount is subtracted before deciding whether to bale out. eg. we
-- want to inline a large constructor application into a selector:
--      tup = (a_1, ..., a_99)
--      x = case tup of ...
--
mkSizeIs :: FastInt -> FastInt -> Bag (Id, Int) -> FastInt -> ExprSize
mkSizeIs limit size arg_discounts res_discount
  = if (size -# res_discount) ># limit
      then TooBig
      else SizeIs size arg_discounts res_discount

870
-- | The larger of two sizes, compared on the size field only.
-- 'TooBig' on either side wins; on a tie the second argument is returned.
maxSize :: ExprSize -> ExprSize -> ExprSize
maxSize lhs rhs = case (lhs, rhs) of
  (TooBig, _) -> TooBig
  (_, TooBig) -> TooBig
  (SizeIs size_l _ _, SizeIs size_r _ _)
    -> if size_l ># size_r then lhs else rhs
875

876
-- | Zero size, no argument discounts, no result discount.
sizeZero :: ExprSize
sizeZero = SizeIs (_ILIT(0)) emptyBag (_ILIT(0))

-- | A size of @n@ with no argument discounts and no result discount.
sizeN :: Int -> ExprSize
sizeN n = SizeIs (iUnbox n) emptyBag (_ILIT(0))

Austin Seipp's avatar
Austin Seipp committed
882 883 884
{-
************************************************************************
*                                                                      *
885
\subsection[considerUnfolding]{Given all the info, do (not) do the unfolding}
Austin Seipp's avatar
Austin Seipp committed
886 887
*                                                                      *
************************************************************************
888

889 890 891 892
We use 'couldBeSmallEnoughToInline' to avoid exporting inlinings that
we ``couldn't possibly use'' on the other side.  Can be overridden w/
flaggery.  Just the same as smallEnoughToInline, except that it has no
actual arguments.
Austin Seipp's avatar
Austin Seipp committed
893
-}
894

895
-- | Sizes the body of @rhs@ (its leading binders stripped) against
-- @threshold@, with no arguments of interest; the answer is False
-- exactly when 'sizeExpr' reports 'TooBig'.
couldBeSmallEnoughToInline :: DynFlags -> Int -> CoreExpr -> Bool
couldBeSmallEnoughToInline dflags threshold rhs
  | TooBig <- sizeExpr dflags (iUnbox threshold) [] rhs_body = False
  | otherwise                                                = True
  where
    rhs_body = snd (collectBinders rhs)
902

903
----------------
904 905 906 907
-- | True only for a 'CoreUnfolding' whose guidance is 'UnfIfGoodArgs'
-- with a recorded size no greater than 'ufUseThreshold'.
smallEnoughToInline :: DynFlags -> Unfolding -> Bool
smallEnoughToInline dflags unf
  | CoreUnfolding { uf_guidance = UnfIfGoodArgs { ug_size = size } } <- unf
  = size <= ufUseThreshold dflags
  | otherwise
  = False
909 910

----------------
911 912 913 914
certainlyWillInline :: DynFlags -> Unfolding -> Maybe Unfolding
-- Sees if the unfolding is pretty certain to inline
-- If so, return a *stable* unfolding for it, that will always inline
certainlyWillInline dflags unf
  = case unf of
      CoreUnfolding { uf_guidance = guidance, uf_tmpl = expr }
                       -> from_guidance guidance expr
      DFunUnfolding {} -> Just unf
      _                -> Nothing
  where
    -- Decide on the basis of the guidance of a CoreUnfolding
    from_guidance UnfNever     _ = Nothing
    from_guidance (UnfWhen {}) _ = Just (unf { uf_src = InlineStable })

    -- The UnfIfGoodArgs case seems important.  If we w/w small functions
    -- binary sizes go up by 10%!  (This is with SplitObjs.)  I'm not totally
    -- sure why.
    from_guidance (UnfIfGoodArgs { ug_size = size, ug_args = args }) expr
      | not (null args)  -- See Note [certainlyWillInline: be careful of thunks]
      , size - (10 * (arity + 1)) <= ufUseThreshold dflags
      = Just (unf { uf_src      = InlineStable
                  , uf_guidance = UnfWhen { ug_arity     = arity
                                          , ug_unsat_ok  = unSaturatedOk
                                          , ug_boring_ok = inlineBoringOk expr } })
            -- Note the "unsaturatedOk". A function like  f = \ab. a
            -- will certainly inline, even if partially applied (f e), so we'd
            -- better make sure that the transformed inlining has the same property
      where
        arity = length args

    from_guidance _ _ = Nothing
941

Austin Seipp's avatar
Austin Seipp committed
942
{-
943
Note [certainlyWillInline: be careful of thunks]
944 945 946 947 948 949
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Don't claim that thunks will certainly inline, because that risks work
duplication.  Even if the work duplication is not great (eg is_cheap
holds), it can make a big difference in an inner loop.  In Trac #5623 we
found that the WorkWrap phase thought that
       y = case x of F# v -> F# (v +# v)
950
was certainlyWillInline, so the addition got duplicated.
951 952


Austin Seipp's avatar
Austin Seipp committed
953 954
************************************************************************
*                                                                      *
955
\subsection{callSiteInline}
Austin Seipp's avatar
Austin Seipp committed
956 957
*                                                                      *
************************************************************************
958 959 960 961 962 963

This is the key function.  It decides whether to inline a variable at a call site

callSiteInline is used at call sites, so it is a bit more generous.
It's a very important function that embodies lots of heuristics.
A non-WHNF can be inlined if it doesn't occur inside a lambda,
964
and occurs exactly once or
965 966
    occurs once in each branch of a case and is small

967
If the thing is in WHNF, there's no danger of duplicating work,
968 969
so we can inline if it occurs once, or is small

970
NOTE: we don't want to inline top-level functions that always diverge.
971