{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE FlexibleContexts #-}

{-# OPTIONS_GHC -fno-warn-warnings-deprecations #-}

-- TODO: Get rid of this flag:
{-# OPTIONS_GHC -fno-warn-incomplete-patterns #-}

-- This module implements generalized code motion for assignments to
-- local registers, inlining and sinking when possible.  It also does
-- some amount of rewriting for stores to register slots, which are
-- effectively equivalent to local registers.
module CmmRewriteAssignments
  ( rewriteAssignments
  ) where

import Cmm
import CmmUtils
import CmmOpt
import OptimizationFuel
import StgCmmUtils

import Control.Monad
import Platform
import UniqFM
import Unique
import BlockId

import Hoopl
import Data.Maybe
import Prelude hiding (succ, zip)

----------------------------------------------------------------
--- Main function

rewriteAssignments :: Platform -> CmmGraph -> FuelUniqSM CmmGraph
rewriteAssignments platform g = do
  -- Because we need to act on forwards and backwards information, we
  -- first perform usage analysis and bake this information into the
  -- graph (backwards transform), and then do a forwards transform
  -- to actually perform inlining and sinking.
  g'  <- annotateUsage g
  g'' <- liftM fst $ dataflowPassFwd g' [(g_entry g, fact_bot assignmentLattice)] $
                                     analRewFwd assignmentLattice
                                                assignmentTransfer
                                                (assignmentRewrite `thenFwdRw` machOpFoldRewrite platform)
  return (modifyGraph eraseRegUsage g'')

----------------------------------------------------------------
--- Usage information

-- We decorate all register assignments with approximate usage
-- information, that is, the maximum number of times the register is
-- referenced while it is live along all outgoing control paths.
-- This analysis provides a precise upper bound for usage, so if a
-- register is never referenced, we can remove it, as that assignment is
-- dead.
--
-- This analysis is very similar to liveness analysis; we just keep a
-- little extra info. (Maybe we should move it to CmmLive, and subsume
-- the old liveness analysis.)
--
-- There are a few subtleties here:
--
--  - If a register goes dead, and then becomes live again, the usages
--    of the disjoint live range don't count towards the original range.
--
--          a = 1; // used once
--          b = a;
--          a = 2; // used once
--          c = a;
--
--  - A register may be used multiple times, but these all reside in
--    different control paths, such that any given execution only uses
--    it once. In that case, the usage count may still be 1.
--
--          a = 1; // used once
--          if (b) {
--              c = a + 3;
--          } else {
--              c = a + 1;
--          }
--
--    This policy corresponds to an inlining strategy that does not
--    duplicate computation but may increase binary size.
--
--  - If we naively implement a usage count, we have a counting to
--    infinity problem across joins.  Furthermore, knowing that
--    something is used 2 or more times in one runtime execution isn't
--    particularly useful for optimizations (inlining may be beneficial,
--    but there's no way of knowing that without register pressure
--    information.)
--
--          while (...) {
--              // first iteration, b used once
--              // second iteration, b used twice
--              // third iteration ...
--              a = b;
--          }
--          // b used zero times
--
--    There is an orthogonal question, which is that for every runtime
--    execution, the register may be used only once, but if we inline it
--    in every conditional path, the binary size might increase a lot.
--    But tracking this information would be tricky, because it violates
--    the finite lattice restriction Hoopl requires for termination;
--    we'd thus need to supply an alternate proof, which is probably
--    something we should defer until we actually have an optimization
--    that would take advantage of this.  (This might also interact
--    strangely with liveness information.)
--
--          a = ...;
--          // a is used one time, but in X different paths
--          case (b) of
--              1 -> ... a ...
--              2 -> ... a ...
--              3 -> ... a ...
--              ...
--
--  - Memory stores to local register slots (CmmStore (CmmStackSlot
--    (LocalReg _) 0) _) have similar behavior to local registers,
--    in that these locations are all disjoint from each other.  Thus,
--    we attempt to inline them too. Note that because these are only
--    generated as part of the spilling process, most of the time this
--    will refer to a local register and the assignment will immediately
--    die on the subsequent call.  However, if we manage to replace that
--    local register with a memory location, it means that we've managed
--    to preserve a value on the stack without having to move it to
--    another memory location again!  We collect usage information just
--    to be safe in case extra computation is involved.

data RegUsage = SingleUse | ManyUse
    deriving (Ord, Eq, Show)
-- Absence in map = ZeroUse

{-
-- minBound is bottom, maxBound is top, least-upper-bound is max
-- ToDo: Put this in Hoopl.  Note that this isn't as useful as I
-- originally hoped, because you usually want to leave out the bottom
-- element when you have things like this put in maps.  Maybe f is
-- useful on its own as a combining function.
boundedOrdLattice :: (Bounded a, Ord a) => String -> DataflowLattice a
boundedOrdLattice n = DataflowLattice n minBound f
    where f _ (OldFact x) (NewFact y)
            | x >= y    = (NoChange,   x)
            | otherwise = (SomeChange, y)
-}

-- Custom node type we'll rewrite to.  CmmAssign nodes to local
-- registers are replaced with AssignLocal nodes.
data WithRegUsage n e x where
    -- Plain will not contain CmmAssign nodes immediately after
    -- transformation, but as we rewrite assignments, we may have
    -- assignments here: these are assignments that should not be
    -- rewritten!
    Plain       :: n e x -> WithRegUsage n e x
    AssignLocal :: LocalReg -> CmmExpr -> RegUsage -> WithRegUsage n O O

instance UserOfLocalRegs (n e x) => UserOfLocalRegs (WithRegUsage n e x) where
    foldRegsUsed f z (Plain n) = foldRegsUsed f z n
    foldRegsUsed f z (AssignLocal _ e _) = foldRegsUsed f z e

instance DefinerOfLocalRegs (n e x) => DefinerOfLocalRegs (WithRegUsage n e x) where
    foldRegsDefd f z (Plain n) = foldRegsDefd f z n
    foldRegsDefd f z (AssignLocal r _ _) = foldRegsDefd f z r

instance NonLocal n => NonLocal (WithRegUsage n) where
    entryLabel (Plain n) = entryLabel n
    successors (Plain n) = successors n

liftRegUsage :: Graph n e x -> Graph (WithRegUsage n) e x
liftRegUsage = mapGraph Plain

eraseRegUsage :: Graph (WithRegUsage CmmNode) e x -> Graph CmmNode e x
eraseRegUsage = mapGraph f
    where f :: WithRegUsage CmmNode e x -> CmmNode e x
          f (AssignLocal l e _) = CmmAssign (CmmLocal l) e
          f (Plain n) = n

type UsageMap = UniqFM RegUsage

usageLattice :: DataflowLattice UsageMap
usageLattice = DataflowLattice "usage counts for registers" emptyUFM (joinUFM f)
    where f _ (OldFact x) (NewFact y)
            | x >= y    = (NoChange,   x)
            | otherwise = (SomeChange, y)

-- We reuse the names 'gen' and 'kill', although we're doing something
-- slightly different from the Dragon Book
usageTransfer :: BwdTransfer (WithRegUsage CmmNode) UsageMap
usageTransfer = mkBTransfer3 first middle last
    where first _ f = f
          middle :: WithRegUsage CmmNode O O -> UsageMap -> UsageMap
          middle n f = gen_kill n f
          last :: WithRegUsage CmmNode O C -> FactBase UsageMap -> UsageMap
          -- Checking for CmmCall/CmmForeignCall is unnecessary, because
          -- spills/reloads have already occurred by the time we do this
          -- analysis.
          -- XXX Deprecated warning is puzzling: what label are we
          -- supposed to use?
          -- ToDo: With a bit more cleverness here, we can avoid
          -- disappointment and heartbreak associated with the inability
          -- to inline into CmmCall and CmmForeignCall by
          -- over-estimating the usage to be ManyUse.
          last n f = gen_kill n (joinOutFacts usageLattice n f)
          gen_kill :: WithRegUsage CmmNode e x -> UsageMap -> UsageMap
          gen_kill a = gen a . kill a
          gen :: WithRegUsage CmmNode e x -> UsageMap -> UsageMap
          gen  a f = foldRegsUsed increaseUsage f a
          kill :: WithRegUsage CmmNode e x -> UsageMap -> UsageMap
          kill a f = foldRegsDefd delFromUFM f a
          increaseUsage f r = addToUFM_C combine f r SingleUse
            where combine _ _ = ManyUse
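
-- A worked example of increaseUsage (an illustrative sketch, not part
-- of the pass): the first recorded read of a register maps it to
-- SingleUse, and any later read saturates it at ManyUse, because
-- addToUFM_C only applies 'combine' when the key is already present:
--
--      increaseUsage emptyUFM r           ==>  [r |-> SingleUse]
--      increaseUsage [r |-> SingleUse] r  ==>  [r |-> ManyUse]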

usageRewrite :: BwdRewrite FuelUniqSM (WithRegUsage CmmNode) UsageMap
usageRewrite = mkBRewrite3 first middle last
    where first  _ _ = return Nothing
          middle :: Monad m => WithRegUsage CmmNode O O -> UsageMap -> m (Maybe (Graph (WithRegUsage CmmNode) O O))
          middle (Plain (CmmAssign (CmmLocal l) e)) f
                     = return . Just
                     $ case lookupUFM f l of
                            Nothing    -> emptyGraph
                            Just usage -> mkMiddle (AssignLocal l e usage)
          middle _ _ = return Nothing
          last   _ _ = return Nothing

type CmmGraphWithRegUsage = GenCmmGraph (WithRegUsage CmmNode)
annotateUsage :: CmmGraph -> FuelUniqSM (CmmGraphWithRegUsage)
annotateUsage vanilla_g =
    let g = modifyGraph liftRegUsage vanilla_g
    in liftM fst $ dataflowPassBwd g [(g_entry g, fact_bot usageLattice)] $
                                   analRewBwd usageLattice usageTransfer usageRewrite

----------------------------------------------------------------
--- Assignment tracking

-- The idea is to maintain a map from local registers to expressions,
-- such that the value of each register is the same as the value of its
-- expression at any given time.  We can then do several things,
-- as described by Assignment.

-- Assignment describes the various optimizations that are valid
-- at a given point in the program.
data Assignment =
-- This assignment can always be inlined.  It is cheap or single-use.
                  AlwaysInline CmmExpr
-- This assignment should be sunk down to its first use.  (This will
-- increase code size if the register is used in multiple control flow
-- paths, but won't increase execution time, and the reduction of
-- register pressure is worth it, I think.)
                | AlwaysSink CmmExpr
-- We cannot safely optimize occurrences of this local register. (This
-- corresponds to top in the lattice structure.)
                | NeverOptimize

-- Extract the expression that is being assigned
xassign :: Assignment -> Maybe CmmExpr
xassign (AlwaysInline e) = Just e
xassign (AlwaysSink e)   = Just e
xassign NeverOptimize    = Nothing

-- Extract both expressions, but only if they use the same constructor
xassign2 :: (Assignment, Assignment) -> Maybe (CmmExpr, CmmExpr)
xassign2 (AlwaysInline e, AlwaysInline e') = Just (e, e')
xassign2 (AlwaysSink e, AlwaysSink e')     = Just (e, e')
xassign2 _ = Nothing

-- Note: We'd like to make decisions about "not optimizing" as soon as
-- possible, because this will make running the transfer function more
-- efficient.
type AssignmentMap = UniqFM Assignment

assignmentLattice :: DataflowLattice AssignmentMap
assignmentLattice = DataflowLattice "assignments for registers" emptyUFM (joinUFM add)
    where add _ (OldFact old) (NewFact new)
            = case (old, new) of
                (NeverOptimize, _) -> (NoChange,   NeverOptimize)
                (_, NeverOptimize) -> (SomeChange, NeverOptimize)
                (xassign2 -> Just (e, e'))
                    | e == e'   -> (NoChange, old)
                    | otherwise -> (SomeChange, NeverOptimize)
                _ -> (SomeChange, NeverOptimize)
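
-- A worked example of the join (an illustrative sketch): facts from two
-- predecessors survive only if they agree on both the constructor and
-- the expression; anything else collapses to NeverOptimize (top):
--
--      add _ (OldFact (AlwaysInline e)) (NewFact (AlwaysInline e))
--          ==> (NoChange,   AlwaysInline e)
--      add _ (OldFact (AlwaysInline e)) (NewFact (AlwaysInline e'))   -- e /= e'
--          ==> (SomeChange, NeverOptimize)
--      add _ (OldFact (AlwaysSink e))   (NewFact (AlwaysInline e))    -- mixed
--          ==> (SomeChange, NeverOptimize)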

-- Deletes sinks from the assignment map, because /this/ is the place
-- where they will be sunk to.
deleteSinks :: UserOfLocalRegs n => n -> AssignmentMap -> AssignmentMap
deleteSinks n m = foldRegsUsed (adjustUFM f) m n
  where f (AlwaysSink _) = NeverOptimize
        f old = old

-- Invalidates any expressions that use a register.
invalidateUsersOf :: CmmReg -> AssignmentMap -> AssignmentMap
-- foldUFM_Directly :: (Unique -> elt -> a -> a) -> a -> UniqFM elt -> a
invalidateUsersOf reg m = foldUFM_Directly f m m -- [foldUFM performance]
    where f u (xassign -> Just e) m | reg `regUsedIn` e = addToUFM_Directly m u NeverOptimize
          f _ _ m = m
{- This requires the entire spine of the map to be continually rebuilt,
 - which causes crazy memory usage!
invalidateUsersOf reg = mapUFM (invalidateUsers' reg)
  where invalidateUsers' reg (xassign -> Just e) | reg `regUsedIn` e = NeverOptimize
        invalidateUsers' _ old = old
-}
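
-- For example (an illustrative sketch): after processing  x := e  with
-- the map
--      [a |-> AlwaysInline (x + 1), b |-> AlwaysSink (CmmLit ...)]
-- the entry for a is downgraded to NeverOptimize, because its cached
-- expression mentions x, while b's entry is left alone.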

-- Note [foldUFM performance]
-- These calls to fold UFM no longer leak memory, but they do cause
-- pretty killer amounts of allocation.  So they'll be something to
-- optimize; we need an algorithmic change to prevent us from having to
-- traverse the /entire/ map continually.

middleAssignment :: WithRegUsage CmmNode O O -> AssignmentMap -> AssignmentMap

-- Algorithm for annotated assignments:
--  1. Delete any sinking assignments that were used by this instruction
--  2. Add the assignment to our list of valid local assignments with
--     the correct optimization policy.
--  3. Look for all assignments that reference that register and
--     invalidate them.
middleAssignment n@(AssignLocal r e usage) assign
    = invalidateUsersOf (CmmLocal r) . add . deleteSinks n $ assign
      where add m = addToUFM m r
                  $ case usage of
                        SingleUse -> AlwaysInline e
                        ManyUse   -> decide e
            decide CmmLit{}       = AlwaysInline e
            decide CmmReg{}       = AlwaysInline e
            decide CmmLoad{}      = AlwaysSink e
            decide CmmStackSlot{} = AlwaysSink e
            decide CmmMachOp{}    = AlwaysSink e
            -- We'll always inline simple operations on the global
            -- registers, to reduce register pressure: Sp - 4 or Hp - 8
            -- EZY: Justify this optimization more carefully.
            decide CmmRegOff{}    = AlwaysInline e

-- Algorithm for unannotated assignments of global registers:
-- 1. Delete any sinking assignments that were used by this instruction
-- 2. Look for all assignments that reference this register and
--    invalidate them.
middleAssignment (Plain n@(CmmAssign reg@(CmmGlobal _) _)) assign
    = invalidateUsersOf reg . deleteSinks n $ assign

-- Algorithm for unannotated assignments of *local* registers: do
-- nothing (it's a reload, so no state should have changed)
middleAssignment (Plain (CmmAssign (CmmLocal _) _)) assign = assign

-- Algorithm for stores:
--  1. Delete any sinking assignments that were used by this instruction
--  2. Look for all assignments that load from memory locations that
--     were clobbered by this store and invalidate them.
middleAssignment (Plain n@(CmmStore lhs rhs)) assign
    = let m = deleteSinks n assign
      in foldUFM_Directly f m m -- [foldUFM performance]
      where f u (xassign -> Just x) m | (lhs, rhs) `clobbers` (u, x) = addToUFM_Directly m u NeverOptimize
            f _ _ m = m
{- Also leaky
    = mapUFM_Directly p . deleteSinks n $ assign
      -- ToDo: There's a missed opportunity here: even if a memory
      -- access we're attempting to sink gets clobbered at some
      -- location, it's still /better/ to sink it to right before the
      -- point where it gets clobbered.  How might we do this?
      -- Unfortunately, it's too late to change the assignment...
      where p r (xassign -> Just x) | (lhs, rhs) `clobbers` (r, x) = NeverOptimize
            p _ old = old
-}

-- Assumption: Unsafe foreign calls don't clobber memory
-- Since foreign calls clobber caller-saved registers, we need to
-- invalidate any assignments that reference those global registers.
-- This is kind of expensive. (One way to optimize this might be to
-- store extra information about expressions that allow this and other
-- checks to be done cheaply.)
middleAssignment (Plain n@(CmmUnsafeForeignCall{})) assign
    = deleteCallerSaves (foldRegsDefd (\m r -> addToUFM m r NeverOptimize) (deleteSinks n assign) n)
    where deleteCallerSaves m = foldUFM_Directly f m m
          f u (xassign -> Just x) m | wrapRecExpf g x False = addToUFM_Directly m u NeverOptimize
          f _ _ m = m
          g (CmmReg (CmmGlobal r)) _      | callerSaves r = True
          g (CmmRegOff (CmmGlobal r) _) _ | callerSaves r = True
          g _ b = b

middleAssignment (Plain (CmmComment {})) assign
    = assign

-- Assumptions:
--  * Writes using Hp do not overlap with any other memory locations
--    (An important invariant being relied on here is that we only ever
--    use Hp to allocate values on the heap, which appears to be the
--    case given hpReg usage, and that our heap writing code doesn't
--    do anything stupid like overlapping writes.)
--  * Stack slots do not overlap with any other memory locations
--  * Stack slots for different areas do not overlap
--  * Stack slots within the same area and different offsets may
--    overlap; we need to do a size check (see 'overlaps').
--  * Register slots only overlap with themselves.  (But this shouldn't
--    happen in practice, because we'll fail to inline a reload across
--    the next spill.)
--  * Non stack-slot stores always conflict with each other.  (This is
--    not always the case; we could probably do something special for Hp)
clobbers :: (CmmExpr, CmmExpr) -- (lhs, rhs) of clobbering CmmStore
         -> (Unique,  CmmExpr) -- (register, expression) that may be clobbered
         -> Bool
clobbers (CmmRegOff (CmmGlobal Hp) _, _) (_, _) = False
clobbers (CmmReg (CmmGlobal Hp), _) (_, _) = False
-- ToDo: Also catch MachOp case
clobbers (ss@CmmStackSlot{}, CmmReg (CmmLocal r)) (u, CmmLoad (ss'@CmmStackSlot{}) _)
    | getUnique r == u, ss == ss' = False -- No-op on the stack slot (XXX: Do we need this special case?)
clobbers (CmmStackSlot a o, rhs) (_, expr) = f expr
    where f (CmmLoad (CmmStackSlot a' o') t)
            = (a, o, widthInBytes (cmmExprWidth rhs)) `overlaps` (a', o', widthInBytes (typeWidth t))
          f (CmmLoad e _)    = containsStackSlot e
          f (CmmMachOp _ es) = or (map f es)
          f _                = False
          -- Maybe there's an invariant broken if this actually ever
          -- returns True
          containsStackSlot (CmmLoad{})      = True -- load of a load, all bets off
          containsStackSlot (CmmMachOp _ es) = or (map containsStackSlot es)
          containsStackSlot (CmmStackSlot{}) = True
          containsStackSlot _ = False
clobbers _ (_, e) = f e
    where f (CmmLoad (CmmStackSlot _ _) _) = False
          f (CmmLoad{}) = True -- conservative
          f (CmmMachOp _ es) = or (map f es)
          f _ = False

-- Check for memory overlapping.
-- Diagram:
--      4      8     12
--      s -w-  o
--      [ I32  ]
--      [    F64     ]
--      s'   -w'-    o'
type CallSubArea = (Area, Int, Int) -- area, offset, width
overlaps :: CallSubArea -> CallSubArea -> Bool
overlaps (a, _, _) (a', _, _) | a /= a' = False
overlaps (_, o, w) (_, o', w') =
    let s  = o  - w
        s' = o' - w'
    in (s' < o) && (s < o') -- Not LTE, because [ I32  ][ I32  ] is OK
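
-- A worked instance (an illustrative sketch), matching the diagram
-- above: a slot of width w ending at offset o occupies bytes [o-w, o),
-- so an I32 ending at offset 8 occupies [4,8) and an F64 ending at
-- offset 12 occupies [4,12):
--
--      overlaps (a, 8, 4) (a, 12, 8)  ==>  True   -- [4,8) meets [4,12)
--      overlaps (a, 8, 4) (a, 12, 4)  ==>  False  -- [4,8) abuts [8,12)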

lastAssignment :: WithRegUsage CmmNode O C -> AssignmentMap -> [(Label, AssignmentMap)]
lastAssignment (Plain (CmmCall _ (Just k) _ _ _)) assign = [(k, invalidateVolatile k assign)]
lastAssignment (Plain (CmmForeignCall {succ=k}))  assign = [(k, invalidateVolatile k assign)]
lastAssignment l assign = map (\id -> (id, deleteSinks l assign)) $ successors l

-- Invalidates any expressions that have volatile contents: essentially,
-- all terminals are volatile except for literals and loads of stack slots
-- that do not correspond to the call area for 'k' (the current call
-- area is volatile because overflow return parameters may be written
-- there.)
-- Note: mapUFM could be expensive, but hopefully block boundaries
-- aren't too common.  If it is a problem, replace with something more
-- clever.
invalidateVolatile :: BlockId -> AssignmentMap -> AssignmentMap
invalidateVolatile k m = mapUFM p m
  where p (AlwaysInline e) = if exp e then AlwaysInline e else NeverOptimize
            where exp CmmLit{} = True
                  exp (CmmLoad (CmmStackSlot (Young k') _) _)
                    | k' == k = False
                  exp (CmmLoad (CmmStackSlot _ _) _) = True
                  exp (CmmMachOp _ es) = and (map exp es)
                  exp _ = False
        p _ = NeverOptimize -- probably shouldn't happen with AlwaysSink
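
-- For example (an illustrative sketch): across a call returning to k,
-- a fact
--      r |-> AlwaysInline (CmmLit ...)
-- survives, while any fact whose expression reads a global register or
-- the Young k stack area is downgraded to NeverOptimize, because the
-- callee may overwrite both.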

assignmentTransfer :: FwdTransfer (WithRegUsage CmmNode) AssignmentMap
assignmentTransfer = mkFTransfer3 (flip const) middleAssignment ((mkFactBase assignmentLattice .) . lastAssignment)

-- Note [Soundness of inlining]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- In the Hoopl paper, the soundness condition on rewrite functions is
-- described as follows:
--
--      "If it replaces a node n by a replacement graph g, then g must
--      be observationally equivalent to n under the assumptions
--      expressed by the incoming dataflow fact f.  Moreover, analysis of
--      g must produce output fact(s) that are at least as informative
--      as the fact(s) produced by applying the transfer function to n."
--
-- We consider the second condition in more detail here.  It says that,
-- given the rewrite R(n, f) = g, for any incoming fact f' consistent
-- with f (f' >= f), running the transfer function gives T(f', n) <= T(f', g).
-- For inlining this is not necessarily the case:
--
--  n = "x = a + 2"
--  f = f' = {a = y}
--  g = "x = y + 2"
--  T(f', n) = {x = a + 2, a = y}
--  T(f', g) = {x = y + 2, a = y}
--
-- y + 2 and a + 2 are not obviously comparable, and a naive
-- implementation of the lattice would say they are incomparable.
-- At best, this means we may be over-conservative; at worst, it means
-- we may not terminate.
--
-- However, in the original Lerner-Grove-Chambers paper, soundness and
-- termination are separated, and only equivalence of facts is required
-- for soundness.  Monotonicity of the transfer function is not required
-- for termination (as the calculation of least-upper-bound prevents
-- this from being a problem), but it means we won't necessarily find
-- the least fixed point.

-- Note [Coherency of annotations]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- Is it possible for our usage annotations to become invalid after we
-- start performing transformations?  As the usage info only provides
-- an upper bound, we only need to consider cases where the usages of
-- a register may increase due to transformations--e.g. any reference
-- to a local register in an AlwaysInline or AlwaysSink instruction, whose
-- originating assignment was single use (we don't care about the
-- many use case, because it is the top of the lattice).  But such a
-- case is not possible, because we always inline any single use
-- register.  QED.
--
-- TODO: A useful lint option would be to check this invariant that
-- there is never a local register in the assignment map that is
-- single-use.

-- Note [Soundness of store rewriting]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- Its soundness depends on the invariant that no assignment is made to
-- the local register before its store is accessed.  This is clearly
-- true with unoptimized spill-reload code, and as the store will always
-- be rewritten first (if possible), there is no chance of it being
-- propagated down before getting written (possibly with incorrect
-- values from the assignment map, due to reassignment of the local
-- register.)  This is probably not locally sound.

assignmentRewrite :: FwdRewrite FuelUniqSM (WithRegUsage CmmNode) AssignmentMap
assignmentRewrite = mkFRewrite3 first middle last
    where
        first _ _ = return Nothing
        middle :: WithRegUsage CmmNode O O -> AssignmentMap -> GenCmmReplGraph (WithRegUsage CmmNode) O O
        middle (Plain m) assign = return $ rewrite assign (precompute assign m) mkMiddle m
        middle (AssignLocal l e u) assign = return $ rewriteLocal assign (precompute assign (CmmAssign (CmmLocal l) e)) l e u
        last (Plain l) assign = return $ rewrite assign (precompute assign l) mkLast l
        -- Tuple is (inline?, reloads for sinks)
        precompute :: AssignmentMap -> CmmNode O x -> (Bool, [WithRegUsage CmmNode O O])
        precompute assign n = foldRegsUsed f (False, []) n -- duplicates are harmless
            where f (i, l) r = case lookupUFM assign r of
                                Just (AlwaysSink e)   -> (i, (Plain (CmmAssign (CmmLocal r) e)):l)
                                Just (AlwaysInline _) -> (True, l)
                                Just NeverOptimize    -> (i, l)
                                -- This case can show up when we have
                                -- limited optimization fuel.
                                Nothing -> (i, l)
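        -- For example (an illustrative sketch): for a node reading r1
        -- and r2, with  r1 |-> AlwaysSink e1  and  r2 |-> AlwaysInline e2
        -- in the map, precompute yields
        --      (True, [Plain (CmmAssign (CmmLocal r1) e1)])
        -- i.e. inline r2 in place and materialize r1's assignment here.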
        rewrite :: AssignmentMap
                -> (Bool, [WithRegUsage CmmNode O O])
                -> (WithRegUsage CmmNode O x -> Graph (WithRegUsage CmmNode) O x)
                -> CmmNode O x
                -> Maybe (Graph (WithRegUsage CmmNode) O x)
        rewrite _ (False, []) _ _ = Nothing
        -- Note [CmmCall Inline Hack]
        -- Conservative hack: don't do any inlining on what will
        -- be translated into an OldCmm CmmCalls, since the code
        -- produced here tends to be unproblematic and I need to write
        -- lint passes to ensure that we don't put anything in the
        -- arguments that could be construed as a global register by
        -- some later translation pass.  (For example, slots will turn
        -- into dereferences of Sp).  See [Register parameter passing].
        -- ToDo: Fix this up to only bug out if all inlines were for
        -- CmmExprs with global registers (we can't use the
        -- straightforward mapExpDeep call, in this case.) ToDo: We miss
        -- an opportunity here, where all possible inlinings should
        -- instead be sunk.
        rewrite _ (True, []) _ n | not (inlinable n) = Nothing -- see [CmmCall Inline Hack]
        rewrite assign (i, xs) mk n = Just $ mkMiddles xs <*> mk (Plain (inline i assign n))

        rewriteLocal :: AssignmentMap
                     -> (Bool, [WithRegUsage CmmNode O O])
                     -> LocalReg -> CmmExpr -> RegUsage
                     -> Maybe (Graph (WithRegUsage CmmNode) O O)
        rewriteLocal _ (False, []) _ _ _ = Nothing
        rewriteLocal assign (i, xs) l e u = Just $ mkMiddles xs <*> mkMiddle n'
            where n' = AssignLocal l e' u
                  e' = if i then wrapRecExp (inlineExp assign) e else e
            -- inlinable check omitted, since we can always inline into
            -- assignments.

        inline :: Bool -> AssignmentMap -> CmmNode e x -> CmmNode e x
        inline False _ n = n
        inline True  _ n | not (inlinable n) = n -- see [CmmCall Inline Hack]
        inline True assign n = mapExpDeep (inlineExp assign) n

        inlineExp assign old@(CmmReg (CmmLocal r))
          = case lookupUFM assign r of
              Just (AlwaysInline x) -> x
              _ -> old
        inlineExp assign old@(CmmRegOff (CmmLocal r) i)
          = case lookupUFM assign r of
              Just (AlwaysInline x) ->
                case x of
                    (CmmRegOff r' i') -> CmmRegOff r' (i + i')
                    _ -> CmmMachOp (MO_Add rep) [x, CmmLit (CmmInt (fromIntegral i) rep)]
                          where rep = typeWidth (localRegType r)
              _ -> old
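        -- For example (an illustrative sketch): if the map sends r to
        -- AlwaysInline (CmmRegOff r' 8), an occurrence of
        -- CmmRegOff (CmmLocal r) 4 rewrites to CmmRegOff r' 12; if the
        -- inlined expression is not itself a CmmRegOff, we emit an
        -- explicit MO_Add against the literal offset instead.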
        -- See Note [Soundness of store rewriting]
        inlineExp _ old = old

        inlinable :: CmmNode e x -> Bool
        inlinable (CmmCall{}) = False
        inlinable (CmmForeignCall{}) = False
        inlinable (CmmUnsafeForeignCall{}) = False
        inlinable _ = True

-- Need to interleave this with inlining, because machop folding results
-- in literals, which we can inline more aggressively, and inlining
-- gives us opportunities for more folding.  However, we don't need any
-- facts to do MachOp folding.
machOpFoldRewrite :: Platform -> FwdRewrite FuelUniqSM (WithRegUsage CmmNode) a
machOpFoldRewrite platform = mkFRewrite3 first middle last
  where first _ _ = return Nothing
        middle :: WithRegUsage CmmNode O O -> a -> GenCmmReplGraph (WithRegUsage CmmNode) O O
        middle (Plain m) _ = return (fmap (mkMiddle . Plain) (foldNode m))
        middle (AssignLocal l e r) _ = return (fmap f (wrapRecExpM foldExp e))
            where f e' = mkMiddle (AssignLocal l e' r)
        last   :: WithRegUsage CmmNode O C -> a -> GenCmmReplGraph (WithRegUsage CmmNode) O C
        last (Plain l) _ = return (fmap (mkLast . Plain) (foldNode l))
        foldNode :: CmmNode e x -> Maybe (CmmNode e x)
        foldNode n = mapExpDeepM foldExp n
        foldExp (CmmMachOp op args) = cmmMachOpFoldM platform op args
        foldExp _ = Nothing
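
-- For example (an illustrative sketch): once constant inlining has
-- fired,
--      CmmMachOp (MO_Add W32) [CmmLit (CmmInt 1 W32), CmmLit (CmmInt 2 W32)]
-- can fold to CmmLit (CmmInt 3 W32), which is itself a literal and thus
-- eligible for further inlining on a later pass of the rewriter.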

-- ToDo: Outputable instance for UsageMap and AssignmentMap