BlockLayout.hs 35.2 KB
Newer Older
1 2 3 4
--
-- Copyright (c) 2018 Andreas Klebinger
--

5 6 7 8
{-# LANGUAGE CPP #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeFamilies #-}
9 10
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE FlexibleContexts #-}
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46

module BlockLayout
    ( sequenceTop )
where

#include "HsVersions.h"
import GhcPrelude

import Instruction
import NCGMonad
import CFG

import BlockId
import Cmm
import Hoopl.Collections
import Hoopl.Label

import DynFlags (gopt, GeneralFlag(..), DynFlags, backendMaintainsCfg)
import UniqFM
import Util
import Unique

import Digraph
import Outputable
import Maybes

-- DEBUGGING ONLY
--import Debug
--import Debug.Trace
import ListSetOps (removeDups)

import OrdList
import Data.List
import Data.Foldable (toList)

import qualified Data.Set as Set
47 48 49
import Data.STRef
import Control.Monad.ST.Strict
import Control.Monad (foldM)
50 51

{-
52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
  Note [CFG based code layout]
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  The major steps in placing blocks are as follow:
  * Compute a CFG based on the Cmm AST, see getCfgProc.
    This CFG will have edge weights representing a guess
    on how important they are.
  * After we convert Cmm to Asm we run `optimizeCFG` which
    adds a few more "educated guesses" to the equation.
  * Then we run loop analysis on the CFG (`loopInfo`) which tells us
    about loop headers, loop nesting levels and the sort.
  * Based on the CFG and loop information refine the edge weights
    in the CFG and normalize them relative to the most often visited
    node. (See `mkGlobalWeights`)
  * Feed this CFG into the block layout code (`sequenceTop`) in this
    module. Which will then produce a code layout based on the input weights.

69 70 71 72 73
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ~~~ Note [Chain based CFG serialization]
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  For additional information also look at
74
  https://gitlab.haskell.org/ghc/ghc/wikis/commentary/compiler/code-layout
75 76 77 78 79 80 81 82

  We have a CFG with edge weights based on which we try to place blocks next to
  each other.

  Edge weights not only represent likelyhood of control transfer between blocks
  but also how much a block would benefit from being placed sequentially after
  it's predecessor.
  For example blocks which are preceeded by an info table are more likely to end
83 84
  up in a different cache line than their predecessor and we can't eliminate the jump
  so there is less benefit to placing them sequentially.
85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103

  For example consider this example:

  A:  ...
      jmp cond D (weak successor)
      jmp B
  B:  ...
      jmp C
  C:  ...
      jmp X
  D:  ...
      jmp B (weak successor)

  We determine a block layout by building up chunks (calling them chains) of
  possible control flows for which blocks will be placed sequentially.

  Eg for our example we might end up with two chains like:
  [A->B->C->X],[D]. Blocks inside chains will always be placed sequentially.
  However there is no particular order in which chains are placed since
104
  (hopefully) the blocks for which sequentiality is important have already
105 106 107
  been placed in the same chain.

  -----------------------------------------------------------------------------
108
     1) First try to create a list of good chains.
109 110
  -----------------------------------------------------------------------------

111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
  Good chains are these which allow us to eliminate jump instructions.
  Which further eliminate often executed jumps first.

  We do so by:

  *)  Ignore edges which represent instructions which can not be replaced
      by fall through control flow. Primarily calls and edges to blocks which
      are prefixed by a info table we have to jump across.

  *)  Then process remaining edges in order of frequency taken and:

    +)  If source and target have not been placed build a new chain from them.

    +)  If source and target have been placed, and are ends of differing chains
        try to merge the two chains.
126

127 128
    +)  If one side of the edge is a end/front of a chain, add the other block of
        to edge to the same chain
129

130 131
        Eg if we look at edge (B -> C) and already have the chain (A -> B)
        then we extend the chain to (A -> B -> C).
132

133 134
    +)  If the edge was used to modify or build a new chain remove the edge from
        our working list.
135

136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
  *) If there any blocks not being placed into a chain after these steps we place
     them into a chain consisting of only this block.

  Ranking edges by their taken frequency, if
  two edges compete for fall through on the same target block, the one taken
  more often will automatically win out. Resulting in fewer instructions being
  executed.

  Creating singleton chains is required for situations where we have code of the
  form:

    A: goto B:
    <infoTable>
    B: goto C:
    <infoTable>
    C: ...

  As the code in block B is only connected to the rest of the program via edges
  which will be ignored in this step we make sure that B still ends up in a chain
  this way.
156 157

  -----------------------------------------------------------------------------
158
     2) We also try to fuse chains.
159 160
  -----------------------------------------------------------------------------

161 162 163
  As a result from the above step we still end up with multiple chains which
  represent sequential control flow chunks. But they are not yet suitable for
  code layout as we need to place *all* blocks into a single sequence.
164

165
  In this step we combine chains result from the above step via these steps:
166

167 168
  *)  Look at the ranked list of *all* edges, including calls/jumps across info tables
      and the like.
169

170
  *)  Look at each edge and
171

172 173 174 175 176 177
    +) Given an edge (A -> B) try to find two chains for which
      * Block A is at the end of one chain
      * Block B is at the front of the other chain.
    +) If we find such a chain we "fuse" them into a single chain, remove the
       edge from working set and continue.
    +) If we can't find such chains we skip the edge and continue.
178 179

  -----------------------------------------------------------------------------
180
     3) Place indirect successors (neighbours) after each other
181 182 183 184 185 186 187 188 189 190 191 192 193 194
  -----------------------------------------------------------------------------

  We might have chains [A,B,C,X],[E] in a CFG of the sort:

    A ---> B ---> C --------> X(exit)
                   \- ->E- -/

  While E does not follow X it's still beneficial to place them near each other.
  This can be advantageous if eg C,X,E will end up in the same cache line.

  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ~~~ Note [Triangle Control Flow]
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

195
  Checking if an argument is already evaluated leads to a somewhat
196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249
  special case  which looks like this:

    A:
        if (R1 & 7 != 0) goto Leval; else goto Lwork;
    Leval: // global
        call (I64[R1])(R1) returns to Lwork, args: 8, res: 8, upd: 8;
    Lwork: // global
        ...

        A
        |\
        | Leval
        |/ - (This edge can be missing because of optimizations)
        Lwork

  Once we hit the metal the call instruction is just 2-3 bytes large
  depending on the register used. So we lay out the assembly like this:

        movq %rbx,%rax
        andl $7,%eax
        cmpq $1,%rax
        jne Lwork
    Leval:
        jmp *(%rbx) # encoded in 2-3 bytes.
    <info table>
    Lwork:
        ...

  We could explicitly check for this control flow pattern.

  This is advantageous because:
  * It's optimal if the argument isn't evaluated.
  * If it's evaluated we only have the extra cost of jumping over
    the 2-3 bytes for the call.
  * Guarantees the smaller encoding for the conditional jump.

  However given that Lwork usually has an info table we
  penalize this edge. So Leval should get placed first
  either way and things work out for the best.

  Optimizing for the evaluated case instead would penalize
  the other code path. It adds an jump as we can't fall through
  to Lwork because of the info table.
  Assuming that Lwork is large the chance that the "call" ends up
  in the same cache line is also fairly small.

-}


-- | Look at X number of blocks in two chains to determine
--   if they are "neighbours".
neighbourOverlapp :: Int
neighbourOverlapp = 2

250 251 252 253 254 255
-- | Maps blocks near the end of a chain to it's chain AND
-- the other blocks near the end.
-- [A,B,C,D,E] Gives entries like (B -> ([A,B], [A,B,C,D,E]))
-- where [A,B] are blocks in the end region of a chain.
-- This is cheaper then recomputing the ends multiple times.
type FrontierMap = LabelMap ([BlockId],BlockChain)
256 257 258

-- | A non empty ordered sequence of basic blocks.
--   It is suitable for serialization in this order.
259 260 261
--
--   We use OrdList instead of [] to allow fast append on both sides
--   when combining chains.
262 263
newtype BlockChain
    = BlockChain { chainBlocks :: (OrdList BlockId) }
264

265 266 267 268
-- All chains are constructed the same way so comparison
-- including structure is faster.
instance Eq BlockChain where
    BlockChain b1 == BlockChain b2 = strictlyEqOL b1 b2
269 270 271 272 273

-- Useful for things like sets and debugging purposes, sorts by blocks
-- in the chain.
instance Ord (BlockChain) where
   (BlockChain lbls1) `compare` (BlockChain lbls2)
274 275
       = ASSERT(toList lbls1 /= toList lbls2 || lbls1 `strictlyEqOL` lbls2)
         strictlyOrdOL lbls1 lbls2
276 277

instance Outputable (BlockChain) where
278
    ppr (BlockChain blks) =
279
        parens (text "Chain:" <+> ppr (fromOL $ blks) )
280

281 282
chainFoldl :: (b -> BlockId -> b) -> b -> BlockChain -> b
chainFoldl f z (BlockChain blocks) = foldl' f z blocks
283 284 285 286 287 288 289 290 291

noDups :: [BlockChain] -> Bool
noDups chains =
    let chainBlocks = concatMap chainToBlocks chains :: [BlockId]
        (_blocks, dups) = removeDups compare chainBlocks
    in if null dups then True
        else pprTrace "Duplicates:" (ppr (map toList dups) $$ text "chains" <+> ppr chains ) False

inFront :: BlockId -> BlockChain -> Bool
292
inFront bid (BlockChain seq)
293
  = headOL seq == bid
294 295 296

chainSingleton :: BlockId -> BlockChain
chainSingleton lbl
297
    = BlockChain (unitOL lbl)
298

299 300 301
chainFromList :: [BlockId] -> BlockChain
chainFromList = BlockChain . toOL

302
chainSnoc :: BlockChain -> BlockId -> BlockChain
303 304
chainSnoc (BlockChain blks) lbl
  = BlockChain (blks `snocOL` lbl)
305

306 307 308 309
chainCons :: BlockId -> BlockChain -> BlockChain
chainCons lbl (BlockChain blks)
  = BlockChain (lbl `consOL` blks)

310
chainConcat :: BlockChain -> BlockChain -> BlockChain
311 312
chainConcat (BlockChain blks1) (BlockChain blks2)
  = BlockChain (blks1 `appOL` blks2)
313 314

chainToBlocks :: BlockChain -> [BlockId]
315
chainToBlocks (BlockChain blks) = fromOL blks
316 317 318 319 320

-- | Given the Chain A -> B -> C -> D and we break at C
--   we get the two Chains (A -> B, C -> D) as result.
breakChainAt :: BlockId -> BlockChain
             -> (BlockChain,BlockChain)
321 322
breakChainAt bid (BlockChain blks)
    | not (bid == head rblks)
323 324
    = panic "Block not in chain"
    | otherwise
325 326 327 328
    = (BlockChain (toOL lblks),
       BlockChain (toOL rblks))
  where
    (lblks, rblks) = break (\lbl -> lbl == bid) (fromOL blks)
329 330

takeR :: Int -> BlockChain -> [BlockId]
331
takeR n (BlockChain blks) =
332
    take n . fromOLReverse $ blks
333 334

takeL :: Int -> BlockChain -> [BlockId]
335
takeL n (BlockChain blks) =
336
    take n . fromOL $ blks
337

338 339
-- Note [Combining neighborhood chains]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
340 341 342 343

-- See also Note [Chain based CFG serialization]
-- We have the chains (A-B-C-D) and (E-F) and an Edge C->E.
--
344 345
-- While placing the latter after the former doesn't result in sequential
-- control flow it is still benefical. As block C and E might end
346 347 348 349 350 351 352 353
-- up in the same cache line.
--
-- So we place these chains next to each other even if we can't fuse them.
--
--   A -> B -> C -> D
--             v
--             - -> E -> F ...
--
354
-- A simple heuristic to chose which chains we want to combine:
355 356 357 358 359 360 361 362 363
--   * Process edges in descending priority.
--   * Check if there is a edge near the end of one chain which goes
--     to a block near the start of another edge.
--
-- While we could take into account the space between the two blocks which
-- share an edge this blows up compile times quite a bit. It requires
-- us to find all edges between two chains, check the distance for all edges,
-- rank them based on the distance and and only then we can select two chains
-- to combine. Which would add a lot of complexity for little gain.
364 365 366
--
-- So instead we just rank by the strength of the edge and use the first pair we
-- find.
367 368 369

-- | For a given list of chains and edges try to combine chains with strong
--   edges between them.
370 371 372 373 374 375
combineNeighbourhood  :: [CfgEdge] -- ^ Edges to consider
                      -> [BlockChain] -- ^ Current chains of blocks
                      -> ([BlockChain], Set.Set (BlockId,BlockId))
                      -- ^ Resulting list of block chains, and a set of edges which
                      -- were used to fuse chains and as such no longer need to be
                      -- considered.
376 377
combineNeighbourhood edges chains
    = -- pprTraceIt "Neigbours" $
378 379
    --   pprTrace "combineNeighbours" (ppr edges) $
      applyEdges edges endFrontier startFrontier (Set.empty)
380 381 382 383 384
    where
        --Build maps from chain ends to chains
        endFrontier, startFrontier :: FrontierMap
        endFrontier =
            mapFromList $ concatMap (\chain ->
385
                                let ends = getEnds chain :: [BlockId]
386 387 388 389 390 391 392
                                    entry = (ends,chain)
                                in map (\x -> (x,entry)) ends ) chains
        startFrontier =
            mapFromList $ concatMap (\chain ->
                                let front = getFronts chain
                                    entry = (front,chain)
                                in map (\x -> (x,entry)) front) chains
393 394 395 396 397
        applyEdges :: [CfgEdge] -> FrontierMap -> FrontierMap -> Set.Set (BlockId, BlockId)
                   -> ([BlockChain], Set.Set (BlockId,BlockId))
        applyEdges [] chainEnds _chainFronts combined =
            (ordNub $ map snd $ mapElems chainEnds, combined)
        applyEdges ((CfgEdge from to _w):edges) chainEnds chainFronts combined
398 399
            | Just (c1_e,c1) <- mapLookup from chainEnds
            , Just (c2_f,c2) <- mapLookup to chainFronts
400
            , c1 /= c2 -- Avoid trying to concat a chain with itself.
401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433
            = let newChain = chainConcat c1 c2
                  newChainFrontier = getFronts newChain
                  newChainEnds = getEnds newChain
                  newFronts :: FrontierMap
                  newFronts =
                    let withoutOld =
                            foldl' (\m b -> mapDelete b m :: FrontierMap) chainFronts (c2_f ++ getFronts c1)
                        entry =
                            (newChainFrontier,newChain) --let bound to ensure sharing
                    in foldl' (\m x -> mapInsert x entry m)
                              withoutOld newChainFrontier

                  newEnds =
                    let withoutOld = foldl' (\m b -> mapDelete b m) chainEnds (c1_e ++ getEnds c2)
                        entry = (newChainEnds,newChain) --let bound to ensure sharing
                    in foldl' (\m x -> mapInsert x entry m)
                              withoutOld newChainEnds
              in
                -- pprTrace "ApplyEdges"
                --  (text "before" $$
                --   text "fronts" <+> ppr chainFronts $$
                --   text "ends" <+> ppr chainEnds $$

                --   text "various" $$
                --   text "newChain" <+> ppr newChain $$
                --   text "newChainFrontier" <+> ppr newChainFrontier $$
                --   text "newChainEnds" <+> ppr newChainEnds $$
                --   text "drop" <+> ppr ((c2_f ++ getFronts c1) ++ (c1_e ++ getEnds c2)) $$

                --   text "after" $$
                --   text "fronts" <+> ppr newFronts $$
                --   text "ends" <+> ppr newEnds
                --   )
434
                 applyEdges edges newEnds newFronts (Set.insert (from,to) combined)
435
            | otherwise
436
            = applyEdges edges chainEnds chainFronts combined
437 438 439 440 441
         where

        getFronts chain = takeL neighbourOverlapp chain
        getEnds chain = takeR neighbourOverlapp chain

442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502
-- In the last stop we combine all chains into a single one.
-- Trying to place chains with strong edges next to each other.
mergeChains :: [CfgEdge] -> [BlockChain]
            -> (BlockChain)
mergeChains edges chains
    = -- pprTrace "combine" (ppr edges) $
      runST $ do
        let addChain m0 chain = do
                ref <- newSTRef chain
                return $ chainFoldl (\m' b -> mapInsert b ref m') m0 chain
        chainMap' <- foldM (\m0 c -> addChain m0 c) mapEmpty chains
        merge edges chainMap'
    where
        -- We keep a map from ALL blocks to their respective chain (sigh)
        -- This is required since when looking at an edge we need to find
        -- the associated chains quickly.
        -- We use a map of STRefs, maintaining a invariant of one STRef per chain.
        -- When merging chains we can update the
        -- STRef of one chain once (instead of writing to the map for each block).
        -- We then overwrite the STRefs for the other chain so there is again only
        -- a single STRef for the combined chain.
        -- The difference in terms of allocations saved is ~0.2% with -O so actually
        -- significant compared to using a regular map.

        merge :: forall s. [CfgEdge] -> LabelMap (STRef s BlockChain) -> ST s BlockChain
        merge [] chains = do
            chains' <- ordNub <$> (mapM readSTRef $ mapElems chains) :: ST s [BlockChain]
            return $ foldl' chainConcat (head chains') (tail chains')
        merge ((CfgEdge from to _):edges) chains
        --   | pprTrace "merge" (ppr (from,to) <> ppr chains) False
        --   = undefined
          | cFrom == cTo
          = merge edges chains
          | otherwise
          = do
            chains' <- mergeComb cFrom cTo
            merge edges chains'
          where
            mergeComb :: STRef s BlockChain -> STRef s BlockChain -> ST s (LabelMap (STRef s BlockChain))
            mergeComb refFrom refTo = do
                cRight <- readSTRef refTo
                chain <- pure chainConcat <*> readSTRef refFrom <*> pure cRight
                writeSTRef refFrom chain
                return $ chainFoldl (\m b -> mapInsert b refFrom m) chains cRight

            cFrom = expectJust "mergeChains:chainMap:from" $ mapLookup from chains
            cTo = expectJust "mergeChains:chainMap:to"   $ mapLookup to   chains


-- See Note [Chain based CFG serialization] for the general idea.
-- This creates and fuses chains at the same time for performance reasons.

-- Try to build chains from a list of edges.
-- Edges must be sorted **descending** by their priority.
-- Returns the constructed chains, along with all edges which
-- are irrelevant past this point, this information doesn't need
-- to be complete - it's only used to speed up the process.
-- An Edge is irrelevant if the ends are part of the same chain.
-- We say these edges are already linked
buildChains :: [CfgEdge] -> [BlockId]
            -> ( LabelMap BlockChain  -- Resulting chains, indexd by end if chain.
503
               , Set.Set (BlockId, BlockId)) --List of fused edges.
504 505
buildChains edges blocks
  = runST $ buildNext setEmpty mapEmpty mapEmpty edges Set.empty
506
  where
507 508 509 510
    -- buildNext builds up chains from edges one at a time.

    -- We keep a map from the ends of chains to the chains.
    -- This we we can easily check if an block should be appended to an
511
    -- existing chain!
512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558
    -- We store them using STRefs so we don't have to rebuild the spine of both
    -- maps every time we update a chain.
    buildNext :: forall s. LabelSet
              -> LabelMap (STRef s BlockChain) -- Map from end of chain to chain.
              -> LabelMap (STRef s BlockChain) -- Map from start of chain to chain.
              -> [CfgEdge] -- Edges to check - ordered by decreasing weight
              -> Set.Set (BlockId, BlockId) -- Used edges
              -> ST s   ( LabelMap BlockChain -- Chains by end
                        , Set.Set (BlockId, BlockId) --List of fused edges
                        )
    buildNext placed _chainStarts chainEnds  [] linked = do
        ends' <- sequence $ mapMap readSTRef chainEnds :: ST s (LabelMap BlockChain)
        -- Any remaining blocks have to be made to singleton chains.
        -- They might be combined with other chains later on outside this function.
        let unplaced = filter (\x -> not (setMember x placed)) blocks
            singletons = map (\x -> (x,chainSingleton x)) unplaced :: [(BlockId,BlockChain)]
        return (foldl' (\m (k,v) -> mapInsert k v m) ends' singletons , linked)
    buildNext placed chainStarts chainEnds (edge:todo) linked
        | from == to
        -- We skip self edges
        = buildNext placed chainStarts chainEnds todo (Set.insert (from,to) linked)
        | not (alreadyPlaced from) &&
          not (alreadyPlaced to)
        = do
            --pprTraceM "Edge-Chain:" (ppr edge)
            chain' <- newSTRef $ chainFromList [from,to]
            buildNext
                (setInsert to (setInsert from placed))
                (mapInsert from chain' chainStarts)
                (mapInsert to chain' chainEnds)
                todo
                (Set.insert (from,to) linked)

        | (alreadyPlaced from) &&
          (alreadyPlaced to)
        , Just predChain <- mapLookup from chainEnds
        , Just succChain <- mapLookup to chainStarts
        , predChain /= succChain -- Otherwise we try to create a cycle.
        = do
            -- pprTraceM "Fusing edge" (ppr edge)
            fuseChain predChain succChain

        | (alreadyPlaced from) &&
          (alreadyPlaced to)
        =   --pprTraceM "Skipping:" (ppr edge) >>
            buildNext placed chainStarts chainEnds todo linked

559
        | otherwise
560 561 562
        = do -- pprTraceM "Finding chain for:" (ppr edge $$
             --         text "placed" <+> ppr placed)
             findChain
563
      where
564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617
        from = edgeFrom edge
        to   = edgeTo   edge
        alreadyPlaced blkId = (setMember blkId placed)

        -- Combine two chains into a single one.
        fuseChain :: STRef s BlockChain -> STRef s BlockChain
                  -> ST s   ( LabelMap BlockChain -- Chains by end
                            , Set.Set (BlockId, BlockId) --List of fused edges
                            )
        fuseChain fromRef toRef = do
            fromChain <- readSTRef fromRef
            toChain <- readSTRef toRef
            let newChain = chainConcat fromChain toChain
            ref <- newSTRef newChain
            let start = head $ takeL 1 newChain
            let end = head $ takeR 1 newChain
            -- chains <- sequence $ mapMap readSTRef chainStarts
            -- pprTraceM "pre-fuse chains:" $ ppr chains
            buildNext
                placed
                (mapInsert start ref $ mapDelete to $ chainStarts)
                (mapInsert end ref $ mapDelete from $ chainEnds)
                todo
                (Set.insert (from,to) linked)


        --Add the block to a existing chain or creates a new chain
        findChain :: ST s   ( LabelMap BlockChain -- Chains by end
                            , Set.Set (BlockId, BlockId) --List of fused edges
                            )
        findChain
          -- We can attach the block to the end of a chain
          | alreadyPlaced from
          , Just predChain <- mapLookup from chainEnds
          = do
            chain <- readSTRef predChain
            let newChain = chainSnoc chain to
            writeSTRef predChain newChain
            let chainEnds' = mapInsert to predChain $ mapDelete from chainEnds
            -- chains <- sequence $ mapMap readSTRef chainStarts
            -- pprTraceM "from chains:" $ ppr chains
            buildNext (setInsert to placed) chainStarts chainEnds' todo (Set.insert (from,to) linked)
          -- We can attack it to the front of a chain
          | alreadyPlaced to
          , Just succChain <- mapLookup to chainStarts
          = do
            chain <- readSTRef succChain
            let newChain = from `chainCons` chain
            writeSTRef succChain newChain
            let chainStarts' = mapInsert from succChain $ mapDelete to chainStarts
            -- chains <- sequence $ mapMap readSTRef chainStarts'
            -- pprTraceM "to chains:" $ ppr chains
            buildNext (setInsert from placed) chainStarts' chainEnds todo (Set.insert (from,to) linked)
          -- The placed end of the edge is part of a chain already and not an end.
618
          | otherwise
619 620 621 622 623 624 625
          = do
            let block    = if alreadyPlaced to then from else to
            --pprTraceM "Singleton" $ ppr block
            let newChain = chainSingleton block
            ref <- newSTRef newChain
            buildNext (setInsert block placed) (mapInsert block ref chainStarts)
                      (mapInsert block ref chainEnds) todo (linked)
626 627 628
            where
              alreadyPlaced blkId = (setMember blkId placed)

629 630 631 632 633 634 635
-- | Place basic blocks based on the given CFG.
-- See Note [Chain based CFG serialization]
sequenceChain :: forall a i. (Instruction i, Outputable i)
              => LabelMap a -- ^ Keys indicate an info table on the block.
              -> CFG -- ^ Control flow graph and some meta data.
              -> [GenBasicBlock i] -- ^ List of basic blocks to be placed.
              -> [GenBasicBlock i] -- ^ Blocks placed in sequence.
636 637 638 639
sequenceChain _info _weights    [] = []
sequenceChain _info _weights    [x] = [x]
sequenceChain  info weights'     blocks@((BasicBlock entry _):_) =
    let weights :: CFG
640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667
        weights = --pprTrace "cfg'" (pprEdgeWeights cfg')
                  cfg'
          where
            (_, globalEdgeWeights) = {-# SCC mkGlobalWeights #-} mkGlobalWeights entry weights'
            cfg' = {-# SCC rewriteEdges #-}
                    mapFoldlWithKey
                        (\cfg from m ->
                            mapFoldlWithKey
                                (\cfg to w -> setEdgeWeight cfg (EdgeWeight w) from to )
                                cfg m )
                        weights'
                        globalEdgeWeights

        directEdges :: [CfgEdge]
        directEdges = sortBy (flip compare) $ catMaybes . map relevantWeight $ (infoEdgeList weights)
          where
            relevantWeight :: CfgEdge -> Maybe CfgEdge
            relevantWeight edge@(CfgEdge from to edgeInfo)
                | (EdgeInfo CmmSource { trans_cmmNode = CmmCall {} } _) <- edgeInfo
                -- Ignore edges across calls
                = Nothing
                | mapMember to info
                , w <- edgeWeight edgeInfo
                -- The payoff is small if we jump over an info table
                = Just (CfgEdge from to edgeInfo { edgeWeight = w/8 })
                | otherwise
                = Just edge

668 669 670 671 672 673 674 675 676
        blockMap :: LabelMap (GenBasicBlock i)
        blockMap
            = foldl' (\m blk@(BasicBlock lbl _ins) ->
                        mapInsert lbl blk m)
                     mapEmpty blocks

        (builtChains, builtEdges)
            = {-# SCC "buildChains" #-}
              --pprTraceIt "generatedChains" $
677 678
              --pprTrace "blocks" (ppr (mapKeys blockMap)) $
              buildChains directEdges (mapKeys blockMap)
679

680 681
        rankedEdges :: [CfgEdge]
        -- Sort descending by weight, remove fused edges
682
        rankedEdges =
683 684
            filter (\edge -> not (Set.member (edgeFrom edge,edgeTo edge) builtEdges)) $
            directEdges
685

686
        (neighbourChains, combined)
687 688
            = ASSERT(noDups $ mapElems builtChains)
              {-# SCC "groupNeighbourChains" #-}
689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713
            --   pprTraceIt "NeighbourChains" $
              combineNeighbourhood rankedEdges (mapElems builtChains)


        allEdges :: [CfgEdge]
        allEdges = {-# SCC allEdges #-}
                   sortOn (relevantWeight) $ filter (not . deadEdge) $ (infoEdgeList weights)
          where
            deadEdge :: CfgEdge -> Bool
            deadEdge (CfgEdge from to _) = let e = (from,to) in Set.member e combined || Set.member e builtEdges
            relevantWeight :: CfgEdge -> EdgeWeight
            relevantWeight (CfgEdge _ _ edgeInfo)
                | EdgeInfo (CmmSource { trans_cmmNode = CmmCall {}}) _ <- edgeInfo
                -- Penalize edges across calls
                = weight/(64.0)
                | otherwise
                = weight
              where
                -- negate to sort descending
                weight = negate (edgeWeight edgeInfo)

        masterChain =
            {-# SCC "mergeChains" #-}
            -- pprTraceIt "MergedChains" $
            mergeChains allEdges neighbourChains
714 715

        --Make sure the first block stays first
716 717 718 719
        prepedChains
            | inFront entry masterChain
            = [masterChain]
            | (rest,entry) <- breakChainAt entry masterChain
720 721
            = [entry,rest]
            | otherwise = pprPanic "Entry point eliminated" $
722
                            ppr masterChain
723 724

        blockList
725 726
            = ASSERT(noDups [masterChain])
              (concatMap fromOL $ map chainBlocks prepedChains)
727 728 729 730 731 732 733 734 735

        --chainPlaced = setFromList $ map blockId blockList :: LabelSet
        chainPlaced = setFromList $ blockList :: LabelSet
        unplaced =
            let blocks = mapKeys blockMap
                isPlaced b = setMember (b) chainPlaced
            in filter (\block -> not (isPlaced block)) blocks

        placedBlocks =
736 737 738 739
            -- We want debug builds to catch this as it's a good indicator for
            -- issues with CFG invariants. But we don't want to blow up production
            -- builds if something slips through.
            ASSERT(null unplaced)
740
            --pprTraceIt "placedBlocks" $
741 742
            -- ++ [] is stil kinda expensive
            if null unplaced then blockList else blockList ++ unplaced
743 744 745 746 747 748
        getBlock bid = expectJust "Block placment" $ mapLookup bid blockMap
    in
        --Assert we placed all blocks given as input
        ASSERT(all (\bid -> mapMember bid blockMap) placedBlocks)
        dropJumps info $ map getBlock placedBlocks

749 750 751
{-# SCC dropJumps #-}
-- | Remove redundant jumps between blocks when we can rely on
-- fall through.
752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776
dropJumps :: forall a i. Instruction i => LabelMap a -> [GenBasicBlock i]
          -> [GenBasicBlock i]
dropJumps _    [] = []
dropJumps info ((BasicBlock lbl ins):todo)
    | not . null $ ins --This can happen because of shortcutting
    , [dest] <- jumpDestsOfInstr (last ins)
    , ((BasicBlock nextLbl _) : _) <- todo
    , not (mapMember dest info)
    , nextLbl == dest
    = BasicBlock lbl (init ins) : dropJumps info todo
    | otherwise
    = BasicBlock lbl ins : dropJumps info todo


-- -----------------------------------------------------------------------------
-- Sequencing the basic blocks

-- Cmm BasicBlocks are self-contained entities: they always end in a
-- jump, either non-local or to another basic block in the same proc.
-- In this phase, we attempt to place the basic blocks in a sequence
-- such that as many of the local jumps as possible turn into
-- fallthroughs.

sequenceTop
    :: (Instruction instr, Outputable instr)
777 778 779
    => DynFlags -- Determine which layout algo to use
    -> NcgImpl statics instr jumpDest
    -> Maybe CFG -- ^ CFG if we have one.
780 781
    -> NatCmmDecl statics instr -- ^ Function to serialize
    -> NatCmmDecl statics instr
782 783 784 785 786 787

sequenceTop _     _       _           top@(CmmData _ _) = top
sequenceTop dflags ncgImpl edgeWeights
            (CmmProc info lbl live (ListGraph blocks))
  | (gopt Opt_CfgBlocklayout dflags) && backendMaintainsCfg dflags
  --Use chain based algorithm
788
  , Just cfg <- edgeWeights
789
  = CmmProc info lbl live ( ListGraph $ ncgMakeFarBranches ncgImpl info $
790
                            {-# SCC layoutBlocks #-}
791
                            sequenceChain info cfg blocks )
792 793
  | otherwise
  --Use old algorithm
794 795
  = let cfg = if dontUseCfg then Nothing else edgeWeights
    in  CmmProc info lbl live ( ListGraph $ ncgMakeFarBranches ncgImpl info $
796
                                {-# SCC layoutBlocks #-}
797
                                sequenceBlocks cfg info blocks)
798
  where
799 800
    dontUseCfg = gopt Opt_WeightlessBlocklayout dflags ||
                 (not $ backendMaintainsCfg dflags)
801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894

-- The old algorithm:
-- It is very simple (and stupid): We make a graph out of
-- the blocks where there is an edge from one block to another iff the
-- first block ends by jumping to the second.  Then we topologically
-- sort this graph.  Then traverse the list: for each block, we first
-- output the block, then if it has an out edge, we move the
-- destination of the out edge to the front of the list, and continue.

-- FYI, the classic layout for basic blocks uses postorder DFS; this
-- algorithm is implemented in Hoopl.

sequenceBlocks :: Instruction inst => Maybe CFG -> LabelMap a
               -> [GenBasicBlock inst] -> [GenBasicBlock inst]
sequenceBlocks _edgeWeight _ [] = []
sequenceBlocks edgeWeights infos (entry:blocks) =
    let entryNode = mkNode edgeWeights entry
        bodyNodes = reverse
                    (flattenSCCs (sccBlocks edgeWeights blocks))
    in dropJumps infos . seqBlocks infos $ ( entryNode : bodyNodes)
  -- the first block is the entry point ==> it must remain at the start.

sccBlocks
        :: Instruction instr
        => Maybe CFG -> [NatBasicBlock instr]
        -> [SCC (Node BlockId (NatBasicBlock instr))]
sccBlocks edgeWeights blocks =
    stronglyConnCompFromEdgedVerticesUniqR
        (map (mkNode edgeWeights) blocks)

mkNode :: (Instruction t)
       => Maybe CFG -> GenBasicBlock t
       -> Node BlockId (GenBasicBlock t)
mkNode edgeWeights block@(BasicBlock id instrs) =
    DigraphNode block id outEdges
  where
    outEdges :: [BlockId]
    outEdges
      --Select the heaviest successor, ignore weights <= zero
      = successor
      where
        successor
          | Just successors <- fmap (`getSuccEdgesSorted` id)
                                    edgeWeights -- :: Maybe [(Label, EdgeInfo)]
          = case successors of
            [] -> []
            ((target,info):_)
              | length successors > 2 || edgeWeight info <= 0 -> []
              | otherwise -> [target]
          | otherwise
          = case jumpDestsOfInstr (last instrs) of
                [one] -> [one]
                _many -> []


seqBlocks :: LabelMap i -> [Node BlockId (GenBasicBlock t1)]
                        -> [GenBasicBlock t1]
seqBlocks infos blocks = placeNext pullable0 todo0
  where
    -- pullable: Blocks that are not yet placed
    -- todo:     Original order of blocks, to be followed if we have no good
    --           reason not to;
    --           may include blocks that have already been placed, but then
    --           these are not in pullable
    pullable0 = listToUFM [ (i,(b,n)) | DigraphNode b i n <- blocks ]
    todo0     = map node_key blocks

    placeNext _ [] = []
    placeNext pullable (i:rest)
        | Just (block, pullable') <- lookupDeleteUFM pullable i
        = place pullable' rest block
        | otherwise
        -- We already placed this block, so ignore
        = placeNext pullable rest

    place pullable todo (block,[])
                          = block : placeNext pullable todo
    place pullable todo (block@(BasicBlock id instrs),[next])
        | mapMember next infos
        = block : placeNext pullable todo
        | Just (nextBlock, pullable') <- lookupDeleteUFM pullable next
        = BasicBlock id instrs : place pullable' todo nextBlock
        | otherwise
        = block : placeNext pullable todo
    place _ _ (_,tooManyNextNodes)
        = pprPanic "seqBlocks" (ppr tooManyNextNodes)


lookupDeleteUFM :: Uniquable key => UniqFM elt -> key
                -> Maybe (elt, UniqFM elt)
lookupDeleteUFM m k = do -- Maybe monad
    v <- lookupUFM m k
    return (v, delFromUFM m k)