CFG.hs 49.5 KB
Newer Older
1 2 3 4 5 6 7 8
--
-- Copyright (c) 2018 Andreas Klebinger
--

{-# LANGUAGE TypeFamilies, ScopedTypeVariables #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE CPP #-}
9 10 11
{-# LANGUAGE Rank2Types #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DataKinds #-}
12 13 14 15 16 17

module CFG
    ( CFG, CfgEdge(..), EdgeInfo(..), EdgeWeight(..)
    , TransitionSource(..)

    --Modify the CFG
18 19
    , addWeightEdge, addEdge
    , delEdge, delNode
20 21 22
    , addNodesBetween, shortcutWeightMap
    , reverseEdges, filterEdges
    , addImmediateSuccessor
23
    , mkWeightInfo, adjustEdgeWeight, setEdgeWeight
24 25 26 27

    --Query the CFG
    , infoEdgeList, edgeList
    , getSuccessorEdges, getSuccessors
28
    , getSuccEdgesSorted
29 30
    , getEdgeInfo
    , getCfgNodes, hasNode
31 32 33

    -- Loop Information
    , loopMembers, loopLevels, loopInfo
34 35 36 37 38

    --Construction/Misc
    , getCfg, getCfgProc, pprEdgeWeights, sanityCheckCfg

    --Find backedges and update their weight
39 40 41 42
    , optimizeCFG
    , mkGlobalWeights

     )
43 44 45 46 47 48 49
where

#include "HsVersions.h"

import GhcPrelude

import BlockId
50 51
import Cmm

52 53 54 55 56 57 58 59 60
import CmmUtils
import CmmSwitch
import Hoopl.Collections
import Hoopl.Label
import Hoopl.Block
import qualified Hoopl.Graph as G

import Util
import Digraph
61 62 63 64 65 66 67 68 69 70 71 72 73
import Maybes

import Unique
import qualified Dominators as Dom
import Data.IntMap.Strict (IntMap)
import Data.IntSet (IntSet)

import qualified Data.IntMap.Strict as IM
import qualified Data.Map as M
import qualified Data.IntSet as IS
import qualified Data.Set as S
import Data.Tree
import Data.Bifunctor
74 75 76 77

import Outputable
-- DEBUGGING ONLY
--import Debug
78
-- import Debug.Trace
79 80
--import OrdList
--import Debug.Trace
81
import PprCmm () -- For Outputable instances
82 83 84 85
import qualified DynFlags as D

import Data.List

86 87 88 89 90 91 92 93 94 95 96 97
import Data.STRef.Strict
import Control.Monad.ST

import Data.Array.MArray
import Data.Array.ST
import Data.Array.IArray
import Data.Array.Unsafe (unsafeFreeze)
import Data.Array.Base (unsafeRead, unsafeWrite)

import Control.Monad

type Prob = Double
98 99 100 101 102

type Edge = (BlockId, BlockId)
type Edges = [Edge]

newtype EdgeWeight
103 104
  = EdgeWeight { weightToDouble :: Double }
  deriving (Eq,Ord,Enum,Num,Real,Fractional)
105 106

instance Outputable EdgeWeight where
107
  ppr (EdgeWeight w) = doublePrec 5 w
108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143

type EdgeInfoMap edgeInfo = LabelMap (LabelMap edgeInfo)

-- | A control flow graph where edges have been annotated with a weight.
type CFG = EdgeInfoMap EdgeInfo

data CfgEdge
  = CfgEdge
  { edgeFrom :: !BlockId
  , edgeTo :: !BlockId
  , edgeInfo :: !EdgeInfo
  }

-- | Careful! Since we assume there is at most one edge from A to B
--   the Eq instance does not consider weight.
instance Eq CfgEdge where
  (==) (CfgEdge from1 to1 _) (CfgEdge from2 to2 _)
    = from1 == from2 && to1 == to2

-- | Edges are sorted ascending pointwise by weight, source and destination
instance Ord CfgEdge where
  compare (CfgEdge from1 to1 (EdgeInfo {edgeWeight = weight1}))
          (CfgEdge from2 to2 (EdgeInfo {edgeWeight = weight2}))
    | weight1 < weight2 || weight1 == weight2 && from1 < from2 ||
      weight1 == weight2 && from1 == from2 && to1 < to2
    = LT
    | from1 == from2 && to1 == to2 && weight1 == weight2
    = EQ
    | otherwise
    = GT

instance Outputable CfgEdge where
  ppr (CfgEdge from1 to1 edgeInfo)
    = parens (ppr from1 <+> text "-(" <> ppr edgeInfo <> text ")->" <+> ppr to1)

-- | Can we trace back a edge to a specific Cmm Node
144
-- or has it been introduced during assembly codegen. We use this to maintain
145 146 147 148
-- some information which would otherwise be lost during the
-- Cmm <-> asm transition.
-- See also Note [Inverting Conditional Branches]
data TransitionSource
149 150
  = CmmSource { trans_cmmNode :: (CmmNode O C)
              , trans_info :: BranchInfo }
151 152 153
  | AsmCodeGen
  deriving (Eq)

154 155 156 157 158 159 160 161 162 163 164 165
data BranchInfo = NoInfo         -- ^ Unknown, but not heap or stack check.
                | HeapStackCheck -- ^ Heap or stack check
    deriving Eq

instance Outputable BranchInfo where
    ppr NoInfo = text "regular"
    ppr HeapStackCheck = text "heap/stack"

isHeapOrStackCheck :: TransitionSource -> Bool
isHeapOrStackCheck (CmmSource { trans_info = HeapStackCheck}) = True
isHeapOrStackCheck _ = False

166 167 168 169 170 171 172 173 174 175 176 177
-- | Information about edges
data EdgeInfo
  = EdgeInfo
  { transitionSource :: !TransitionSource
  , edgeWeight :: !EdgeWeight
  } deriving (Eq)

instance Outputable EdgeInfo where
  ppr edgeInfo = text "weight:" <+> ppr (edgeWeight edgeInfo)

-- | Convenience function, generate edge info based
--   on weight not originating from cmm.
178 179
mkWeightInfo :: EdgeWeight -> EdgeInfo
mkWeightInfo = EdgeInfo AsmCodeGen
180 181 182 183 184 185 186

-- | Adjust the weight between the blocks using the given function.
--   If there is no such edge returns the original map.
adjustEdgeWeight :: CFG -> (EdgeWeight -> EdgeWeight)
                 -> BlockId -> BlockId -> CFG
adjustEdgeWeight cfg f from to
  | Just info <- getEdgeInfo from to cfg
187 188 189 190 191 192 193 194 195 196 197 198
  , !weight <- edgeWeight info
  , !newWeight <- f weight
  = addEdge from to (info { edgeWeight = newWeight}) cfg
  | otherwise = cfg

-- | Set the weight between the blocks to the given weight.
--   If there is no such edge returns the original map.
setEdgeWeight :: CFG -> EdgeWeight
              -> BlockId -> BlockId -> CFG
setEdgeWeight cfg !weight from to
  | Just info <- getEdgeInfo from to cfg
  = addEdge from to (info { edgeWeight = weight}) cfg
199 200
  | otherwise = cfg

201 202


203
getCfgNodes :: CFG -> LabelSet
204 205
getCfgNodes m =
    mapFoldlWithKey (\s k toMap -> mapFoldlWithKey (\s k _ -> setInsert k s) (setInsert k s) toMap ) setEmpty m
206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283

hasNode :: CFG -> BlockId -> Bool
hasNode m node = mapMember node m || any (mapMember node) m

-- | Check if the nodes in the cfg and the set of blocks are the same.
--   In a case of a missmatch we panic and show the difference.
sanityCheckCfg :: CFG -> LabelSet -> SDoc -> Bool
sanityCheckCfg m blockSet msg
    | blockSet == cfgNodes
    = True
    | otherwise =
        pprPanic "Block list and cfg nodes don't match" (
            text "difference:" <+> ppr diff $$
            text "blocks:" <+> ppr blockSet $$
            text "cfg:" <+> ppr m $$
            msg )
            False
    where
      cfgNodes = getCfgNodes m :: LabelSet
      diff = (setUnion cfgNodes blockSet) `setDifference` (setIntersection cfgNodes blockSet) :: LabelSet

-- | Filter the CFG with a custom function f.
--   Paramaeters are `f from to edgeInfo`
filterEdges :: (BlockId -> BlockId -> EdgeInfo -> Bool) -> CFG -> CFG
filterEdges f cfg =
    mapMapWithKey filterSources cfg
    where
      filterSources from m =
        mapFilterWithKey (\to w -> f from to w) m


{- Note [Updating the CFG during shortcutting]

See Note [What is shortcutting] in the control flow optimization
code (CmmContFlowOpt.hs) for a slightly more in depth explanation on shortcutting.

In the native backend we shortcut jumps at the assembly level. (AsmCodeGen.hs)
This means we remove blocks containing only one jump from the code
and instead redirecting all jumps targeting this block to the deleted
blocks jump target.

However we want to have an accurate representation of control
flow in the CFG. So we add/remove edges accordingly to account
for the eliminated blocks and new edges.

If we shortcut A -> B -> C to A -> C:
* We delete edges A -> B and B -> C
* Replacing them with the edge A -> C

We also try to preserve jump weights while doing so.

Note that:
* The edge B -> C can't have interesting weights since
  the block B consists of a single unconditional jump without branching.
* We delete the edge A -> B and add the edge A -> C.
* The edge A -> B can be one of many edges originating from A so likely
  has edge weights we want to preserve.

For this reason we simply store the edge info from the original A -> B
edge and apply this information to the new edge A -> C.

Sometimes we have a scenario where jump target C is not represented by an
BlockId but an immediate value. I'm only aware of this happening without
tables next to code currently.

Then we go from A ---> B - -> IMM   to   A - -> IMM where the dashed arrows
are not stored in the CFG.

In that case we simply delete the edge A -> B.

In terms of implementation the native backend first builds a mapping
from blocks suitable for shortcutting to their jump targets.
Then it redirects all jump instructions to these blocks using the
built up mapping.
This function (shortcutWeightMap) takes the same mapping and
applies the mapping to the CFG in the way layed out above.

-}
284 285
shortcutWeightMap :: LabelMap (Maybe BlockId) -> CFG -> CFG
shortcutWeightMap cuts cfg =
286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353
  foldl' applyMapping cfg $ mapToList cuts
    where
-- takes the tuple (B,C) from the notation in [Updating the CFG during shortcutting]
      applyMapping :: CFG -> (BlockId,Maybe BlockId) -> CFG
      --Shortcut immediate
      applyMapping m (from, Nothing) =
        mapDelete from .
        fmap (mapDelete from) $ m
      --Regular shortcut
      applyMapping m (from, Just to) =
        let updatedMap :: CFG
            updatedMap
              = fmap (shortcutEdge (from,to)) $
                (mapDelete from m :: CFG )
        --Sometimes we can shortcut multiple blocks like so:
        -- A -> B -> C -> D -> E => A -> E
        -- so we check for such chains.
        in case mapLookup to cuts of
            Nothing -> updatedMap
            Just dest -> applyMapping updatedMap (to, dest)
      --Redirect edge from B to C
      shortcutEdge :: (BlockId, BlockId) -> LabelMap EdgeInfo -> LabelMap EdgeInfo
      shortcutEdge (from, to) m =
        case mapLookup from m of
          Just info -> mapInsert to info $ mapDelete from m
          Nothing   -> m

-- | Sometimes we insert a block which should unconditionally be executed
--   after a given block. This function updates the CFG for these cases.
--  So we get A -> B    => A -> A' -> B
--             \                  \
--              -> C    =>         -> C
--
addImmediateSuccessor :: BlockId -> BlockId -> CFG -> CFG
addImmediateSuccessor node follower cfg
    = updateEdges . addWeightEdge node follower uncondWeight $ cfg
    where
        uncondWeight = fromIntegral . D.uncondWeight .
                       D.cfgWeightInfo $ D.unsafeGlobalDynFlags
        targets = getSuccessorEdges cfg node
        successors = map fst targets :: [BlockId]
        updateEdges = addNewSuccs . remOldSuccs
        remOldSuccs m = foldl' (flip (delEdge node)) m successors
        addNewSuccs m =
          foldl' (\m' (t,info) -> addEdge follower t info m') m targets

-- | Adds a new edge, overwrites existing edges if present
addEdge :: BlockId -> BlockId -> EdgeInfo -> CFG -> CFG
addEdge from to info cfg =
    mapAlter addDest from cfg
    where
        addDest Nothing = Just $ mapSingleton to info
        addDest (Just wm) = Just $ mapInsert to info wm

-- | Adds a edge with the given weight to the cfg
--   If there already existed an edge it is overwritten.
--   `addWeightEdge from to weight cfg`
addWeightEdge :: BlockId -> BlockId -> EdgeWeight -> CFG -> CFG
addWeightEdge from to weight cfg =
    addEdge from to (mkWeightInfo weight) cfg

delEdge :: BlockId -> BlockId -> CFG -> CFG
delEdge from to m =
    mapAlter remDest from m
    where
        remDest Nothing = Nothing
        remDest (Just wm) = Just $ mapDelete to wm

354 355 356 357 358
delNode :: BlockId -> CFG -> CFG
delNode node cfg =
  fmap (mapDelete node)  -- < Edges to the node
    (mapDelete node cfg) -- < Edges from the node

359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379
-- | Destinations from bid ordered by weight (descending)
getSuccEdgesSorted :: CFG -> BlockId -> [(BlockId,EdgeInfo)]
getSuccEdgesSorted m bid =
    let destMap = mapFindWithDefault mapEmpty bid m
        cfgEdges = mapToList destMap
        sortedEdges = sortWith (negate . edgeWeight . snd) cfgEdges
    in  --pprTrace "getSuccEdgesSorted" (ppr bid <+> text "map:" <+> ppr m)
        sortedEdges

-- | Get successors of a given node with edge weights.
getSuccessorEdges :: CFG -> BlockId -> [(BlockId,EdgeInfo)]
getSuccessorEdges m bid = maybe [] mapToList $ mapLookup bid m

getEdgeInfo :: BlockId -> BlockId -> CFG -> Maybe EdgeInfo
getEdgeInfo from to m
    | Just wm <- mapLookup from m
    , Just info <- mapLookup to wm
    = Just $! info
    | otherwise
    = Nothing

380 381 382 383 384 385 386 387 388
getEdgeWeight :: CFG -> BlockId -> BlockId -> EdgeWeight
getEdgeWeight cfg from to =
    edgeWeight $ expectJust "Edgeweight for noexisting block" $
                 getEdgeInfo from to cfg

getTransitionSource :: BlockId -> BlockId -> CFG -> TransitionSource
getTransitionSource from to cfg = transitionSource $ expectJust "Source info for noexisting block" $
                        getEdgeInfo from to cfg

389
reverseEdges :: CFG -> CFG
390
reverseEdges cfg = mapFoldlWithKey (\cfg from toMap -> go (addNode cfg from) from toMap) mapEmpty cfg
391
  where
392 393 394 395 396 397
    -- We preserve nodes without outgoing edges!
    addNode :: CFG -> BlockId -> CFG
    addNode cfg b = mapInsertWith mapUnion b mapEmpty cfg
    go :: CFG -> BlockId -> (LabelMap EdgeInfo) -> CFG
    go cfg from toMap = mapFoldlWithKey (\cfg to info -> addEdge to from info cfg) cfg toMap  :: CFG

398 399 400 401

-- | Returns a unordered list of all edges with info
infoEdgeList :: CFG -> [CfgEdge]
infoEdgeList m =
402 403 404 405 406 407 408 409 410 411 412
    go (mapToList m) []
  where
    -- We avoid foldMap to avoid thunk buildup
    go :: [(BlockId,LabelMap EdgeInfo)] -> [CfgEdge] -> [CfgEdge]
    go [] acc = acc
    go ((from,toMap):xs) acc
      = go' xs from (mapToList toMap) acc
    go' :: [(BlockId,LabelMap EdgeInfo)] -> BlockId -> [(BlockId,EdgeInfo)] -> [CfgEdge] -> [CfgEdge]
    go' froms _    []              acc = go froms acc
    go' froms from ((to,info):tos) acc
      = go' froms from tos (CfgEdge from to info : acc)
413 414 415 416

-- | Returns a unordered list of all edges without weights
edgeList :: CFG -> [Edge]
edgeList m =
417 418 419 420 421 422 423 424 425 426 427
    go (mapToList m) []
  where
    -- We avoid foldMap to avoid thunk buildup
    go :: [(BlockId,LabelMap EdgeInfo)] -> [Edge] -> [Edge]
    go [] acc = acc
    go ((from,toMap):xs) acc
      = go' xs from (mapKeys toMap) acc
    go' :: [(BlockId,LabelMap EdgeInfo)] -> BlockId -> [BlockId] -> [Edge] -> [Edge]
    go' froms _    []              acc = go froms acc
    go' froms from (to:tos) acc
      = go' froms from tos ((from,to) : acc)
428 429 430 431 432 433 434 435 436 437

-- | Get successors of a given node without edge weights.
getSuccessors :: CFG -> BlockId -> [BlockId]
getSuccessors m bid
    | Just wm <- mapLookup bid m
    = mapKeys wm
    | otherwise = []

pprEdgeWeights :: CFG -> SDoc
pprEdgeWeights m =
438 439
    let edges = sort $ infoEdgeList m :: [CfgEdge]
        printEdge (CfgEdge from to (EdgeInfo { edgeWeight = weight }))
440 441 442 443 444
            = text "\t" <> ppr from <+> text "->" <+> ppr to <>
              text "[label=\"" <> ppr weight <> text "\",weight=\"" <>
              ppr weight <> text "\"];\n"
        --for the case that there are no edges from/to this node.
        --This should rarely happen but it can save a lot of time
Gabor Greif's avatar
Gabor Greif committed
445
        --to immediately see it when it does.
446 447
        printNode node
            = text "\t" <> ppr node <> text ";\n"
448
        getEdgeNodes (CfgEdge from to _) = [from,to]
449 450 451 452 453 454 455 456 457 458 459 460
        edgeNodes = setFromList $ concatMap getEdgeNodes edges :: LabelSet
        nodes = filter (\n -> (not . setMember n) edgeNodes) . mapKeys $ mapFilter null m
    in
    text "digraph {\n" <>
        (foldl' (<>) empty (map printEdge edges)) <>
        (foldl' (<>) empty (map printNode nodes)) <>
    text "}\n"

{-# INLINE updateEdgeWeight #-} --Allows eliminating the tuple when possible
updateEdgeWeight :: (EdgeWeight -> EdgeWeight) -> Edge -> CFG -> CFG
updateEdgeWeight f (from, to) cfg
    | Just oldInfo <- getEdgeInfo from to cfg
461 462
    = let !oldWeight = edgeWeight oldInfo
          !newWeight = f oldWeight
463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504
      in addEdge from to (oldInfo {edgeWeight = newWeight}) cfg
    | otherwise
    = panic "Trying to update invalid edge"

-- from to oldWeight => newWeight
mapWeights :: (BlockId -> BlockId -> EdgeWeight -> EdgeWeight) -> CFG -> CFG
mapWeights f cfg =
  foldl' (\cfg (CfgEdge from to info) ->
            let oldWeight = edgeWeight info
                newWeight = f from to oldWeight
            in addEdge from to (info {edgeWeight = newWeight}) cfg)
          cfg (infoEdgeList cfg)


-- | Insert a block in the control flow between two other blocks.
-- We pass a list of tuples (A,B,C) where
-- * A -> C: Old edge
-- * A -> B -> C : New Arc, where B is the new block.
-- It's possible that a block has two jumps to the same block
-- in the assembly code. However we still only store a single edge for
-- these cases.
-- We assign the old edge info to the edge A -> B and assign B -> C the
-- weight of an unconditional jump.
addNodesBetween :: CFG -> [(BlockId,BlockId,BlockId)] -> CFG
addNodesBetween m updates =
  foldl'  updateWeight m .
          weightUpdates $ updates
    where
      weight = fromIntegral . D.uncondWeight .
                D.cfgWeightInfo $ D.unsafeGlobalDynFlags
      -- We might add two blocks for different jumps along a single
      -- edge. So we end up with edges:   A -> B -> C   ,   A -> D -> C
      -- in this case after applying the first update the weight for A -> C
      -- is no longer available. So we calculate future weights before updates.
      weightUpdates = map getWeight
      getWeight :: (BlockId,BlockId,BlockId) -> (BlockId,BlockId,BlockId,EdgeInfo)
      getWeight (from,between,old)
        | Just edgeInfo <- getEdgeInfo from old m
        = (from,between,old,edgeInfo)
        | otherwise
        = pprPanic "Can't find weight for edge that should have one" (
            text "triple" <+> ppr (from,between,old) $$
505 506
            text "updates" <+> ppr updates $$
            text "cfg:" <+> ppr m )
507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530
      updateWeight :: CFG -> (BlockId,BlockId,BlockId,EdgeInfo) -> CFG
      updateWeight m (from,between,old,edgeInfo)
        = addEdge from between edgeInfo .
          addWeightEdge between old weight .
          delEdge from old $ m

{-
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ~~~       Note [CFG Edge Weights]    ~~~
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  Edge weights assigned do not currently represent a specific
  cost model and rather just a ranking of which blocks should
  be placed next to each other given their connection type in
  the CFG.
  This is especially relevant if we whenever two blocks will
  jump to the same target.

                     A   B
                      \ /
                       C

  Should A or B be placed in front of C? The block layout algorithm
  decides this based on which edge (A,C)/(B,C) is heavier. So we
531
  make a educated guess on which branch should be preferred.
532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562

  We rank edges in this order:
  * Unconditional Control Transfer - They will always
    transfer control to their target. Unless there is a info table
    we can turn the jump into a fallthrough as well.
    We use 20k as default, so it's easy to spot if values have been
    modified but unlikely that we run into issues with overflow.
  * If branches (likely) - We assume branches marked as likely
    are taken more than 80% of the time.
    By ranking them below unconditional jumps we make sure we
    prefer the unconditional if there is a conditional and
    unconditional edge towards a block.
  * If branches (regular) - The false branch can potentially be turned
    into a fallthrough so we prefer it slightly over the true branch.
  * Unlikely branches - These can be assumed to be taken less than 20%
    of the time. So we given them one of the lowest priorities.
  * Switches - Switches at this level are implemented as jump tables
    so have a larger number of successors. So without more information
    we can only say that each individual successor is unlikely to be
    jumped to and we rank them accordingly.
  * Calls - We currently ignore calls completly:
        * By the time we return from a call there is a good chance
          that the address we return to has already been evicted from
          cache eliminating a main advantage sequential placement brings.
        * Calls always require a info table in front of their return
          address. This reduces the chance that we return to the same
          cache line further.

-}
-- | Generate weights for a Cmm proc based on some simple heuristics.
getCfgProc :: D.CfgWeights -> RawCmmDecl -> CFG
563
getCfgProc _       (CmmData {}) = mapEmpty
Ben Gamari's avatar
Ben Gamari committed
564
getCfgProc weights (CmmProc _info _lab _live graph) = getCfg weights graph
565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594

getCfg :: D.CfgWeights -> CmmGraph -> CFG
getCfg weights graph =
  foldl' insertEdge edgelessCfg $ concatMap getBlockEdges blocks
  where
    D.CFGWeights
            { D.uncondWeight = uncondWeight
            , D.condBranchWeight = condBranchWeight
            , D.switchWeight = switchWeight
            , D.callWeight = callWeight
            , D.likelyCondWeight = likelyCondWeight
            , D.unlikelyCondWeight = unlikelyCondWeight
            --  Last two are used in other places
            --, D.infoTablePenalty = infoTablePenalty
            --, D.backEdgeBonus = backEdgeBonus
            } = weights
    -- Explicitly add all nodes to the cfg to ensure they are part of the
    -- CFG.
    edgelessCfg = mapFromList $ zip (map G.entryLabel blocks) (repeat mapEmpty)
    insertEdge :: CFG -> ((BlockId,BlockId),EdgeInfo) -> CFG
    insertEdge m ((from,to),weight) =
      mapAlter f from m
        where
          f :: Maybe (LabelMap EdgeInfo) -> Maybe (LabelMap EdgeInfo)
          f Nothing = Just $ mapSingleton to weight
          f (Just destMap) = Just $ mapInsert to weight destMap
    getBlockEdges :: CmmBlock -> [((BlockId,BlockId),EdgeInfo)]
    getBlockEdges block =
      case branch of
        CmmBranch dest -> [mkEdge dest uncondWeight]
595
        CmmCondBranch cond t f l
596 597 598 599 600 601
          | l == Nothing ->
              [mkEdge f condBranchWeight,   mkEdge t condBranchWeight]
          | l == Just True ->
              [mkEdge f unlikelyCondWeight, mkEdge t likelyCondWeight]
          | l == Just False ->
              [mkEdge f likelyCondWeight,   mkEdge t unlikelyCondWeight]
602 603 604 605 606 607 608 609 610 611 612
          where
            mkEdgeInfo = -- pprTrace "Info" (ppr branchInfo <+> ppr cond)
                         EdgeInfo (CmmSource branch branchInfo) . fromIntegral
            mkEdge target weight = ((bid,target), mkEdgeInfo weight)
            branchInfo =
              foldRegsUsed
                (panic "foldRegsDynFlags")
                (\info r -> if r == SpLim || r == HpLim || r == BaseReg
                    then HeapStackCheck else info)
                NoInfo cond

613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629
        (CmmSwitch _e ids) ->
          let switchTargets = switchTargetsToList ids
              --Compiler performance hack - for very wide switches don't
              --consider targets for layout.
              adjustedWeight =
                if (length switchTargets > 10) then -1 else switchWeight
          in map (\x -> mkEdge x adjustedWeight) switchTargets
        (CmmCall { cml_cont = Just cont})  -> [mkEdge cont callWeight]
        (CmmForeignCall {Cmm.succ = cont}) -> [mkEdge cont callWeight]
        (CmmCall { cml_cont = Nothing })   -> []
        other ->
            panic "Foo" $
            ASSERT2(False, ppr "Unkown successor cause:" <>
              (ppr branch <+> text "=>" <> ppr (G.successors other)))
            map (\x -> ((bid,x),mkEdgeInfo 0)) $ G.successors other
      where
        bid = G.entryLabel block
630
        mkEdgeInfo = EdgeInfo (CmmSource branch NoInfo) . fromIntegral
631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651
        mkEdge target weight = ((bid,target), mkEdgeInfo weight)
        branch = lastNode block :: CmmNode O C

    blocks = revPostorder graph :: [CmmBlock]

--Find back edges by BFS
findBackEdges :: BlockId -> CFG -> Edges
findBackEdges root cfg =
    --pprTraceIt "Backedges:" $
    map fst .
    filter (\x -> snd x == Backward) $ typedEdges
  where
    edges = edgeList cfg :: [(BlockId,BlockId)]
    getSuccs = getSuccessors cfg :: BlockId -> [BlockId]
    typedEdges =
      classifyEdges root getSuccs edges :: [((BlockId,BlockId),EdgeType)]


optimizeCFG :: D.CfgWeights -> RawCmmDecl -> CFG -> CFG
optimizeCFG _ (CmmData {}) cfg = cfg
optimizeCFG weights (CmmProc info _lab _live graph) cfg =
652 653 654 655 656
    {-# SCC optimizeCFG #-}
    -- pprTrace "Initial:" (pprEdgeWeights cfg) $
    -- pprTrace "Initial:" (ppr $ mkGlobalWeights (g_entry graph) cfg) $

    -- pprTrace "LoopInfo:" (ppr $ loopInfo cfg (g_entry graph)) $
657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686
    favourFewerPreds  .
    penalizeInfoTables info .
    increaseBackEdgeWeight (g_entry graph) $ cfg
  where

    -- | Increase the weight of all backedges in the CFG
    -- this helps to make loop jumpbacks the heaviest edges
    increaseBackEdgeWeight :: BlockId -> CFG -> CFG
    increaseBackEdgeWeight root cfg =
        let backedges = findBackEdges root cfg
            update weight
              --Keep irrelevant edges irrelevant
              | weight <= 0 = 0
              | otherwise
              = weight + fromIntegral (D.backEdgeBonus weights)
        in  foldl'  (\cfg edge -> updateEdgeWeight update edge cfg)
                    cfg backedges

    -- | Since we cant fall through info tables we penalize these.
    penalizeInfoTables :: LabelMap a -> CFG -> CFG
    penalizeInfoTables info cfg =
        mapWeights fupdate cfg
      where
        fupdate :: BlockId -> BlockId -> EdgeWeight -> EdgeWeight
        fupdate _ to weight
          | mapMember to info
          = weight - (fromIntegral $ D.infoTablePenalty weights)
          | otherwise = weight

    -- | If a block has two successors, favour the one with fewer
687
    -- predecessors and/or the one allowing fall through.
688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703
    favourFewerPreds :: CFG -> CFG
    favourFewerPreds cfg =
        let
            revCfg =
              reverseEdges $ filterEdges
                              (\_from -> fallthroughTarget)  cfg

            predCount n = length $ getSuccessorEdges revCfg n
            nodes = getCfgNodes cfg

            modifiers :: Int -> Int -> (EdgeWeight, EdgeWeight)
            modifiers preds1 preds2
              | preds1 <  preds2 = ( 1,-1)
              | preds1 == preds2 = ( 0, 0)
              | otherwise        = (-1, 1)

704
            update :: CFG -> BlockId -> CFG
705 706
            update cfg node
              | [(s1,e1),(s2,e2)] <- getSuccessorEdges cfg node
707 708
              , !w1 <- edgeWeight e1
              , !w2 <- edgeWeight e2
709 710 711 712 713
              --Only change the weights if there isn't already a ordering.
              , w1 == w2
              , (mod1,mod2) <- modifiers (predCount s1) (predCount s2)
              = (\cfg' ->
                  (adjustEdgeWeight cfg' (+mod2) node s2))
714
                    (adjustEdgeWeight cfg  (+mod1) node s1)
715 716 717 718 719 720 721 722
              | otherwise
              = cfg
        in setFoldl update cfg nodes
      where
        fallthroughTarget :: BlockId -> EdgeInfo -> Bool
        fallthroughTarget to (EdgeInfo source _weight)
          | mapMember to info = False
          | AsmCodeGen <- source = True
723 724
          | CmmSource { trans_cmmNode = CmmBranch {} } <- source = True
          | CmmSource { trans_cmmNode = CmmCondBranch {} } <- source = True
725
          | otherwise = False
726 727

-- | Determine loop membership of blocks based on SCC analysis
728
--   This is faster but only gives yes/no answers.
729 730 731 732 733 734 735 736 737 738 739 740 741
loopMembers :: CFG -> LabelMap Bool
loopMembers cfg =
    foldl' (flip setLevel) mapEmpty sccs
  where
    mkNode :: BlockId -> Node BlockId BlockId
    mkNode bid = DigraphNode bid bid (getSuccessors cfg bid)
    nodes = map mkNode (setElems $ getCfgNodes cfg)

    sccs = stronglyConnCompFromEdgedVerticesOrd nodes

    setLevel :: SCC BlockId -> LabelMap Bool -> LabelMap Bool
    setLevel (AcyclicSCC bid) m = mapInsert bid False m
    setLevel (CyclicSCC bids) m = foldl' (\m k -> mapInsert k True m) m bids
742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272

loopLevels :: CFG -> BlockId -> LabelMap Int
loopLevels cfg root = liLevels $ loopInfo cfg root

data LoopInfo = LoopInfo
  { liBackEdges :: [(Edge)] -- ^ List of back edges
  , liLevels :: LabelMap Int -- ^ BlockId -> LoopLevel mapping
  , liLoops :: [(Edge, LabelSet)] -- ^ (backEdge, loopBody), body includes header
  }

instance Outputable LoopInfo where
    ppr (LoopInfo _ _lvls loops) =
        text "Loops:(backEdge, bodyNodes)" $$
            (vcat $ map ppr loops)

-- | Determine loop membership of blocks based on Dominator analysis.
--   This is slower but gives loop levels instead of just loop membership.
--   However it only detects natural loops. Irreducible control flow is not
--   recognized even if it loops. But that is rare enough that we don't have
--   to care about that special case.
loopInfo :: CFG -> BlockId -> LoopInfo
loopInfo cfg root = LoopInfo  { liBackEdges = backEdges
                              , liLevels = mapFromList loopCounts
                              , liLoops = loopBodies }
  where
    revCfg = reverseEdges cfg
    graph = fmap (setFromList . mapKeys ) cfg :: LabelMap LabelSet

    --TODO - This should be a no op: Export constructors? Use unsafeCoerce? ...
    rooted = ( fromBlockId root
              , toIntMap $ fmap toIntSet graph) :: (Int, IntMap IntSet)
    -- rooted = unsafeCoerce (root, graph)
    tree = fmap toBlockId $ Dom.domTree rooted :: Tree BlockId

    -- Map from Nodes to their dominators
    domMap :: LabelMap LabelSet
    domMap = mkDomMap tree

    edges = edgeList cfg :: [(BlockId, BlockId)]
    -- We can't recompute this from the edges, there might be blocks not connected via edges.
    nodes = getCfgNodes cfg :: LabelSet

    -- identify back edges
    isBackEdge (from,to)
      | Just doms <- mapLookup from domMap
      , setMember to doms
      = True
      | otherwise = False

    -- determine the loop body for a back edge
    findBody edge@(tail, head)
      = ( edge, setInsert head $ go (setSingleton tail) (setSingleton tail) )
      where
        -- The reversed cfg makes it easier to look up predecessors
        cfg' = delNode head revCfg
        go :: LabelSet -> LabelSet -> LabelSet
        go found current
          | setNull current = found
          | otherwise = go  (setUnion newSuccessors found)
                            newSuccessors
          where
            newSuccessors = setFilter (\n -> not $ setMember n found) successors :: LabelSet
            successors = setFromList $ concatMap
                                      (getSuccessors cfg')
                                      (setElems current) :: LabelSet

    backEdges = filter isBackEdge edges
    loopBodies = map findBody backEdges :: [(Edge, LabelSet)]

    -- Block b is part of n loop bodies => loop nest level of n
    loopCounts =
      let bodies = map (first snd) loopBodies -- [(Header, Body)]
          loopCount n = length $ nub . map fst . filter (setMember n . snd) $ bodies
      in  map (\n -> (n, loopCount n)) $ setElems nodes :: [(BlockId, Int)]

    toIntSet :: LabelSet -> IntSet
    toIntSet s = IS.fromList . map fromBlockId . setElems $ s
    toIntMap :: LabelMap a -> IntMap a
    toIntMap m = IM.fromList $ map (\(x,y) -> (fromBlockId x,y)) $ mapToList m

    mkDomMap :: Tree BlockId -> LabelMap LabelSet
    mkDomMap root = mapFromList $ go setEmpty root
      where
        go :: LabelSet -> Tree BlockId -> [(Label,LabelSet)]
        go parents (Node lbl [])
          =  [(lbl, parents)]
        go parents (Node _ leaves)
          = let nodes = map rootLabel leaves
                entries = map (\x -> (x,parents)) nodes
            in  entries ++ concatMap
                            (\n -> go (setInsert (rootLabel n) parents) n)
                            leaves

    fromBlockId :: BlockId -> Int
    fromBlockId = getKey . getUnique

    toBlockId :: Int -> BlockId
    toBlockId = mkBlockId . mkUniqueGrimily

-- We make the CFG a Hoopl Graph, so we can reuse revPostOrder.
newtype BlockNode (e :: Extensibility) (x :: Extensibility) = BN (BlockId,[BlockId])

instance G.NonLocal (BlockNode) where
  entryLabel (BN (lbl,_))   = lbl
  successors (BN (_,succs)) = succs

revPostorderFrom :: CFG -> BlockId -> [BlockId]
revPostorderFrom cfg root =
    map fromNode $ G.revPostorderFrom hooplGraph root
  where
    nodes = getCfgNodes cfg
    hooplGraph = setFoldl (\m n -> mapInsert n (toNode n) m) mapEmpty nodes

    fromNode :: BlockNode C C -> BlockId
    fromNode (BN x) = fst x

    toNode :: BlockId -> BlockNode C C
    toNode bid =
        BN (bid,getSuccessors cfg $ bid)


-- | We take in a CFG which has on its edges weights which are
--   relative only to other edges originating from the same node.
--
--   We return a CFG for which each edge represents a GLOBAL weight.
--   This means edge weights are comparable across the whole graph.
--
--   For irreducible control flow results might be imprecise, otherwise they
--   are reliable.
--
--   The algorithm is based on the Paper
--   "Static Branch Prediction and Program Profile Analysis" by Y Wu, JR Larus
--   The only big change is that we go over the nodes in the body of loops in
--   reverse post order. Which is required for diamond control flow to work probably.
--
--   We also apply a few prediction heuristics (based on the same paper)

{-# SCC mkGlobalWeights #-}
mkGlobalWeights :: BlockId -> CFG -> (LabelMap Double, LabelMap (LabelMap Double))
mkGlobalWeights root localCfg
  | null localCfg = panic "Error - Empty CFG"
  | otherwise
  = --pprTrace "revOrder" (ppr revOrder) $
    -- undefined --propagate (mapSingleton root 1) (revOrder)
    (blockFreqs', edgeFreqs')
  where
    -- Calculate fixpoints
    (blockFreqs, edgeFreqs) = calcFreqs nodeProbs backEdges' bodies' revOrder'
    blockFreqs' = mapFromList $ map (first fromVertex) (assocs blockFreqs) :: LabelMap Double
    edgeFreqs' = fmap fromVertexMap $ fromVertexMap edgeFreqs

    fromVertexMap :: IM.IntMap x -> LabelMap x
    fromVertexMap m = mapFromList . map (first fromVertex) $ IM.toList m

    revOrder = revPostorderFrom localCfg root :: [BlockId]
    loopinfo@(LoopInfo backedges _levels bodies) = loopInfo localCfg root

    revOrder' = map toVertex revOrder
    backEdges' = map (bimap toVertex toVertex) backedges
    bodies' = map calcBody bodies

    estimatedCfg = staticBranchPrediction root loopinfo localCfg
    -- Normalize the weights to probabilities and apply heuristics
    nodeProbs = cfgEdgeProbabilities estimatedCfg toVertex

    -- By mapping vertices to numbers in reverse post order we can bring any subset into reverse post
    -- order simply by sorting.
    -- TODO: The sort is redundant if we can guarantee that setElems returns elements ascending
    calcBody (backedge, blocks) =
        (toVertex $ snd backedge, sort . map toVertex $ (setElems blocks))

    vertexMapping = mapFromList $ zip revOrder [0..] :: LabelMap Int
    blockMapping = listArray (0,mapSize vertexMapping - 1) revOrder :: Array Int BlockId
    -- Map from blockId to indicies starting at zero
    toVertex :: BlockId -> Int
    toVertex   blockId  = expectJust "mkGlobalWeights" $ mapLookup blockId vertexMapping
    -- Map from indicies starting at zero to blockIds
    fromVertex :: Int -> BlockId
    fromVertex vertex   = blockMapping ! vertex

{- Note [Static Branch Prediction]
   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The work here has been based on the paper
"Static Branch Prediction and Program Profile Analysis" by Y Wu, JR Larus.

The primary differences are that if we branch on the result of a heap
check we do not apply any of the heuristics.
The reason is simple: They look like loops in the control flow graph
but are usually never entered, and if at most once.

Currently implemented is a heuristic to predict that we do not exit
loops (lehPredicts) and one to predict that backedges are more likely
than any other edge.

The back edge case is special as it superceeds any other heuristic if it
applies.

Do NOT rely solely on nofib results for benchmarking this. I recommend at least
comparing megaparsec and container benchmarks. Nofib does not seeem to have
many instances of "loopy" Cmm where these make a difference.

TODO:
* The paper containers more benchmarks which should be implemented.
* If we turn the likelyhood on if/else branches into a probability
  instead of true/false we could implement this as a Cmm pass.
  + The complete Cmm code still exists and can be accessed by the heuristics
  + There is no chance of register allocation/codegen inserting branches/blocks
  + making the TransitionSource info wrong.
  + potential to use this information in CmmPasses.
  - Requires refactoring of all the code relying on the binary nature of likelyhood.
  - Requires refactoring `loopInfo` to work on both, Cmm Graphs and the backend CFG.
-}

-- | Combination of target node id and information about the branch
--   we are looking at.
type TargetNodeInfo = (BlockId, EdgeInfo)


-- | Update branch weights based on certain heuristics.
-- See Note [Static Branch Prediction]
-- TODO: This should be combined with optimizeCFG
{-# SCC staticBranchPrediction #-}
staticBranchPrediction :: BlockId -> LoopInfo -> CFG -> CFG
staticBranchPrediction _root (LoopInfo l_backEdges loopLevels l_loops) cfg =
    -- pprTrace "staticEstimatesOn" (ppr (cfg)) $
    setFoldl update cfg nodes
  where
    nodes = getCfgNodes cfg
    backedges = S.fromList $ l_backEdges
    -- Loops keyed by their back edge
    loops = M.fromList $ l_loops :: M.Map Edge LabelSet
    loopHeads = S.fromList $ map snd $ M.keys loops

    update :: CFG -> BlockId -> CFG
    update cfg node
        -- No successors, nothing to do.
        | null successors = cfg

        -- Mix of backedges and others:
        -- Always predict the backedges.
        | not (null m) && length m < length successors
        -- Heap/Stack checks "loop", but only once.
        -- So we simply exclude any case involving them.
        , not $ any (isHeapOrStackCheck  . transitionSource . snd) successors
        = let   loopChance = repeat $! pred_LBH / (fromIntegral $ length m)
                exitChance = repeat $! (1 - pred_LBH) / fromIntegral (length not_m)
                updates = zip (map fst m) loopChance ++ zip (map fst not_m) exitChance
        in  -- pprTrace "mix" (ppr (node,successors)) $
            foldl' (\cfg (to,weight) -> setEdgeWeight cfg weight node to) cfg updates

        -- For (regular) non-binary branches we keep the weights from the STG -> Cmm translation.
        | length successors /= 2
        = cfg

        -- Only backedges - no need to adjust
        | length m > 0
        = cfg

        -- A regular binary branch, we can plug addition predictors in here.
        | [(s1,s1_info),(s2,s2_info)] <- successors
        , not $ any (isHeapOrStackCheck  . transitionSource . snd) successors
        = -- Normalize weights to total of 1
            let !w1 = max (edgeWeight s1_info) (0)
                !w2 = max (edgeWeight s2_info) (0)
                -- Of both weights are <= 0 we set both to 0.5
                normalizeWeight w = if w1 + w2 == 0 then 0.5 else w/(w1+w2)
                !cfg'  = setEdgeWeight cfg  (normalizeWeight w1) node s1
                !cfg'' = setEdgeWeight cfg' (normalizeWeight w2) node s2

                -- Figure out which heuristics apply to these successors
                heuristics = map ($ ((s1,s1_info),(s2,s2_info)))
                            [lehPredicts, phPredicts, ohPredicts, ghPredicts, lhhPredicts, chPredicts
                            , shPredicts, rhPredicts]
                -- Apply result of a heuristic. Argument is the likelyhood
                -- predicted for s1.
                applyHeuristic :: CFG -> Maybe Prob -> CFG
                applyHeuristic cfg Nothing = cfg
                applyHeuristic cfg (Just (s1_pred :: Double))
                  | s1_old == 0 || s2_old == 0 ||
                    isHeapOrStackCheck (transitionSource s1_info) ||
                    isHeapOrStackCheck (transitionSource s2_info)
                  = cfg
                  | otherwise =
                    let -- Predictions from heuristic
                        s1_prob = EdgeWeight s1_pred :: EdgeWeight
                        s2_prob = 1.0 - s1_prob
                        -- Update
                        d = (s1_old * s1_prob) + (s2_old * s2_prob) :: EdgeWeight
                        s1_prob' = s1_old * s1_prob / d
                        !s2_prob' = s2_old * s2_prob / d
                        !cfg_s1 = setEdgeWeight cfg    s1_prob' node s1
                    in  -- pprTrace "Applying heuristic!" (ppr (node,s1,s2) $$ ppr (s1_prob', s2_prob')) $
                        setEdgeWeight cfg_s1 s2_prob' node s2
                  where
                    -- Old weights
                    s1_old = getEdgeWeight cfg node s1
                    s2_old = getEdgeWeight cfg node s2

            in
            -- pprTraceIt "RegularCfgResult" $
            foldl' applyHeuristic cfg'' heuristics

        -- Branch on heap/stack check
        | otherwise = cfg

      where
        -- Chance that loops are taken.
        pred_LBH = 0.875
        -- successors
        successors = getSuccessorEdges cfg node
        -- backedges
        (m,not_m) = partition (\succ -> S.member (node, fst succ) backedges) successors

        -- Heuristics return nothing if they don't say anything about this branch
        -- or Just (prob_s1) where prob_s1 is the likelyhood for s1 to be the
        -- taken branch. s1 is the branch in the true case.

        -- Loop exit heuristic.
        -- We are unlikely to leave a loop unless it's to enter another one.
        pred_LEH = 0.75
        -- If and only if no successor is a loopheader,
        -- then we will likely not exit the current loop body.
        lehPredicts :: (TargetNodeInfo,TargetNodeInfo) -> Maybe Prob
        lehPredicts ((s1,_s1_info),(s2,_s2_info))
          | S.member s1 loopHeads || S.member s2 loopHeads
          = Nothing

          | otherwise
          = --pprTrace "lehPredict:" (ppr $ compare s1Level s2Level) $
            case compare s1Level s2Level of
                EQ -> Nothing
                LT -> Just (1-pred_LEH) --s1 exits to a shallower loop level (exits loop)
                GT -> Just (pred_LEH)   --s1 exits to a deeper loop level
            where
                s1Level = mapLookup s1 loopLevels
                s2Level = mapLookup s2 loopLevels

        -- Comparing to a constant is unlikely to be equal.
        ohPredicts (s1,_s2)
            | CmmSource { trans_cmmNode = src1 } <- getTransitionSource node (fst s1) cfg
            , CmmCondBranch cond ltrue _lfalse likely <- src1
            , likely == Nothing
            , CmmMachOp mop args <- cond
            , MO_Eq {} <- mop
            , not (null [x | x@CmmLit{} <- args])
            = if fst s1 == ltrue then Just 0.3 else Just 0.7

            | otherwise
            = Nothing

        -- TODO: These are all the other heuristics from the paper.
        -- Not all will apply, for now we just stub them out as Nothing.
        phPredicts = const Nothing
        ghPredicts = const Nothing
        lhhPredicts = const Nothing
        chPredicts = const Nothing
        shPredicts = const Nothing
        rhPredicts = const Nothing

-- We normalize all edge weights as probabilities between 0 and 1.
-- Ignoring rounding errors all outgoing edges sum up to 1.
cfgEdgeProbabilities :: CFG -> (BlockId -> Int) -> IM.IntMap (IM.IntMap Prob)
cfgEdgeProbabilities cfg toVertex
    = mapFoldlWithKey foldEdges IM.empty cfg
  where
    foldEdges = (\m from toMap -> IM.insert (toVertex from) (normalize toMap) m)

    normalize :: (LabelMap EdgeInfo) -> (IM.IntMap Prob)
    normalize weightMap
        | edgeCount <= 1 = mapFoldlWithKey (\m k _ -> IM.insert (toVertex k) 1.0 m) IM.empty weightMap
        | otherwise = mapFoldlWithKey (\m k _ -> IM.insert (toVertex k) (normalWeight k) m) IM.empty weightMap
      where
        edgeCount = mapSize weightMap
        -- Negative weights are generally allowed but are mapped to zero.
        -- We then check if there is at least one non-zero edge and if not
        -- assign uniform weights to all branches.
        minWeight = 0 :: Prob
        weightMap' = fmap (\w -> max (weightToDouble . edgeWeight $ w) minWeight) weightMap
        totalWeight = sum weightMap'

        normalWeight :: BlockId -> Prob
        normalWeight bid
         | totalWeight == 0
         = 1.0 / fromIntegral edgeCount
         | Just w <- mapLookup bid weightMap'
         = w/totalWeight
         | otherwise = panic "impossible"

-- This is the fixpoint algorithm from
--   "Static Branch Prediction and Program Profile Analysis" by Y Wu, JR Larus
-- The adaption to Haskell is my own.
calcFreqs :: IM.IntMap (IM.IntMap Prob) -> [(Int,Int)] -> [(Int, [Int])] -> [Int]
          -> (Array Int Double, IM.IntMap (IM.IntMap Prob))
calcFreqs graph backEdges loops revPostOrder = runST $ do
    visitedNodes <- newArray (0,nodeCount-1) False :: ST s (STUArray s Int Bool)
    blockFreqs <- newArray (0,nodeCount-1) 0.0 :: ST s (STUArray s Int Double)
    edgeProbs <- newSTRef graph
    edgeBackProbs <- newSTRef graph

    -- let traceArray a = do
    --       vs <- forM [0..nodeCount-1] $ \i -> readArray a i >>= (\v -> return (i,v))
          -- trace ("array: " ++ show vs) $ return ()

    let  -- See #1600, we need to inline or unboxing makes perf worse.
        -- {-# INLINE getFreq #-}
        {-# INLINE visited #-}
        visited b = unsafeRead visitedNodes b
        getFreq b = unsafeRead blockFreqs b
        -- setFreq :: forall s. Int -> Double -> ST s ()
        setFreq b f = unsafeWrite blockFreqs b f
        -- setVisited :: forall s. Node -> ST s ()
        setVisited b = unsafeWrite visitedNodes b True
        -- Frequency/probability that edge is taken.
        getProb' arr b1 b2 = readSTRef arr >>=
            (\graph ->
                return .
                        fromMaybe (error "getFreq 1") .
                        IM.lookup b2 .
                        fromMaybe (error "getFreq 2") $
                        (IM.lookup b1 graph)
            )
        setProb' arr b1 b2 prob = do
          g <- readSTRef arr
          let !m = fromMaybe (error "Foo") $ IM.lookup b1 g
              !m' = IM.insert b2 prob m
          writeSTRef arr $! (IM.insert b1 m' g)

        getEdgeFreq b1 b2 = getProb' edgeProbs b1 b2
        setEdgeFreq b1 b2 = setProb' edgeProbs b1 b2
        getProb b1 b2 = fromMaybe (error "getProb") $ do
            m' <- IM.lookup b1 graph
            IM.lookup b2 m'

        getBackProb b1 b2 = getProb' edgeBackProbs b1 b2
        setBackProb b1 b2 = setProb' edgeBackProbs b1 b2


    let -- calcOutFreqs :: Node -> ST s ()
        calcOutFreqs bhead block = do
          !f <- getFreq block
          forM (successors block) $ \bi -> do
            let !prob = getProb block bi
            let !succFreq = f * prob
            setEdgeFreq block bi succFreq
            -- traceM $ "SetOut: " ++ show (block, bi, f, prob, succFreq)
            when (bi == bhead) $ setBackProb block bi succFreq


    let propFreq block head = do
            -- traceM ("prop:" ++ show (block,head))
            -- traceShowM block

            !v <- visited block
            if v then
                return () --Dont look at nodes twice
            else if block == head then
                setFreq block 1.0 -- Loop header frequency is always 1
            else do
                let preds = IS.elems $ predecessors block
                irreducible <- (fmap or) $ forM preds $ \bp -> do
                    !bp_visited <- visited bp
                    let bp_backedge = isBackEdge bp block
                    return (not bp_visited && not bp_backedge)

                if irreducible
                then return () -- Rare we don't care
                else do
                    setFreq block 0
                    !cycleProb <- sum <$> (forM preds $ \pred -> do
                        if isBackEdge pred block
                            then
                                getBackProb pred block
                            else do
                                !f <- getFreq block
                                !prob <- getEdgeFreq pred block
                                setFreq block $! f + prob
                                return 0)
                    -- traceM $ "cycleProb:" ++ show cycleProb
                    let limit = 1 - 1/512 -- Paper uses 1 - epsilon, but this works.
                                          -- determines how large likelyhoods in loops can grow.
                    !cycleProb <- return $ min cycleProb limit -- <- return $ if cycleProb > limit then limit else cycleProb
                    -- traceM $ "cycleProb:" ++ show cycleProb

                    !f <- getFreq block
                    setFreq block (f / (1.0 - cycleProb))

            setVisited block
            calcOutFreqs head block

    -- Loops, by nesting, inner to outer
    forM_ loops $ \(head, body) -> do
        forM_ [0 .. nodeCount - 1] (\i -> unsafeWrite visitedNodes i True) -- Mark all nodes as visited.
        forM_ body (\i -> unsafeWrite visitedNodes i False) -- Mark all blocks reachable from head as not visited
        forM_ body $ \block -> propFreq block head

    -- After dealing with all loops, deal with non-looping parts of the CFG
    forM_ [0 .. nodeCount - 1] (\i -> unsafeWrite visitedNodes i False) -- Everything in revPostOrder is reachable
    forM_ revPostOrder $ \block -> propFreq block (head revPostOrder)

    -- trace ("Final freqs:") $ return ()
    -- let freqString = pprFreqs freqs
    -- trace (unlines freqString) $ return ()
    -- trace (pprFre) $ return ()
    graph' <- readSTRef edgeProbs
    freqs' <- unsafeFreeze  blockFreqs

    return (freqs', graph')
  where
    predecessors :: Int -> IS.IntSet
    predecessors b = fromMaybe IS.empty $ IM.lookup b revGraph
    successors b = fromMaybe (lookupError "succ" b graph)$ IM.keys <$> IM.lookup b graph
    lookupError s b g = pprPanic ("Lookup error " ++ s) $
                            ( text "node" <+> ppr b $$
                                text "graph" <+>
                                vcat (map (\(k,m) -> ppr (k,m :: IM.IntMap Double)) $ IM.toList g)
                            )

    nodeCount = IM.foldl' (\count toMap -> IM.foldlWithKey' countTargets count toMap) (IM.size graph) graph
      where
        countTargets = (\count k _ -> countNode k + count )
        countNode n = if IM.member n graph then 0 else 1

    isBackEdge from to = S.member (from,to) backEdgeSet
    backEdgeSet = S.fromList backEdges

    revGraph :: IntMap IntSet
    revGraph = IM.foldlWithKey' (\m from toMap -> addEdges m from toMap) IM.empty graph
        where
            addEdges m0 from toMap = IM.foldlWithKey' (\m k _ -> addEdge m from k) m0 toMap
            addEdge m0 from to = IM.insertWith IS.union to (IS.singleton from) m0