diff --git a/compiler/cmm/CmmContFlowOpt.hs b/compiler/cmm/CmmContFlowOpt.hs
index f39ab3c62bd261f8533bdc418c5b2655d9b6d5ad..7a008e2c0b64c3c3696ced1be568be809d520611 100644
--- a/compiler/cmm/CmmContFlowOpt.hs
+++ b/compiler/cmm/CmmContFlowOpt.hs
@@ -14,15 +14,127 @@ import BlockId
 import Cmm
 import CmmUtils
 import Maybes
+import Panic
 
 import Control.Monad
 import Prelude hiding (succ, unzip, zip)
 
------------------------------------------------------------------------------
+
+-- Note [What is shortcutting]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--
+-- Consider this Cmm code:
+--
+-- L1: ...
+--     goto L2;
+-- L2: goto L3;
+-- L3: ...
 --
--- Control-flow optimisations
+-- Here L2 is an empty block and contains only an unconditional branch
+-- to L3. In this situation any block that jumps to L2 can jump
+-- directly to L3:
 --
------------------------------------------------------------------------------
+-- L1: ...
+--     goto L3;
+-- L2: goto L3;
+-- L3: ...
+--
+-- In this situation we say that we shortcut L2 to L3. One of
+-- consequences of shortcutting is that some blocks of code may become
+-- unreachable (in the example above this is true for L2).
+
+
+-- Note [Control-flow optimisations]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--
+-- This optimisation does four things:
+--
+--   - If a block finishes in an unconditonal branch to another block
+--     and that is the only jump to that block we concatenate the
+--     destination block at the end of the current one.
+--
+--   - If a block finishes in an unconditional branch, we may be able
+--     to shortcut the destination block.
+--
+--   - If a block finishes in a call whose continuation block is a
+--     goto, then we can shortcut the destination, making the
+--     continuation block the destination of the goto - but see Note
+--     [Shortcut call returns].
+--
+--   - For block finishing in conditional branch we try to invert the
+--     condition and shortcut destination of alternatives.
+--
+--   - For any block that is not a call we try to shortcut the
+--     destination(s).
+--
+-- Blocks are processed using postorder DFS traversal. A side effect
+-- of determining traversal order with a graph search is elimination
+-- of any blocks that are unreachable.
+--
+-- Transformations are improved by working from the end of the graph
+-- towards the beginning, because we may be able to perform many
+-- shortcuts in one go.
+
+
+-- Note [Shortcut call returns]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--
+-- We are going to maintain the "current" graph (BlockEnv CmmBlock) as
+-- we go, and also a mapping from BlockId to BlockId, representing
+-- continuation labels that we have renamed.  This latter mapping is
+-- important because we might shortcut a CmmCall continuation.  For
+-- example:
+--
+--    Sp[0] = L
+--    call g returns to L
+--    L: goto M
+--    M: ...
+--
+-- So when we shortcut the L block, we need to replace not only
+-- the continuation of the call, but also references to L in the
+-- code (e.g. the assignment Sp[0] = L):
+--
+--    Sp[0] = M
+--    call g returns to M
+--    M: ...
+--
+-- So we keep track of which labels we have renamed and apply the mapping
+-- at the end with replaceLabels.
+
+
+-- Note [Shortcut call returns and proc-points]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--
+-- Consider this code that you might get from a recursive
+-- let-no-escape:
+--
+--       goto L1
+--      L1:
+--       if (Hp > HpLim) then L2 else L3
+--      L2:
+--       call stg_gc_noregs returns to L4
+--      L4:
+--       goto L1
+--      L3:
+--       ...
+--       goto L1
+--
+-- Then the control-flow optimiser shortcuts L4.  But that turns L1
+-- into the call-return proc point, and every iteration of the loop
+-- has to shuffle variables to and from the stack.  So we must *not*
+-- shortcut L4.
+--
+-- Moreover not shortcutting call returns is probably fine.  If L4 can
+-- concat with its branch target then it will still do so.  And we
+-- save some compile time because we don't have to traverse all the
+-- code in replaceLabels.
+--
+-- However, we probably do want to do this if we are splitting proc
+-- points, because L1 will be a proc-point anyway, so merging it with
+-- L4 reduces the number of proc points.  Unfortunately recursive
+-- let-no-escapes won't generate very good code with proc-point
+-- splitting on - we should probably compile them to explicitly use
+-- the native calling convention instead.
 
 cmmCfgOpts :: Bool -> CmmGraph -> CmmGraph
 cmmCfgOpts split g = fst (blockConcat split g)
@@ -43,65 +155,15 @@ cmmCfgOptsProc split (CmmProc info lbl live g) = CmmProc info' lbl live g'
                        else info{ cit_lbl = infoTblLbl k' })
              | otherwise
              = (k,info)
-
 cmmCfgOptsProc _ top = top
 
 
------------------------------------------------------------------------------
---
--- Block concatenation
---
------------------------------------------------------------------------------
-
--- This optimisation does three things:
---
---   - If a block finishes with an unconditional branch, then we may
---     be able to duplicate the block it points to and remove the
---     branch.  We do this if either
---        a) the destination block is small (e.g. just another branch), or
---        b) this is the only jump to this particular destination block.
---
---   - If a block finishes in a call whose continuation block is a
---     goto, then we can shortcut the destination, making the
---     destination of the goto into the continuation.  E.g.
---             call g returns to L    ==>     call g returns to M
---          L: goto M                      M: ...blah...
---          M: ...blah...
---     (but see Note [shortcut call returns])
---
---   - Remove any unreachable blocks from the graph.  This is a side
---     effect of starting with a postorder DFS traversal of the graph
-
--- Both transformations are improved by working from the end of the
--- graph towards the beginning, because we may be able to perform many
--- shortcuts in one go.
-
--- We need to walk over the blocks from the end back to the
--- beginning.  We are going to maintain the "current" graph
--- (BlockEnv CmmBlock) as we go, and also a mapping from BlockId
--- to BlockId, representing continuation labels that we have
--- renamed.  This latter mapping is important because we might
--- shortcut a CmmCall continuation.  For example:
---
---    Sp[0] = L
---    call g returns to L
---
---    L: goto M
---
---    M: ...
---
--- So when we shortcut the L block, we need to replace not only
--- the continuation of the call, but also references to L in the
--- code (e.g. the assignment Sp[0] = L).  So we keep track of
--- which labels we have renamed and apply the mapping at the end
--- with replaceLabels.
-
 blockConcat :: Bool -> CmmGraph -> (CmmGraph, BlockEnv BlockId)
 blockConcat splitting_procs g@CmmGraph { g_entry = entry_id }
   = (replaceLabels shortcut_map $ ofBlockMap new_entry new_blocks, shortcut_map')
   where
-     -- we might be able to shortcut the entry BlockId itself.
-     -- remember to update the shortcut_map', since we also have to
+     -- We might be able to shortcut the entry BlockId itself.
+     -- Remember to update the shortcut_map, since we also have to
      -- update the info_tbls mapping now.
      (new_entry, shortcut_map')
        | Just entry_blk <- mapLookup entry_id new_blocks
@@ -110,146 +172,196 @@ blockConcat splitting_procs g@CmmGraph { g_entry = entry_id }
        | otherwise
        = (entry_id, shortcut_map)
 
+     -- blocks is a list of blocks in DFS postorder, while blockmap is
+     -- a map of blocks. We process each element from blocks and update
+     -- blockmap accordingly
      blocks = postorderDfs g
      blockmap = foldr addBlock emptyBody blocks
-      -- the initial blockmap is constructed from the postorderDfs result,
-      -- so that we automatically throw away unreachable blocks.
 
-     (new_blocks, shortcut_map) =
-           foldr maybe_concat (blockmap, mapEmpty) blocks
+     -- Accumulator contains three components:
+     --  * map of blocks in a graph
+     --  * map of shortcut labels. See Note [Shortcut call returns]
+     --  * map containing number of predecessors for each block. We discard
+     --    it after we process all blocks.
+     (new_blocks, shortcut_map, _) =
+           foldr maybe_concat (blockmap, mapEmpty, initialBackEdges) blocks
+
+     -- Map of predecessors for initial graph. We increase number of
+     -- predecessors for entry block by one to denote that it is
+     -- target of a jump, even if no block in the current graph jumps
+     -- to it.
+     initialBackEdges = incPreds entry_id (predMap blocks)
 
      maybe_concat :: CmmBlock
-                  -> (BlockEnv CmmBlock, BlockEnv BlockId)
-                  -> (BlockEnv CmmBlock, BlockEnv BlockId)
-     maybe_concat block (blocks, shortcut_map)
+                  -> (BlockEnv CmmBlock, BlockEnv BlockId, BlockEnv Int)
+                  -> (BlockEnv CmmBlock, BlockEnv BlockId, BlockEnv Int)
+     maybe_concat block (blocks, shortcut_map, backEdges)
+        -- If:
+        --   (1) current block ends with unconditional branch to b' and
+        --   (2) it has exactly one predecessor (namely, current block) and
+        --   (3) we have not mapped any other label to b'
+        --       (see Note [Shortcut call returns]).
+        -- Then:
+        --   (1) append b' block at the end of current block
+        --   (2) remove b' from the map of blocks
+        --   (3) remove information about b' from predecessors map
+        --
+        -- This guard must be first so that we always eliminate blocks that have
+        -- only one predecessor. If we had a target block that is both
+        -- shorcutable and has only one predecessor and attempted to shortcut it
+        -- first we would make that block unreachable but would not remove it
+        -- from the graph.
         | CmmBranch b' <- last
         , Just blk' <- mapLookup b' blocks
-        , shouldConcatWith b' blk'
-        = (mapInsert bid (splice head blk') blocks, shortcut_map)
-
-        -- calls: if we can shortcut the continuation label, then
-        -- we must *also* remember to substitute for the label in the
-        -- code, because we will push it somewhere.
-        | splitting_procs -- Note [shortcut call returns]
+        , hasOnePredecessor b'
+        , hasNotBeenMappedTo b' shortcut_map
+        = let bid' = entryLabel blk'
+          in ( mapDelete bid' $ mapInsert bid (splice head blk') blocks
+             , shortcut_map
+             , mapDelete b' backEdges )
+
+        -- If:
+        --   (1) current block ends with unconditional branch to b' and
+        --   (2) we can shortcut block b'
+        -- Then:
+        --   (1) concatenate b' at the end of current block, effectively
+        --       changing target of uncondtional jump from b' to dest
+        --   (2) increase number of predecessors of dest by 1
+        --   (3) decrease number of predecessors of b' by 1
+        | CmmBranch b' <- last
+        , Just blk' <- mapLookup b' blocks
+        , Just dest <- canShortcut blk'
+        = ( mapInsert bid (splice head blk') blocks, shortcut_map,
+            decPreds b' $ incPreds dest backEdges )
+
+        -- If:
+        --   (1) we are splitting proc points (see Note
+        --       [Shortcut call returns and proc-points]) and
+        --   (2) current block is a CmmCall or CmmForeignCall with
+        --       continuation b' and
+        --   (3) we can shortcut that continuation to dest
+        -- Then:
+        --   (1) we change continuation to point to b'
+        --   (2) create mapping from b' to dest
+        --   (3) increase number of predecessors of dest by 1
+        --   (4) decrease number of predecessors of b' by 1
+        --
+        -- Later we will use replaceLabels to substitute all occurences of b'
+        -- with dest.
+        | splitting_procs
         , Just b'   <- callContinuation_maybe last
         , Just blk' <- mapLookup b' blocks
         , Just dest <- canShortcut blk'
-        = (blocks, mapInsert b' dest shortcut_map)
-           -- replaceLabels will substitute dest for b' everywhere, later
-
-        -- non-calls: see if we can shortcut any of the successors,
-        -- and check whether we should invert the conditional
+        = ( mapInsert bid (blockJoinTail head (update_cont dest)) blocks
+          , mapInsert b' dest shortcut_map
+          , decPreds b' $ incPreds dest backEdges )
+
+        -- If:
+        --   (1) a block does not end with a call
+        -- Then:
+        --   (1) if it ends with a conditional attempt to invert the
+        --       conditional
+        --   (2) attempt to shortcut all destination blocks
+        --   (3) if new successors of a block are different from the old ones
+        --       we update the of predecessors accordingly
         | Nothing <- callContinuation_maybe last
-        = ( mapInsert bid (blockJoinTail head swapcond_last) blocks
-          , shortcut_map )
-
+        = let oldSuccs = successors last
+              newSuccs = successors swapcond_last
+          in ( mapInsert bid (blockJoinTail head swapcond_last) blocks
+             , shortcut_map
+             , if oldSuccs == newSuccs
+               then backEdges
+               else foldr incPreds (foldr decPreds backEdges oldSuccs) newSuccs )
+
+        -- Otherwise don't do anything
         | otherwise
-        = (blocks, shortcut_map)
+        = ( blocks, shortcut_map, backEdges )
         where
           (head, last) = blockSplitTail block
           bid = entryLabel block
 
+          -- Changes continuation of a call to a specified label
+          update_cont dest =
+              case last of
+                CmmCall{}        -> last { cml_cont = Just dest }
+                CmmForeignCall{} -> last { succ = dest }
+                _                -> panic "Can't shortcut continuation."
+
+          -- Attempts to shortcut successors of last node
           shortcut_last = mapSuccessors shortcut last
             where
               shortcut l =
                  case mapLookup l blocks of
-                   Just b | Just dest <- canShortcut b  -> dest
+                   Just b | Just dest <- canShortcut b -> dest
                    _otherwise -> l
 
-          -- for a conditional, we invert the conditional if that
-          -- would make it more likely that the branch-not-taken case
-          -- becomes a fallthrough.  This helps the native codegen a
-          -- little bit, and probably has no effect on LLVM.  It's
-          -- convenient to do it here, where we have the information
-          -- about predecessors.
-          --
+          -- For a conditional, we invert the conditional if that would make it
+          -- more likely that the branch-not-taken case becomes a fallthrough.
+          -- This helps the native codegen a little bit, and probably has no
+          -- effect on LLVM.  It's convenient to do it here, where we have the
+          -- information about predecessors.
           swapcond_last
             | CmmCondBranch cond t f <- shortcut_last
             , numPreds f > 1
-            , numPreds t == 1
+            , hasOnePredecessor t
             , Just cond' <- maybeInvertCmmExpr cond
             = CmmCondBranch cond' f t
 
             | otherwise
             = shortcut_last
 
-
-     shouldConcatWith b block
-       | okToDuplicate block = True  -- short enough to duplicate
-       | numPreds b == 1     = True  -- only one predecessor: go for it
-       | otherwise           = False
-
-     numPreds bid = mapLookup bid backEdges `orElse` 0
-
-     canShortcut :: CmmBlock -> Maybe BlockId
-     canShortcut block
-       | (_, middle, CmmBranch dest) <- blockSplit block
-       , isEmptyBlock middle
-       = Just dest
-       | otherwise
-       = Nothing
-
-     backEdges :: BlockEnv Int -- number of predecessors for each block
-     backEdges = mapInsertWith (+) entry_id 1 $ -- add 1 for the entry id
-                   predMap blocks
-
-     splice :: Block CmmNode C O -> CmmBlock -> CmmBlock
-     splice head rest = head `blockAppend` snd (blockSplitHead rest)
-
-
+          -- Number of predecessors for a block
+          numPreds bid = mapLookup bid backEdges `orElse` 0
+
+          hasOnePredecessor b = numPreds b == 1
+
+          hasNotBeenMappedTo :: BlockId -> BlockEnv BlockId -> Bool
+          hasNotBeenMappedTo b successor_map = mapMember b successor_map
+
+-- Functions for incrementing and decrementing number of predecessors. If
+-- decrementing would set the predecessor count to 0, we remove entry from the
+-- map.
+-- Invariant: if a block has no predecessors it should be dropped from the
+-- graph because it is unreachable. maybe_concat is constructed to maintain
+-- that invariant, but calling replaceLabels may introduce unreachable blocks.
+-- We rely on subsequent passes in the Cmm pipeline to remove unreachable
+-- blocks.
+incPreds, decPreds :: BlockId -> BlockEnv Int -> BlockEnv Int
+incPreds bid edges = mapInsertWith (+) bid 1 edges
+decPreds bid edges = case mapLookup bid edges of
+                       Just preds | preds > 1 -> mapInsert bid (preds - 1) edges
+                       Just _                 -> mapDelete bid edges
+                       _                      -> edges
+
+
+-- Checks if a block consists only of "goto dest". If it does than we return
+-- "Just dest" label. See Note [What is shortcutting]
+canShortcut :: CmmBlock -> Maybe BlockId
+canShortcut block
+    | (_, middle, CmmBranch dest) <- blockSplit block
+    , isEmptyBlock middle
+    = Just dest
+    | otherwise
+    = Nothing
+
+
+-- Concatenates two blocks. First one is assumed to be open on exit, the second
+-- is assumed to be closed on entry (i.e. it has a label attached to it, which
+-- the splice function removes by calling snd on result of blockSplitHead).
+splice :: Block CmmNode C O -> CmmBlock -> CmmBlock
+splice head rest = head `blockAppend` snd (blockSplitHead rest)
+
+
+-- If node is a call with continuation call return Just label of that
+-- continuation. Otherwise return Nothing.
 callContinuation_maybe :: CmmNode O C -> Maybe BlockId
 callContinuation_maybe (CmmCall { cml_cont = Just b }) = Just b
 callContinuation_maybe (CmmForeignCall { succ = b })   = Just b
 callContinuation_maybe _ = Nothing
 
-okToDuplicate :: CmmBlock -> Bool
-okToDuplicate block
-  = case blockSplit block of
-      (_, m, CmmBranch _) -> isEmptyBlock m
-      -- cheap and cheerful; we might expand this in the future to
-      -- e.g. spot blocks that represent a single instruction or two.
-      -- Be careful: a CmmCall can be more than one instruction, it
-      -- has a CmmExpr inside it.
-      _otherwise -> False
-
-
-{-  Note [shortcut call returns]
-
-Consider this code that you might get from a recursive let-no-escape:
-
-      goto L1
-     L1:
-      if (Hp > HpLim) then L2 else L3
-     L2:
-      call stg_gc_noregs returns to L4
-     L4:
-      goto L1
-     L3:
-      ...
-      goto L1
-
-Then the control-flow optimiser shortcuts L4.  But that turns L1
-into the call-return proc point, and every iteration of the loop
-has to shuffle variables to and from the stack.  So we must *not*
-shortcut L4.
-
-Moreover not shortcutting call returns is probably fine.  If L4 can
-concat with its branch target then it will still do so.  And we
-save some compile time because we don't have to traverse all the
-code in replaceLabels.
-
-However, we probably do want to do this if we are splitting proc
-points, because L1 will be a proc-point anyway, so merging it with L4
-reduces the number of proc points.  Unfortunately recursive
-let-no-escapes won't generate very good code with proc-point splitting
-on - we should probably compile them to explicitly use the native
-calling convention instead.
--}
-
-------------------------------------------------------------------------
+
 -- Map over the CmmGraph, replacing each label with its mapping in the
 -- supplied BlockEnv.
-
 replaceLabels :: BlockEnv BlockId -> CmmGraph -> CmmGraph
 replaceLabels env g
   | mapNull env = g
@@ -275,19 +387,14 @@ replaceLabels env g
 mkCmmCondBranch :: CmmExpr -> Label -> Label -> CmmNode O C
 mkCmmCondBranch p t f = if t == f then CmmBranch t else CmmCondBranch p t f
 
-----------------------------------------------------------------
--- Build a map from a block to its set of predecessors. Very useful.
-
+-- Build a map from a block to its set of predecessors.
 predMap :: [CmmBlock] -> BlockEnv Int
-predMap blocks = foldr add_preds mapEmpty blocks -- find the back edges
+predMap blocks = foldr add_preds mapEmpty blocks
   where
     add_preds block env = foldr add env (successors block)
       where add lbl env = mapInsertWith (+) lbl 1 env
 
------------------------------------------------------------------------------
---
 -- Removing unreachable blocks
-
 removeUnreachableBlocksProc :: CmmDecl -> CmmDecl
 removeUnreachableBlocksProc (CmmProc info lbl live g)
    = CmmProc info lbl live (removeUnreachableBlocks g)