diff --git a/compiler/GHC/CmmToAsm.hs b/compiler/GHC/CmmToAsm.hs
index 544edc801ee8484b6e209f2aabeb5ef904452597..f771d792541dd5cac170abad4d3afdfad664ad8e 100644
--- a/compiler/GHC/CmmToAsm.hs
+++ b/compiler/GHC/CmmToAsm.hs
@@ -728,7 +728,7 @@ cmmNativeGen dflags this_mod modLoc ncgImpl us fileIds dbgMap cmm count
 
         let optimizedCFG :: Maybe CFG
             optimizedCFG =
-                optimizeCFG (cfgWeightInfo dflags) cmm <$!> postShortCFG
+                optimizeCFG (gopt Opt_CmmStaticPred dflags) (cfgWeightInfo dflags) cmm <$!> postShortCFG
 
         maybeDumpCfg dflags optimizedCFG "CFG Weights - Final" proc_name
 
diff --git a/compiler/GHC/CmmToAsm/BlockLayout.hs b/compiler/GHC/CmmToAsm/BlockLayout.hs
index 07faa91473d93ed087b628f2bafa9a1d28cd68a0..67eff764e1600f080049fb78f51116c44c940439 100644
--- a/compiler/GHC/CmmToAsm/BlockLayout.hs
+++ b/compiler/GHC/CmmToAsm/BlockLayout.hs
@@ -636,22 +636,8 @@ sequenceChain :: forall a i. (Instruction i, Outputable i)
               -> [GenBasicBlock i] -- ^ Blocks placed in sequence.
 sequenceChain _info _weights    [] = []
 sequenceChain _info _weights    [x] = [x]
-sequenceChain  info weights'     blocks@((BasicBlock entry _):_) =
-    let weights :: CFG
-        weights = --pprTrace "cfg'" (pprEdgeWeights cfg')
-                  cfg'
-          where
-            (_, globalEdgeWeights) = {-# SCC mkGlobalWeights #-} mkGlobalWeights entry weights'
-            cfg' = {-# SCC rewriteEdges #-}
-                    mapFoldlWithKey
-                        (\cfg from m ->
-                            mapFoldlWithKey
-                                (\cfg to w -> setEdgeWeight cfg (EdgeWeight w) from to )
-                                cfg m )
-                        weights'
-                        globalEdgeWeights
-
-        directEdges :: [CfgEdge]
+sequenceChain  info weights     blocks@((BasicBlock entry _):_) =
+    let directEdges :: [CfgEdge]
         directEdges = sortBy (flip compare) $ catMaybes . map relevantWeight $ (infoEdgeList weights)
           where
             relevantWeight :: CfgEdge -> Maybe CfgEdge
diff --git a/compiler/GHC/CmmToAsm/CFG.hs b/compiler/GHC/CmmToAsm/CFG.hs
index ad3a3cdae7fa322457e320590724fa8311be6861..5c68e77fd1a681bbaa55f18c714282b144094fdc 100644
--- a/compiler/GHC/CmmToAsm/CFG.hs
+++ b/compiler/GHC/CmmToAsm/CFG.hs
@@ -670,11 +670,21 @@ findBackEdges root cfg =
     typedEdges =
       classifyEdges root getSuccs edges :: [((BlockId,BlockId),EdgeType)]
 
-
-optimizeCFG :: D.CfgWeights -> RawCmmDecl -> CFG -> CFG
-optimizeCFG _ (CmmData {}) cfg = cfg
-optimizeCFG weights (CmmProc info _lab _live graph) cfg =
-    {-# SCC optimizeCFG #-}
+optimizeCFG :: Bool -> D.CfgWeights -> RawCmmDecl -> CFG -> CFG
+optimizeCFG _ _ (CmmData {}) cfg = cfg
+optimizeCFG doStaticPred weights proc@(CmmProc _info _lab _live graph) cfg =
+  (if doStaticPred then staticPredCfg (g_entry graph) else id) $
+    optHsPatterns weights proc $ cfg
+
+-- | Modify branch weights based on educated guess on
+-- patterns GHC tends to produce and how they affect
+-- performance.
+--
+-- Most importantly we penalize jumps across info tables.
+optHsPatterns :: D.CfgWeights -> RawCmmDecl -> CFG -> CFG
+optHsPatterns _ (CmmData {}) cfg = cfg
+optHsPatterns weights (CmmProc info _lab _live graph) cfg =
+    {-# SCC optHsPatterns #-}
     -- pprTrace "Initial:" (pprEdgeWeights cfg) $
     -- pprTrace "Initial:" (ppr $ mkGlobalWeights (g_entry graph) cfg) $
 
@@ -749,6 +759,21 @@ optimizeCFG weights (CmmProc info _lab _live graph) cfg =
           | CmmSource { trans_cmmNode = CmmCondBranch {} } <- source = True
           | otherwise = False
 
+-- | Convert block-local branch weights to global weights.
+staticPredCfg :: BlockId -> CFG -> CFG
+staticPredCfg entry cfg = cfg'
+  where
+    (_, globalEdgeWeights) = {-# SCC mkGlobalWeights #-}
+                             mkGlobalWeights entry cfg
+    cfg' = {-# SCC rewriteEdges #-}
+            mapFoldlWithKey
+                (\cfg from m ->
+                    mapFoldlWithKey
+                        (\cfg to w -> setEdgeWeight cfg (EdgeWeight w) from to )
+                        cfg m )
+                cfg
+                globalEdgeWeights
+
 -- | Determine loop membership of blocks based on SCC analysis
 --   This is faster but only gives yes/no answers.
 loopMembers :: HasDebugCallStack => CFG -> LabelMap Bool
@@ -922,6 +947,10 @@ revPostorderFrom cfg root =
 --   reverse post order. Which is required for diamond control flow to work probably.
 --
 --   We also apply a few prediction heuristics (based on the same paper)
+--
+--   The returned result represents frequences.
+--   For blocks it's the expected number of executions and
+--   for edges is the number of traversals.
 
 {-# NOINLINE mkGlobalWeights #-}
 {-# SCC mkGlobalWeights #-}
diff --git a/compiler/GHC/CmmToAsm/Instr.hs b/compiler/GHC/CmmToAsm/Instr.hs
index 833a72a74a61f5197960179a8883e705bfb28e64..869c5eb238dd02d2f8fb65bc53456b2ceee11544 100644
--- a/compiler/GHC/CmmToAsm/Instr.hs
+++ b/compiler/GHC/CmmToAsm/Instr.hs
@@ -37,7 +37,10 @@ import GHC.CmmToAsm.Config
 --      (for allocation purposes, anyway).
 --
 data RegUsage
-        = RU [Reg] [Reg]
+        = RU    {
+                reads :: [Reg],
+                writes :: [Reg]
+                }
 
 -- | No regs read or written to.
 noUsage :: RegUsage
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear.hs b/compiler/GHC/CmmToAsm/Reg/Linear.hs
index 00b4915d7b14e4e0351fe201e9c9f82c4c0e24a3..55cb73af1abaa3abea7cbf4820d45ac8c377672b 100644
--- a/compiler/GHC/CmmToAsm/Reg/Linear.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Linear.hs
@@ -1,4 +1,5 @@
 {-# LANGUAGE BangPatterns, CPP, ScopedTypeVariables #-}
+{-# LANGUAGE ConstraintKinds #-}
 
 {-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-}
 
@@ -137,6 +138,7 @@ import GHC.Platform
 import Data.Maybe
 import Data.List
 import Control.Monad
+import Control.Applicative
 
 -- -----------------------------------------------------------------------------
 -- Top level of the register allocator
@@ -229,8 +231,13 @@ linearRegAlloc config entry_ids block_live sccs
   go f = linearRegAlloc' config f entry_ids block_live sccs
   platform = ncgPlatform config
 
+-- | Constraints on the instruction instances used by the
+-- linear allocator.
+type OutputableRegConstraint freeRegs instr =
+        (FR freeRegs, Outputable freeRegs, Outputable instr, Instruction instr)
+
 linearRegAlloc'
-        :: (FR freeRegs, Outputable instr, Instruction instr)
+        :: OutputableRegConstraint freeRegs instr
         => NCGConfig
         -> freeRegs
         -> [BlockId]                    -- ^ entry points
@@ -246,7 +253,7 @@ linearRegAlloc' config initFreeRegs entry_ids block_live sccs
         return  (blocks, stats, getStackUse stack)
 
 
-linearRA_SCCs :: (FR freeRegs, Instruction instr, Outputable instr)
+linearRA_SCCs :: OutputableRegConstraint freeRegs instr
               => [BlockId]
               -> BlockMap RegSet
               -> [NatBasicBlock instr]
@@ -281,7 +288,7 @@ linearRA_SCCs entry_ids block_live blocksAcc (CyclicSCC blocks : sccs)
    more sanity checking to guard against this eventuality.
 -}
 
-process :: (FR freeRegs, Instruction instr, Outputable instr)
+process :: OutputableRegConstraint freeRegs instr
         => [BlockId]
         -> BlockMap RegSet
         -> [GenBasicBlock (LiveInstr instr)]
@@ -325,15 +332,18 @@ process entry_ids block_live (b@(BasicBlock id _) : blocks)
 -- | Do register allocation on this basic block
 --
 processBlock
-        :: (FR freeRegs, Outputable instr, Instruction instr)
+        :: OutputableRegConstraint freeRegs instr
         => BlockMap RegSet              -- ^ live regs on entry to each basic block
         -> LiveBasicBlock instr         -- ^ block to do register allocation on
         -> RegM freeRegs [NatBasicBlock instr]   -- ^ block with registers allocated
 
 processBlock block_live (BasicBlock id instrs)
- = do   initBlock id block_live
+ = do   -- pprTraceM "processBlock" $ text "" $$ ppr (BasicBlock id instrs)
+        initBlock id block_live
+
         (instrs', fixups)
                 <- linearRA block_live [] [] id instrs
+        -- pprTraceM "blockResult" $ ppr (instrs', fixups)
         return  $ BasicBlock id instrs' : fixups
 
 
@@ -369,7 +379,7 @@ initBlock id block_live
 
 -- | Do allocation for a sequence of instructions.
 linearRA
-        :: (FR freeRegs, Outputable instr, Instruction instr)
+        :: OutputableRegConstraint freeRegs instr
         => BlockMap RegSet                      -- ^ map of what vregs are live on entry to each block.
         -> [instr]                              -- ^ accumulator for instructions already processed.
         -> [NatBasicBlock instr]                -- ^ accumulator for blocks of fixup code.
@@ -396,7 +406,7 @@ linearRA block_live accInstr accFixups id (instr:instrs)
 
 -- | Do allocation for a single instruction.
 raInsn
-        :: (FR freeRegs, Outputable instr, Instruction instr)
+        :: OutputableRegConstraint freeRegs instr
         => BlockMap RegSet                      -- ^ map of what vregs are love on entry to each block.
         -> [instr]                              -- ^ accumulator for instructions already processed.
         -> BlockId                              -- ^ the id of the current block, for debugging
@@ -476,7 +486,7 @@ isInReg src assig | Just (InReg _) <- lookupUFM assig src = True
                   | otherwise = False
 
 
-genRaInsn :: (FR freeRegs, Instruction instr, Outputable instr)
+genRaInsn :: OutputableRegConstraint freeRegs instr
           => BlockMap RegSet
           -> [instr]
           -> BlockId
@@ -486,6 +496,7 @@ genRaInsn :: (FR freeRegs, Instruction instr, Outputable instr)
           -> RegM freeRegs ([instr], [NatBasicBlock instr])
 
 genRaInsn block_live new_instrs block_id instr r_dying w_dying = do
+--   pprTraceM "genRaInsn" $ ppr (block_id, instr)
   platform <- getPlatform
   case regUsageOfInstr platform instr of { RU read written ->
     do
@@ -525,6 +536,8 @@ genRaInsn block_live new_instrs block_id instr r_dying w_dying = do
     (fixup_blocks, adjusted_instr)
         <- joinToTargets block_live block_id instr
 
+--     when (not $ null fixup_blocks) $ pprTraceM "genRA:FixBlocks" $ ppr fixup_blocks
+
     -- Debugging - show places where the reg alloc inserted
     -- assignment fixup blocks.
     -- when (not $ null fixup_blocks) $
@@ -737,7 +750,7 @@ data SpillLoc = ReadMem StackSlot  -- reading from register only in memory
 --   the list of free registers and free stack slots.
 
 allocateRegsAndSpill
-        :: (FR freeRegs, Outputable instr, Instruction instr)
+        :: forall freeRegs instr. (FR freeRegs, Outputable instr, Instruction instr)
         => Bool                 -- True <=> reading (load up spilled regs)
         -> [VirtualReg]         -- don't push these out
         -> [instr]              -- spill insns
@@ -749,7 +762,8 @@ allocateRegsAndSpill _       _    spills alloc []
         = return (spills, reverse alloc)
 
 allocateRegsAndSpill reading keep spills alloc (r:rs)
- = do   assig <- getAssigR
+ = do   assig <- getAssigR :: RegM freeRegs (RegMap Loc)
+        -- pprTraceM "allocateRegsAndSpill:assig" (ppr (r:rs) $$ ppr assig)
         let doSpill = allocRegsAndSpill_spill reading keep spills alloc r rs assig
         case lookupUFM assig r of
                 -- case (1a): already in a register
@@ -779,6 +793,26 @@ allocateRegsAndSpill reading keep spills alloc (r:rs)
 
                         | otherwise -> doSpill WriteNew
 
+-- | Given a virtual reg find a preferred real register.
+-- The preferred register is simply the first one the variable
+-- was assigned to (if any). This way when we allocate for a loop
+-- variables are likely to end up in the same registers at the
+-- end and start of the loop, avoiding redundant reg-reg moves.
+-- Note: I tried returning a list of past assignments, but that
+-- turned out to barely matter but added a few tenths of
+-- a percent to compile time.
+findPrefRealReg :: forall freeRegs u. Uniquable u
+               => u -> RegM freeRegs (Maybe RealReg)
+findPrefRealReg vreg = do
+  bassig <- getBlockAssigR :: RegM freeRegs (BlockMap (freeRegs,RegMap Loc))
+  return $ foldr (findVirtRegAssig) Nothing bassig
+  where
+    findVirtRegAssig :: (freeRegs,RegMap Loc) -> Maybe RealReg -> Maybe RealReg
+    findVirtRegAssig assig z =
+        z <|>   case lookupUFM (snd assig) vreg of
+                        Just (InReg real_reg) -> Just real_reg
+                        Just (InBoth real_reg _) -> Just real_reg
+                        _ -> z
 
 -- reading is redundant with reason, but we keep it around because it's
 -- convenient and it maintains the recursive structure of the allocator. -- EZY
@@ -795,18 +829,26 @@ allocRegsAndSpill_spill :: (FR freeRegs, Instruction instr, Outputable instr)
 allocRegsAndSpill_spill reading keep spills alloc r rs assig spill_loc
  = do   platform <- getPlatform
         freeRegs <- getFreeRegsR
-        let freeRegs_thisClass  = frGetFreeRegs platform (classOfVirtualReg r) freeRegs
+        let freeRegs_thisClass  = frGetFreeRegs platform (classOfVirtualReg r) freeRegs :: [RealReg]
 
-        case freeRegs_thisClass of
+        -- Can we put the variable into a register it already was?
+        pref_reg <- findPrefRealReg r
 
+        case freeRegs_thisClass of
          -- case (2): we have a free register
-         (my_reg : _) ->
-           do   spills'   <- loadTemp r spill_loc my_reg spills
+         (first_free : _) ->
+           do   let final_reg
+                        | Just reg <- pref_reg
+                        , reg `elem` freeRegs_thisClass
+                        = reg
+                        | otherwise
+                        = first_free
+                spills'   <- loadTemp r spill_loc final_reg spills
 
-                setAssigR       (addToUFM assig r $! newLocation spill_loc my_reg)
-                setFreeRegsR $  frAllocateReg platform my_reg freeRegs
+                setAssigR       (addToUFM assig r $! newLocation spill_loc final_reg)
+                setFreeRegsR $  frAllocateReg platform final_reg freeRegs
 
-                allocateRegsAndSpill reading keep spills' (my_reg : alloc) rs
+                allocateRegsAndSpill reading keep spills' (final_reg : alloc) rs
 
 
           -- case (3): we need to push something out to free up a register
@@ -814,7 +856,8 @@ allocRegsAndSpill_spill reading keep spills alloc r rs assig spill_loc
            do   let inRegOrBoth (InReg _) = True
                     inRegOrBoth (InBoth _ _) = True
                     inRegOrBoth _ = False
-                let candidates' =
+                let candidates' :: UniqFM Loc
+                    candidates' =
                       flip delListFromUFM keep $
                       filterUFM inRegOrBoth $
                       assig
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/Base.hs b/compiler/GHC/CmmToAsm/Reg/Linear/Base.hs
index 5784660e3fb60fb809bcc4f9a1a72b61fa3267c5..6a110f0a484f254fca5bacc10843389b29dd373e 100644
--- a/compiler/GHC/CmmToAsm/Reg/Linear/Base.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/Base.hs
@@ -30,6 +30,7 @@ import GHC.Types.Unique.FM
 import GHC.Types.Unique.Supply
 import GHC.Cmm.BlockId
 
+data ReadingOrWriting = Reading | Writing deriving (Eq,Ord)
 
 -- | Used to store the register assignment on entry to a basic block.
 --      We use this to handle join points, where multiple branch instructions
@@ -138,6 +139,8 @@ data RA_State freeRegs
         , ra_config     :: !NCGConfig
 
         -- | (from,fixup,to) : We inserted fixup code between from and to
-        , ra_fixups     :: [(BlockId,BlockId,BlockId)] }
+        , ra_fixups     :: [(BlockId,BlockId,BlockId)]
+
+        }
 
 
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/PPC.hs b/compiler/GHC/CmmToAsm/Reg/Linear/PPC.hs
index fe19164357f258da93e1066fcc82d9f8692e5a74..fd0719c656e030c341098043a4ec4ceb56725168 100644
--- a/compiler/GHC/CmmToAsm/Reg/Linear/PPC.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/PPC.hs
@@ -1,3 +1,5 @@
+{-# LANGUAGE GeneralizedNewtypeDeriving #-}
+
 -- | Free regs map for PowerPC
 module GHC.CmmToAsm.Reg.Linear.PPC where
 
@@ -27,6 +29,9 @@ import Data.Bits
 data FreeRegs = FreeRegs !Word32 !Word32
               deriving( Show )  -- The Show is used in an ASSERT
 
+instance Outputable FreeRegs where
+    ppr = text . show
+
 noFreeRegs :: FreeRegs
 noFreeRegs = FreeRegs 0 0
 
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/SPARC.hs b/compiler/GHC/CmmToAsm/Reg/Linear/SPARC.hs
index ac7dc85366df0b9c5aa767cd2fa20fad191c72af..063a8836b33222670ea970e77974bac8da36b3d3 100644
--- a/compiler/GHC/CmmToAsm/Reg/Linear/SPARC.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/SPARC.hs
@@ -1,4 +1,5 @@
 {-# LANGUAGE CPP #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving #-}
 
 -- | Free regs map for SPARC
 module GHC.CmmToAsm.Reg.Linear.SPARC where
@@ -38,6 +39,9 @@ data FreeRegs
 instance Show FreeRegs where
         show = showFreeRegs
 
+instance Outputable FreeRegs where
+        ppr = text . showFreeRegs
+
 -- | A reg map where no regs are free to be allocated.
 noFreeRegs :: FreeRegs
 noFreeRegs = FreeRegs 0 0 0
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/State.hs b/compiler/GHC/CmmToAsm/Reg/Linear/State.hs
index f96cc71239e4ec91a859f6e24d1f433dfc38d5ff..ab05ab632aff11781b7fe609d19051ec7d956b15 100644
--- a/compiler/GHC/CmmToAsm/Reg/Linear/State.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/State.hs
@@ -1,4 +1,5 @@
 {-# LANGUAGE CPP, PatternSynonyms, DeriveFunctor #-}
+{-# LANGUAGE ScopedTypeVariables #-}
 
 #if !defined(GHC_LOADED_INTO_GHCI)
 {-# LANGUAGE UnboxedTuples #-}
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/X86.hs b/compiler/GHC/CmmToAsm/Reg/Linear/X86.hs
index ae37b0f9d1086707b89604140851685c2f40a861..42f63b5752271280206985093ee80d1c202ae3a0 100644
--- a/compiler/GHC/CmmToAsm/Reg/Linear/X86.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/X86.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE GeneralizedNewtypeDeriving #-}
 
 -- | Free regs map for i386
 module GHC.CmmToAsm.Reg.Linear.X86 where
@@ -9,12 +10,13 @@ import GHC.Platform.Reg.Class
 import GHC.Platform.Reg
 import GHC.Utils.Panic
 import GHC.Platform
+import GHC.Utils.Outputable
 
 import Data.Word
 import Data.Bits
 
 newtype FreeRegs = FreeRegs Word32
-    deriving Show
+    deriving (Show,Outputable)
 
 noFreeRegs :: FreeRegs
 noFreeRegs = FreeRegs 0
diff --git a/compiler/GHC/CmmToAsm/Reg/Linear/X86_64.hs b/compiler/GHC/CmmToAsm/Reg/Linear/X86_64.hs
index 325e033e853ef0fb15eac8fc9f88b960bdfe7f77..44eea342a4dfc9cd36bdc9c07c75bc86172dc158 100644
--- a/compiler/GHC/CmmToAsm/Reg/Linear/X86_64.hs
+++ b/compiler/GHC/CmmToAsm/Reg/Linear/X86_64.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE GeneralizedNewtypeDeriving #-}
 
 -- | Free regs map for x86_64
 module GHC.CmmToAsm.Reg.Linear.X86_64 where
@@ -9,12 +10,13 @@ import GHC.Platform.Reg.Class
 import GHC.Platform.Reg
 import GHC.Utils.Panic
 import GHC.Platform
+import GHC.Utils.Outputable
 
 import Data.Word
 import Data.Bits
 
 newtype FreeRegs = FreeRegs Word64
-    deriving Show
+    deriving (Show,Outputable)
 
 noFreeRegs :: FreeRegs
 noFreeRegs = FreeRegs 0
diff --git a/compiler/GHC/Driver/Flags.hs b/compiler/GHC/Driver/Flags.hs
index b0be5f4bcee6621697f541de6eb1acdac8207c1f..d0b04713fbfb8017f4df42877c8ea2e0c095aaa2 100644
--- a/compiler/GHC/Driver/Flags.hs
+++ b/compiler/GHC/Driver/Flags.hs
@@ -181,6 +181,7 @@ data GeneralFlag
    | Opt_LlvmFillUndefWithGarbage       -- Testing for undef bugs (hidden flag)
    | Opt_IrrefutableTuples
    | Opt_CmmSink
+   | Opt_CmmStaticPred
    | Opt_CmmElimCommonBlocks
    | Opt_AsmShortcutting
    | Opt_OmitYields
diff --git a/compiler/GHC/Driver/Session.hs b/compiler/GHC/Driver/Session.hs
index ac099a61dd13c7f0a7eb4ec4de2bf7e803b94b73..87b13a8e99d1d12b96c3e18cce96ec4dcb007285 100644
--- a/compiler/GHC/Driver/Session.hs
+++ b/compiler/GHC/Driver/Session.hs
@@ -3514,6 +3514,7 @@ fFlagsDeps = [
   flagSpec "case-folding"                     Opt_CaseFolding,
   flagSpec "cmm-elim-common-blocks"           Opt_CmmElimCommonBlocks,
   flagSpec "cmm-sink"                         Opt_CmmSink,
+  flagSpec "cmm-static-pred"                  Opt_CmmStaticPred,
   flagSpec "cse"                              Opt_CSE,
   flagSpec "stg-cse"                          Opt_StgCSE,
   flagSpec "stg-lift-lams"                    Opt_StgLiftLams,
@@ -4065,6 +4066,7 @@ optLevelFlags -- see Note [Documenting optimisation flags]
     , ([1,2],   Opt_CmmElimCommonBlocks)
     , ([2],     Opt_AsmShortcutting)
     , ([1,2],   Opt_CmmSink)
+    , ([1,2],   Opt_CmmStaticPred)
     , ([1,2],   Opt_CSE)
     , ([1,2],   Opt_StgCSE)
     , ([2],     Opt_StgLiftLams)
diff --git a/compiler/GHC/Utils/Outputable.hs b/compiler/GHC/Utils/Outputable.hs
index ba843cef30c98dce091d6b7c183ec595ebc00332..149713773de852fd6526fbf61c5bbafb99de678b 100644
--- a/compiler/GHC/Utils/Outputable.hs
+++ b/compiler/GHC/Utils/Outputable.hs
@@ -837,6 +837,9 @@ instance Outputable Word16 where
 instance Outputable Word32 where
     ppr n = integer $ fromIntegral n
 
+instance Outputable Word64 where
+    ppr n = integer $ fromIntegral n
+
 instance Outputable Word where
     ppr n = integer $ fromIntegral n
 
diff --git a/docs/users_guide/8.12.1-notes.rst b/docs/users_guide/8.12.1-notes.rst
index 46a729af7033bb8359c0a80e6efe624a690d6a9c..9fabc4731069be2a3dd3741d8cba4b8b93840361 100644
--- a/docs/users_guide/8.12.1-notes.rst
+++ b/docs/users_guide/8.12.1-notes.rst
@@ -10,7 +10,14 @@ following sections.
 Highlights
 ----------
 
-- TODO
+* NCG
+
+  - The linear register allocator saw improvements reducing the number
+    of redundant move instructions. Rare edge cases can see double
+    digit improvements in runtime for inner loops.
+
+    In the mean this improved runtime by about 0.8%. For details
+    see ticket #17823.
 
 Full details
 ------------
@@ -155,11 +162,11 @@ Arrow notation
    ``hsGroupTopLevelFixitySigs`` function, which collects all top-level fixity
    signatures, including those for class methods defined inside classes.
 
-- The ``Exception`` module was boiled down acknowledging the existence of 
+- The ``Exception`` module was boiled down acknowledging the existence of
   the ``exceptions`` dependency. In particular, the ``ExceptionMonad``
   class is not a proper class anymore, but a mere synonym for ``MonadThrow``,
-  ``MonadCatch``, ``MonadMask`` (all from ``exceptions``) and ``MonadIO``. 
-  All of ``g*``-functions from the module (``gtry``, ``gcatch``, etc.) are 
+  ``MonadCatch``, ``MonadMask`` (all from ``exceptions``) and ``MonadIO``.
+  All of ``g*``-functions from the module (``gtry``, ``gcatch``, etc.) are
   erased, and their ``exceptions``-alternatives are meant to be used in the
   GHC code instead.
 
diff --git a/docs/users_guide/using-optimisation.rst b/docs/users_guide/using-optimisation.rst
index 4ca47524a7018bd92a57b2d7d7cf98af014af6ae..8ec19cb147f412965914523d03a522b9cd96b76c 100644
--- a/docs/users_guide/using-optimisation.rst
+++ b/docs/users_guide/using-optimisation.rst
@@ -214,6 +214,19 @@ by saying ``-fno-wombat``.
     to their usage sites. It also inlines simple expressions like
     literals or registers.
 
+.. ghc-flag:: -fcmm-static-pred
+    :shortdesc: Enable static control flow prediction. Implied by :ghc-flag:`-O`.
+    :type: dynamic
+    :reverse: -fno-cmm-static-pred
+    :category:
+
+    :default: off but enabled with :ghc-flag:`-O`.
+
+    This enables static control flow prediction on the final Cmm
+    code. If enabled GHC will apply certain heuristics to identify
+    loops and hot code paths. This information is then used by the
+    register allocation and code layout passes.
+
 .. ghc-flag:: -fasm-shortcutting
     :shortdesc: Enable shortcutting on assembly. Implied by :ghc-flag:`-O2`.
     :type: dynamic