From ad15c2b4bd37082ce989268b3d2f86a2cd34386a Mon Sep 17 00:00:00 2001
From: Jan Stolarek <jan.stolarek@p.lodz.pl>
Date: Thu, 12 Sep 2013 14:29:37 +0100
Subject: [PATCH] Improve sinking pass

This commit does two things:

  * Allows duplicating of global registers and literals by inlining
    them. Previously we would only inline a global register or literal
    if it was used only once.

  * Changes method of determining conflicts between a node and an
    assignment. The new method has two advantages. It relies on
    DefinerOfRegs and UserOfRegs typeclasses, so if a set of registers
    defined or used by a node should ever change, `conflicts` function
    will use the changed definition. This definition also catches
    more cases than the previous one (namely CmmCall and CmmForeignCall)
    which is a step towards making it possible to run sinking pass
    before stack layout (currently this doesn't work).

This patch also adds a lot of comments that are result of about two-week
long investigation of how sinking pass works and why it does what it does.
---
 compiler/cmm/CmmLayoutStack.hs |  21 ++---
 compiler/cmm/CmmPipeline.hs    | 140 ++++++++++++++++++++++++++++++---
 compiler/cmm/CmmSink.hs        |  97 +++++++++++++++++------
 3 files changed, 216 insertions(+), 42 deletions(-)

diff --git a/compiler/cmm/CmmLayoutStack.hs b/compiler/cmm/CmmLayoutStack.hs
index 2b2dccdaed76..17d111d46f62 100644
--- a/compiler/cmm/CmmLayoutStack.hs
+++ b/compiler/cmm/CmmLayoutStack.hs
@@ -164,24 +164,24 @@ layout dflags procpoints liveness entry entry_args final_stackmaps final_hwm blo
     go (b0 : bs) acc_stackmaps acc_hwm acc_blocks
       = do
        let (entry0@(CmmEntry entry_lbl), middle0, last0) = blockSplit b0
-    
+
        let stack0@StackMap { sm_sp = sp0 }
                = mapFindWithDefault
                      (pprPanic "no stack map for" (ppr entry_lbl))
                      entry_lbl acc_stackmaps
-    
+
        -- pprTrace "layout" (ppr entry_lbl <+> ppr stack0) $ return ()
-    
+
        -- (a) Update the stack map to include the effects of
        --     assignments in this block
        let stack1 = foldBlockNodesF (procMiddle acc_stackmaps) middle0 stack0
-    
+
        -- (b) Insert assignments to reload all the live variables if this
        --     block is a proc point
        let middle1 = if entry_lbl `setMember` procpoints
                         then foldr blockCons middle0 (insertReloads stack0)
                         else middle0
-    
+
        -- (c) Look at the last node and if we are making a call or
        --     jumping to a proc point, we must save the live
        --     variables, adjust Sp, and construct the StackMaps for
@@ -190,7 +190,7 @@ layout dflags procpoints liveness entry entry_args final_stackmaps final_hwm blo
        (middle2, sp_off, last1, fixup_blocks, out)
            <- handleLastNode dflags procpoints liveness cont_info
                              acc_stackmaps stack1 middle0 last0
-    
+
        -- pprTrace "layout(out)" (ppr out) $ return ()
 
        -- (d) Manifest Sp: run over the nodes in the block and replace
@@ -566,9 +566,9 @@ setupStackFrame dflags lbl liveness updfr_off ret_args stack0
 -- So to fix this we want to set up the stack frame before the
 -- conditional jump.  How do we know when to do this, and when it is
 -- safe?  The basic idea is, when we see the assignment
--- 
+--
 --   Sp[young(L)] = L
--- 
+--
 -- we know that
 --   * we are definitely heading for L
 --   * there can be no more reads from another stack area, because young(L)
@@ -877,7 +877,7 @@ stackMapToLiveness dflags StackMap{..} =
 -- Lowering safe foreign calls
 
 {-
-Note [lower safe foreign calls]
+Note [Lower safe foreign calls]
 
 We start with
 
@@ -907,7 +907,8 @@ live across the call.  Our job now is to expand the call so we get
    ...
 
 Note the copyOut, which saves the results in the places that L1 is
-expecting them (see Note {safe foreign call convention]).
+expecting them (see Note [safe foreign call convention]). Note also
+that the safe foreign call is replaced by an unsafe one in the Cmm graph.
 -}
 
 lowerSafeForeignCall :: DynFlags -> CmmBlock -> UniqSM CmmBlock
diff --git a/compiler/cmm/CmmPipeline.hs b/compiler/cmm/CmmPipeline.hs
index dfcdce0dfba0..86504662c4b7 100644
--- a/compiler/cmm/CmmPipeline.hs
+++ b/compiler/cmm/CmmPipeline.hs
@@ -88,13 +88,6 @@ cpsTop hsc_env proc =
        when (not (setNull noncall_pps) && dopt Opt_D_dump_cmm dflags) $
          pprTrace "Non-call proc points: " (ppr noncall_pps) $ return ()
 
-       ----------- Sink and inline assignments *before* stack layout -----------
-       {-  Maybe enable this later
-       g <- {-# SCC "sink1" #-}
-            condPass Opt_CmmSink (cmmSink dflags) g
-                     Opt_D_dump_cmm_rewrite "Sink assignments (1)"
-       -}
-
        ----------- Layout the stack and manifest Sp ----------------------------
        (g, stackmaps) <-
             {-# SCC "layoutStack" #-}
@@ -103,10 +96,10 @@ cpsTop hsc_env proc =
                else return (g, mapEmpty)
        dump Opt_D_dump_cmm_sp "Layout Stack" g
 
-       ----------- Sink and inline assignments *after* stack layout ------------
-       g <- {-# SCC "sink2" #-}
+       ----------- Sink and inline assignments  --------------------------------
+       g <- {-# SCC "sink" #-} -- See Note [Sinking after stack layout]
             condPass Opt_CmmSink (cmmSink dflags) g
-                     Opt_D_dump_cmm_rewrite "Sink assignments (2)"
+                     Opt_D_dump_cmm_rewrite "Sink assignments"
 
        ------------- CAF analysis ----------------------------------------------
        let cafEnv = {-# SCC "cafAnal" #-} cafAnal g
@@ -190,6 +183,133 @@ cpsTop hsc_env proc =
                   (ArchPPC, OSDarwin, pic) -> pic
                   _                        -> False
 
+-- Note [Sinking after stack layout]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--
+-- In the past we considered running sinking pass also before stack
+-- layout, but after making some measurements we realized that:
+--
+--   a) running sinking only before stack layout produces slower
+--      code than running sinking only after stack layout
+--
+--   b) running sinking both before and after stack layout produces
+--      code that has the same performance as when running sinking
+--      only after stack layout.
+--
+-- In other words sinking before stack layout doesn't buy us anything.
+--
+-- An interesting question is "why is it better to run sinking after
+-- stack layout"? It seems that the major reason are stores and loads
+-- generated by stack layout. Consider this code before stack layout:
+--
+--  c1E:
+--      _c1C::P64 = R3;
+--      _c1B::P64 = R2;
+--      _c1A::P64 = R1;
+--      I64[(young<c1D> + 8)] = c1D;
+--      call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
+--  c1D:
+--      R3 = _c1C::P64;
+--      R2 = _c1B::P64;
+--      R1 = _c1A::P64;
+--      call (P64[(old + 8)])(R3, R2, R1) args: 8, res: 0, upd: 8;
+--
+-- Stack layout pass will save all local variables live across a call
+-- (_c1C, _c1B and _c1A in this example) on the stack just before
+-- making a call and reload them from the stack after returning from a
+-- call:
+--
+--  c1E:
+--      _c1C::P64 = R3;
+--      _c1B::P64 = R2;
+--      _c1A::P64 = R1;
+--      I64[Sp - 32] = c1D;
+--      P64[Sp - 24] = _c1A::P64;
+--      P64[Sp - 16] = _c1B::P64;
+--      P64[Sp - 8] = _c1C::P64;
+--      Sp = Sp - 32;
+--      call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
+--  c1D:
+--      _c1A::P64 = P64[Sp + 8];
+--      _c1B::P64 = P64[Sp + 16];
+--      _c1C::P64 = P64[Sp + 24];
+--      R3 = _c1C::P64;
+--      R2 = _c1B::P64;
+--      R1 = _c1A::P64;
+--      Sp = Sp + 32;
+--      call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
+--
+-- If we don't run sinking pass after stack layout we are basically
+-- left with such code. However, running sinking on this code can lead
+-- to significant improvements:
+--
+--  c1E:
+--      I64[Sp - 32] = c1D;
+--      P64[Sp - 24] = R1;
+--      P64[Sp - 16] = R2;
+--      P64[Sp - 8] = R3;
+--      Sp = Sp - 32;
+--      call stg_gc_noregs() returns to c1D, args: 8, res: 8, upd: 8;
+--  c1D:
+--      R3 = P64[Sp + 24];
+--      R2 = P64[Sp + 16];
+--      R1 = P64[Sp + 8];
+--      Sp = Sp + 32;
+--      call (P64[Sp])(R3, R2, R1) args: 8, res: 0, upd: 8;
+--
+-- Now we only have 9 assignments instead of 15.
+--
+-- There is one case when running sinking before stack layout could
+-- be beneficial. Consider this:
+--
+--   L1:
+--      x = y
+--      call f() returns L2
+--   L2: ...x...y...
+--
+-- Since both x and y are live across a call to f, they will be stored
+-- on the stack during stack layout and restored after the call:
+--
+--   L1:
+--      x = y
+--      P64[Sp - 24] = L2
+--      P64[Sp - 16] = x
+--      P64[Sp - 8]  = y
+--      Sp = Sp - 24
+--      call f() returns L2
+--   L2:
+--      y = P64[Sp + 16]
+--      x = P64[Sp + 8]
+--      Sp = Sp + 24
+--      ...x...y...
+--
+-- However, if we run sinking before stack layout we would propagate x
+-- to its usage place (both x and y must be local registers for this to
+-- be possible - global registers cannot be floated past a call):
+--
+--   L1:
+--      x = y
+--      call f() returns L2
+--   L2: ...y...y...
+--
+-- Thus making x dead at the call to f(). If we ran stack layout now
+-- we would generate fewer stores and loads:
+--
+--   L1:
+--      x = y
+--      P64[Sp - 16] = L2
+--      P64[Sp - 8]  = y
+--      Sp = Sp - 16
+--      call f() returns L2
+--   L2:
+--      y = P64[Sp + 8]
+--      Sp = Sp + 16
+--      ...y...y...
+--
+-- But since we don't see any benefits from running sinking before stack
+-- layout, this situation probably doesn't arise too often in practice.
+--
+
 {- Note [inconsistent-pic-reg]
 
 On x86/Darwin, PIC is implemented by inserting a sequence like
diff --git a/compiler/cmm/CmmSink.hs b/compiler/cmm/CmmSink.hs
index 41323ecad3f1..7b5aaa6affcc 100644
--- a/compiler/cmm/CmmSink.hs
+++ b/compiler/cmm/CmmSink.hs
@@ -3,8 +3,6 @@ module CmmSink (
      cmmSink
   ) where
 
-import CodeGen.Platform (callerSaves)
-
 import Cmm
 import CmmOpt
 import BlockId
@@ -220,18 +218,26 @@ cmmSink dflags graph = ofBlockList (g_entry graph) $ sink mapEmpty $ blocks
 
 -- small: an expression we don't mind duplicating
 isSmall :: CmmExpr -> Bool
-isSmall (CmmReg (CmmLocal _)) = True  -- not globals, we want to coalesce them instead
+isSmall (CmmReg (CmmLocal _)) = True  -- not globals, we want to coalesce them instead* See below
 isSmall (CmmLit _) = True
 isSmall (CmmMachOp (MO_Add _) [x,y]) = isTrivial x && isTrivial y
 isSmall (CmmRegOff (CmmLocal _) _) = True
 isSmall _ = False
+
+Coalesce global registers? What does that mean? We observed no decrease
+in performance coming from inlining of global registers, hence we do it now
+(see isTrivial function). Ideally we'd like to measure performance using
+some tool like perf or VTune and make decisions what to inline based on that.
 -}
 
+--
+-- We allow duplication of trivial expressions: registers (both local and
+-- global) and literals.
+--
 isTrivial :: CmmExpr -> Bool
-isTrivial (CmmReg (CmmLocal _)) = True
--- isTrivial (CmmLit _) = True  -- Disabled because it used to make thing worse.
-                                -- Needs further investigation
-isTrivial _ = False
+isTrivial (CmmReg _) = True
+isTrivial (CmmLit _) = True
+isTrivial _          = False
 
 --
 -- annotate each node with the set of registers live *after* the node
@@ -430,6 +436,7 @@ tryToInline dflags live node assigs = go usages node [] assigs
                  inline other = other
 
 -- Note [dependent assignments]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 --
 -- If our assignment list looks like
 --
@@ -474,7 +481,8 @@ regsUsedIn ls e = wrapRecExpf f e False
 -- nor the NCG can do it.  See Note [Register parameter passing]
 -- See also StgCmmForeign:load_args_into_temps.
 okToInline :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
-okToInline dflags expr CmmUnsafeForeignCall{} = not (anyCallerSavesRegs dflags expr)
+okToInline dflags expr node@(CmmUnsafeForeignCall{}) =
+    not (globalRegistersConflict dflags expr node)
 okToInline _ _ _ = True
 
 -- -----------------------------------------------------------------------------
@@ -487,23 +495,23 @@ okToInline _ _ _ = True
 conflicts :: DynFlags -> Assignment -> CmmNode O x -> Bool
 conflicts dflags (r, rhs, addr) node
 
-  -- (1) an assignment to a register conflicts with a use of the register
-  | CmmAssign reg  _ <- node, reg `regUsedIn` rhs                 = True
+  -- (1) node defines registers used by rhs of assignment. This catches
+  -- assignments and all three kinds of calls. See Note [Sinking and calls]
+  | globalRegistersConflict dflags rhs node                       = True
+  | localRegistersConflict  dflags rhs node                       = True
+
+  -- (2) node uses register defined by assignment
   | foldRegsUsed dflags (\b r' -> r == r' || b) False node        = True
 
-  -- (2) a store to an address conflicts with a read of the same memory
+  -- (3) a store to an address conflicts with a read of the same memory
   | CmmStore addr' e <- node
   , memConflicts addr (loadAddr dflags addr' (cmmExprWidth dflags e)) = True
 
-  -- (3) an assignment to Hp/Sp conflicts with a heap/stack read respectively
+  -- (4) an assignment to Hp/Sp conflicts with a heap/stack read respectively
   | HeapMem    <- addr, CmmAssign (CmmGlobal Hp) _ <- node        = True
   | StackMem   <- addr, CmmAssign (CmmGlobal Sp) _ <- node        = True
   | SpMem{}    <- addr, CmmAssign (CmmGlobal Sp) _ <- node        = True
 
-  -- (4) assignments that read caller-saves GlobalRegs conflict with a
-  -- foreign call.  See Note [Unsafe foreign calls clobber caller-save registers]
-  | CmmUnsafeForeignCall{} <- node, anyCallerSavesRegs dflags rhs = True
-
   -- (5) foreign calls clobber heap: see Note [Foreign calls clobber heap]
   | CmmUnsafeForeignCall{} <- node, memConflicts addr AnyMem      = True
 
@@ -513,12 +521,57 @@ conflicts dflags (r, rhs, addr) node
   -- (7) otherwise, no conflict
   | otherwise = False
 
-
-anyCallerSavesRegs :: DynFlags -> CmmExpr -> Bool
-anyCallerSavesRegs dflags e = wrapRecExpf f e False
-  where f (CmmReg (CmmGlobal r)) _
-         | callerSaves (targetPlatform dflags) r = True
-        f _ z = z
+-- Returns True if node defines any global registers that are used in the
+-- Cmm expression
+globalRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
+globalRegistersConflict dflags expr node =
+    foldRegsDefd dflags (\b r -> b || (CmmGlobal r) `regUsedIn` expr) False node
+
+-- Returns True if node defines any local registers that are used in the
+-- Cmm expression
+localRegistersConflict :: DynFlags -> CmmExpr -> CmmNode e x -> Bool
+localRegistersConflict dflags expr node =
+    foldRegsDefd dflags (\b r -> b || (CmmLocal  r) `regUsedIn` expr) False node
+
+-- Note [Sinking and calls]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~
+--
+-- We have three kinds of calls: normal (CmmCall), safe foreign (CmmForeignCall)
+-- and unsafe foreign (CmmUnsafeForeignCall). We perform sinking pass after
+-- stack layout (see Note [Sinking after stack layout]) which leads to two
+-- invariants related to calls:
+--
+--   a) during stack layout phase all safe foreign calls are turned into
+--      unsafe foreign calls (see Note [Lower safe foreign calls]). This
+--      means that we will never encounter CmmForeignCall node when running
+--      sinking after stack layout
+--
+--   b) stack layout saves all variables live across a call on the stack
+--      just before making a call (remember we are not sinking assignments to
+--      stack):
+--
+--       L1:
+--          x = R1
+--          P64[Sp - 16] = L2
+--          P64[Sp - 8]  = x
+--          Sp = Sp - 16
+--          call f() returns L2
+--       L2:
+--
+--      We will attempt to sink { x = R1 } but we will detect conflict with
+--      { P64[Sp - 8]  = x } and hence we will drop { x = R1 } without even
+--      checking whether it conflicts with { call f() }. In this way we will
+--      never need to check any assignment conflicts with CmmCall. Remember
+--      that we still need to check for potential memory conflicts.
+--
+-- So the result is that we only need to worry about CmmUnsafeForeignCall nodes
+-- when checking conflicts (see Note [Unsafe foreign calls clobber caller-save registers]).
+-- This assumption holds only when we do sinking after stack layout. If we run
+-- it before stack layout we need to check for possible conflicts with all three
+-- kinds of calls. Our `conflicts` function does that by using a generic
+-- foldRegsDefd and foldRegsUsed functions defined in DefinerOfRegs and
+-- UserOfRegs typeclasses.
+--
 
 -- An abstraction of memory read or written.
 data AbsMem
-- 
GitLab