From 6429943b0a377e076bcfd26c79ceb27cf2f4a9ab Mon Sep 17 00:00:00 2001
From: Simon Peyton Jones <simonpj@microsoft.com>
Date: Fri, 19 Feb 2021 11:03:15 +0000
Subject: [PATCH] Fix terrible occurrence-analysis bug

Ticket #19360 showed up a terrible bug in the occurrence analyser,
in a situation like this

   Rec { f = g
       ; g = ..f...
         {-# RULE g .. = ...f... #-}  }

Then f was postInlineUnconditionally, but not in the RULE (which
is simplified first), so we had a RULE mentioning a variable that
was not in scope.

This led me to review (again) the subtle loop-breaker stuff in the
occurrence analyser. The actual changes are few, and are largely
simplifications.  I did a /lot/ of comment re-organising though.

There was an unexpected amount of fallout.

* Validation failed when compiling the stage2 compiler with profiling
  on. That turned to tickle a second latent bug in the same OccAnal
  code (at least I think it was always there), which led me to
  simplify still further; see Note [inl_fvs] in GHC.Core.Opt.OccurAnal.

* But that in turn let me to some strange behaviour in CSE when ticks
  are in the picture, which I duly fixed.  See Note [Dealing with ticks]
  in GHC.Core.Opt.CSE.

* Then I got an ASSERT failure in CoreToStg, which again seems to be
  a latent bug.  See Note [Ticks in applications] in GHC.CoreToStg

* I also made one unforced change: I now simplify the RHS of a RULE in
  the same way as the RHS of a stable unfolding. This can allow a
  trivial binding to disappear sooner than otherwise, and I don't
  think it has any downsides.  The change is in
  GHC.Core.Opt.Simplify.simplRules.
---
 compiler/GHC/Core/Opt/CSE.hs                  |  85 +++-
 compiler/GHC/Core/Opt/OccurAnal.hs            | 407 ++++++++++--------
 compiler/GHC/Core/Opt/Simplify.hs             |  20 +-
 compiler/GHC/Core/Opt/Simplify/Utils.hs       |   8 +-
 compiler/GHC/CoreToStg.hs                     |  27 +-
 .../tests/numeric/should_compile/T7116.stdout |  14 +-
 .../tests/simplCore/should_compile/T19360.hs  |  29 ++
 .../simplCore/should_compile/T8331.stderr     |   7 +-
 .../tests/simplCore/should_compile/all.T      |   1 +
 9 files changed, 373 insertions(+), 225 deletions(-)
 create mode 100644 testsuite/tests/simplCore/should_compile/T19360.hs

diff --git a/compiler/GHC/Core/Opt/CSE.hs b/compiler/GHC/Core/Opt/CSE.hs
index 45e26acc4b27..9e3010ca47be 100644
--- a/compiler/GHC/Core/Opt/CSE.hs
+++ b/compiler/GHC/Core/Opt/CSE.hs
@@ -18,7 +18,7 @@ import GHC.Prelude
 import GHC.Core.Subst
 import GHC.Types.Var    ( Var )
 import GHC.Types.Var.Env ( mkInScopeSet )
-import GHC.Types.Id     ( Id, idType, idHasRules
+import GHC.Types.Id     ( Id, idType, idHasRules, zapStableUnfolding
                         , idInlineActivation, setInlineActivation
                         , zapIdOccInfo, zapIdUsageInfo, idInlinePragma
                         , isJoinId, isJoinId_maybe )
@@ -236,8 +236,8 @@ because we promised to inline foo as what the user wrote.  See similar Note
 
 Nor do we want to change the reverse mapping. Suppose we have
 
-   {-# Unf = Stable (\pq. build blah) #-}
-   foo = <expr>
+   foo {-# Unf = Stable (\pq. build blah) #-}
+       = <expr>
    bar = <expr>
 
 There could conceivably be merit in rewriting the RHS of bar:
@@ -250,6 +250,23 @@ that a function's definition is so small that it should always inline.
 In this case we still want to do CSE (#13340). Hence the use of
 isAnyInlinePragma rather than isStableUnfolding.
 
+Now consider
+   foo = <expr>
+   bar {-# Unf = Stable ... #-}
+      = <expr>
+
+where the unfolding was added by strictness analysis, say.  Then
+CSE goes ahead, so we get
+   bar = foo
+and probably use SUBSTITUTE that will make 'bar' dead.  But just
+possibly not -- see Note [Dealing with ticks].  In that case we might
+be left with
+   bar = tick t1 (tick t2 foo)
+in which case we would really like to get rid of the stable unfolding
+(generated by the strictness analyser, say).  Hence the zapStableUnfolding
+in cse_bind.  Not a big deal, and only makes a difference when ticks
+get into the picture.
+
 Note [Corner case for case expressions]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Here is another reason that we do not use SUBSTITUTE for
@@ -389,9 +406,10 @@ cse_bind toplevel env (in_id, in_rhs) out_id
   | otherwise
   = (env', (out_id'', out_rhs))
   where
-    (env', out_id') = addBinding env in_id out_id out_rhs
-    (cse_done, out_rhs) = try_for_cse env in_rhs
-    out_id'' | cse_done  = delayInlining toplevel out_id'
+    (env', out_id') = addBinding env in_id out_id out_rhs cse_done
+    (cse_done, out_rhs)  = try_for_cse env in_rhs
+    out_id'' | cse_done  = zapStableUnfolding $
+                           delayInlining toplevel out_id'
              | otherwise = out_id'
 
 delayInlining :: TopLevelFlag -> Id -> Id
@@ -409,19 +427,23 @@ delayInlining top_lvl bndr
   | otherwise
   = bndr
 
-addBinding :: CSEnv                      -- Includes InId->OutId cloning
-           -> InVar                      -- Could be a let-bound type
-           -> OutId -> OutExpr           -- Processed binding
-           -> (CSEnv, OutId)             -- Final env, final bndr
+addBinding :: CSEnv            -- Includes InId->OutId cloning
+           -> InVar            -- Could be a let-bound type
+           -> OutId -> OutExpr -- Processed binding
+           -> Bool             -- True <=> RHS was CSE'd and is a variable
+                               --          or maybe (Tick t variable)
+           -> (CSEnv, OutId)   -- Final env, final bndr
 -- Extend the CSE env with a mapping [rhs -> out-id]
 -- unless we can instead just substitute [in-id -> rhs]
 --
 -- It's possible for the binder to be a type variable (see
 -- Note [Type-let] in GHC.Core), in which case we can just substitute.
-addBinding env in_id out_id rhs'
+addBinding env in_id out_id rhs' cse_done
   | not (isId in_id) = (extendCSSubst env in_id rhs',     out_id)
   | noCSE in_id      = (env,                              out_id)
   | use_subst        = (extendCSSubst env in_id rhs',     out_id)
+  | cse_done         = (env,                              out_id)
+                       -- See Note [Dealing with ticks]
   | otherwise        = (extendCSEnv env rhs' id_expr', zapped_id)
   where
     id_expr'  = varToCoreExpr out_id
@@ -438,9 +460,8 @@ addBinding env in_id out_id rhs'
 
     -- Should we use SUBSTITUTE or EXTEND?
     -- See Note [CSE for bindings]
-    use_subst = case rhs' of
-                   Var {} -> True
-                   _      -> False
+    use_subst | Var {} <- rhs' = True
+              | otherwise      = False
 
 -- | Given a binder `let x = e`, this function
 -- determines whether we should add `e -> x` to the cs_map
@@ -487,6 +508,38 @@ The net effect is that for the y-binding we want to
 
 This is done by cse_bind.  I got it wrong the first time (#13367).
 
+Note [Dealing with ticks]
+~~~~~~~~~~~~~~~~~~~~~~~~~
+Ticks complicate CSE a bit, as I discovered in the fallout from
+fixing #19360.
+
+* To get more CSE-ing, we strip all the tickishFloatable ticks from
+  an expression
+  - when inserting into the cs_map (see extendCSEnv)
+  - when looking up in the cs_map (see call to lookupCSEnv in try_for_cse)
+  Quite why only the tickishFloatble ticks, I'm not quite sure.
+
+* If we get a hit in cs_map, we wrap the result in the ticks from the
+  thing we are looking up (see try_for_cse)
+
+Net result: if we get a hit, we might replace
+  let x = tick t1 (tick t2 e)
+with
+  let x = tick t1 (tick t2 y)
+where 'y' is the variable that 'e' maps to.  Now consider addBinding for
+the binding for 'x':
+
+* We can't use SUBSTITUTE because those ticks might not be trivial (we
+  use tickishIsCode in exprIsTrivial)
+
+* We should not use EXTEND, because we definitely don't want to
+  add  (tick t1 (tick t2 y)) :-> x
+  to the cs_map. Remember we strip off the ticks, so that would amount
+  to adding y :-> x, very silly.
+
+TL;DR: we do neither; hence the cse_done case in addBinding.
+
+
 Note [Delay inlining after CSE]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Suppose (#15445) we have
@@ -603,14 +656,14 @@ cseCase env scrut bndr ty alts
     combineAlts alt_env (map cse_alt alts)
   where
     ty' = substTy (csEnvSubst env) ty
-    scrut1 = tryForCSE env scrut
+    (cse_done, scrut1) = try_for_cse env scrut
 
     bndr1 = zapIdOccInfo bndr
       -- Zapping the OccInfo is needed because the extendCSEnv
       -- in cse_alt may mean that a dead case binder
       -- becomes alive, and Lint rejects that
     (env1, bndr2)    = addBinder env bndr1
-    (alt_env, bndr3) = addBinding env1 bndr bndr2 scrut1
+    (alt_env, bndr3) = addBinding env1 bndr bndr2 scrut1 cse_done
          -- addBinding: see Note [CSE for case expressions]
 
     con_target :: OutExpr
diff --git a/compiler/GHC/Core/Opt/OccurAnal.hs b/compiler/GHC/Core/Opt/OccurAnal.hs
index d4d617bf6f35..3f31ae258bfd 100644
--- a/compiler/GHC/Core/Opt/OccurAnal.hs
+++ b/compiler/GHC/Core/Opt/OccurAnal.hs
@@ -369,6 +369,14 @@ Solution:
 
 Note [Recursive bindings: the grand plan]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Loop breaking is surprisingly subtle.  First read the section 4 of
+"Secrets of the GHC inliner".  This describes our basic plan.  We
+avoid infinite inlinings by choosing loop breakers, and ensuring that
+a loop breaker cuts each loop.
+
+See also Note [Inlining and hs-boot files] in GHC.Core.ToIface, which
+deals with a closely related source of infinite loops.
+
 When we come across a binding group
   Rec { x1 = r1; ...; xn = rn }
 we treat it like this (occAnalRecBind):
@@ -379,9 +387,12 @@ we treat it like this (occAnalRecBind):
    Wrap the details in a LetrecNode, ready for SCC analysis.
    All this is done by makeNode.
 
+   The edges of this graph are the "scope edges".
+
 2. Do SCC-analysis on these Nodes:
    - Each CyclicSCC will become a new Rec
    - Each AcyclicSCC will become a new NonRec
+
    The key property is that every free variable of a binding is
    accounted for by the scope edges, so that when we are done
    everything is still in scope.
@@ -392,22 +403,11 @@ we treat it like this (occAnalRecBind):
    identify suitable loop-breakers to ensure that inlining terminates.
    This is done by occAnalRec.
 
-   4a To do so we form a new set of Nodes, with the same details, but
-      different edges, the "loop-breaker nodes". The loop-breaker nodes
-      have both more and fewer dependencies than the scope edges
-      (see Note [Choosing loop breakers])
-
-      More edges: if f calls g, and g has an active rule that mentions h
-                 then we add an edge from f -> h
+   To do so, form the loop-breaker graph, do SCC analysis. For each
+   CyclicSCC we choose a loop breaker, delete all edges to that node,
+   re-analyse the SCC, and iterate. See Note [Choosing loop breakers]
+   for the details
 
-      Fewer edges: we only include dependencies on active rules, on rule
-                   RHSs (not LHSs) and if there is an INLINE pragma only
-                   on the stable unfolding (and vice versa).  The scope
-                   edges must be much more inclusive.
-
-   4b. The "weak fvs" of a node are, by definition:
-       the scope fvs - the loop-breaker fvs
-       See Note [Weak loop breakers], and the nd_weak field of Details
 
 Note [Dead code]
 ~~~~~~~~~~~~~~~~
@@ -432,7 +432,6 @@ when 'g' no longer uses 'f' at all (eg 'f' does not occur in a RULE in
 'g'). 'occAnalBind' first consumes 'CyclicSCC g' and then it consumes
 'AcyclicSCC f', where 'body_usage' won't contain 'f'.
 
-------------------------------------------------------------
 Note [Forming Rec groups]
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 The key point about the "Forming Rec groups" step is that it /preserves
@@ -498,10 +497,9 @@ of the file.
     at any of the definitions.  This is done by Simplify.simplRecBind,
     when it calls addLetIdInfo.
 
-------------------------------------------------------------
-Note [Inline rules]
-~~~~~~~~~~~~~~~~~~~
-None of the above stuff about RULES applies to Inline Rules,
+Note [Stable unfoldings]
+~~~~~~~~~~~~~~~~~~~~~~~~
+None of the above stuff about RULES applies to a stable unfolding
 stored in a CoreUnfolding.  The unfolding, if any, is simplified
 at the same time as the regular RHS of the function (ie *not* like
 Note [Rules are visible in their own rec group]), so it should be
@@ -575,6 +573,8 @@ But watch out!  If 'fs' is not chosen as a loop breaker, we may get an infinite
   - now there's another opportunity to apply the RULE
 
 This showed up when compiling Control.Concurrent.Chan.getChanContents.
+Hence the transitive rule_fv_env stuff described in
+Note [Rules and  loop breakers].
 
 ------------------------------------------------------------
 Note [Finding join points]
@@ -840,13 +840,12 @@ occAnalRec env lvl (CyclicSCC details_s) (body_uds, binds)
   = (body_uds, binds)                   -- See Note [Dead code]
 
   | otherwise   -- At this point we always build a single Rec
-  = -- pprTrace "occAnalRec" (vcat
-    --   [ text "weak_fvs" <+> ppr weak_fvs
-    --   , text "lb nodes" <+> ppr loop_breaker_nodes])
+  = -- pprTrace "occAnalRec" (ppr loop_breaker_nodes)
     (final_uds, Rec pairs : binds)
 
   where
-    bndrs = map nd_bndr details_s
+    bndrs      = map nd_bndr details_s
+    all_simple = all nd_simple details_s
 
     ------------------------------
     -- Make the nodes for the loop-breaker analysis
@@ -856,20 +855,19 @@ occAnalRec env lvl (CyclicSCC details_s) (body_uds, binds)
     (final_uds, loop_breaker_nodes) = mkLoopBreakerNodes env lvl body_uds details_s
 
     ------------------------------
-    weak_fvs :: VarSet
-    weak_fvs = mapUnionVarSet nd_weak details_s
+    active_rule_fvs :: VarSet
+    active_rule_fvs = mapUnionVarSet nd_active_rule_fvs details_s
 
     ---------------------------
     -- Now reconstruct the cycle
     pairs :: [(Id,CoreExpr)]
-    pairs | isEmptyVarSet weak_fvs = reOrderNodes   0 weak_fvs loop_breaker_nodes []
-          | otherwise              = loopBreakNodes 0 weak_fvs loop_breaker_nodes []
-          -- If weak_fvs is empty, the loop_breaker_nodes will include
-          -- all the edges in the original scope edges [remember,
-          -- weak_fvs is the difference between scope edges and
-          -- lb-edges], so a fresh SCC computation would yield a
-          -- single CyclicSCC result; and reOrderNodes deals with
-          -- exactly that case
+    pairs | all_simple = reOrderNodes   0 active_rule_fvs loop_breaker_nodes []
+          | otherwise  = loopBreakNodes 0 active_rule_fvs loop_breaker_nodes []
+          -- In the common case when all are "simple" (no rules at all)
+          -- the loop_breaker_nodes will include all the scope edges
+          -- so a SCC computation would yield a single CyclicSCC result;
+          -- and reOrderNodes deals with exactly that case.
+          -- Saves a SCC analysis in a common case
 
 
 {- *********************************************************************
@@ -880,149 +878,183 @@ occAnalRec env lvl (CyclicSCC details_s) (body_uds, binds)
 
 {- Note [Choosing loop breakers]
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Loop breaking is surprisingly subtle.  First read the section 4 of
-"Secrets of the GHC inliner".  This describes our basic plan.
-We avoid infinite inlinings by choosing loop breakers, and
-ensuring that a loop breaker cuts each loop.
+In Step 4 in Note [Recursive bindings: the grand plan]), occAnalRec does
+loop-breaking on each CyclicSCC of the original program:
 
-See also Note [Inlining and hs-boot files] in GHC.Core.ToIface, which
-deals with a closely related source of infinite loops.
+* mkLoopBreakerNodes: Form the loop-breaker graph for that CyclicSCC
+
+* loopBreakNodes: Do SCC analysis on it
+
+* reOrderNodes: For each CyclicSCC, pick a loop breaker
+    * Delete edges to that loop breaker
+    * Do another SCC analysis on that reduced SCC
+    * Repeat
+
+To form the loop-breaker graph, we construct a new set of Nodes, the
+"loop-breaker nodes", with the same details but different edges, the
+"loop-breaker edges".  The loop-breaker nodes have both more and fewer
+dependencies than the scope edges:
+
+  More edges:
+     If f calls g, and g has an active rule that mentions h then
+     we add an edge from f -> h.  See Note [Rules and loop breakers].
+
+  Fewer edges: we only include dependencies
+     * only on /active/ rules,
+     * on rule /RHSs/ (not LHSs)
+
+The scope edges, by contrast, must be much more inclusive.
+
+The nd_simple flag tracks the common case when a binding has no RULES
+at all, in which case the loop-breaker edges will be identical to the
+scope edges.
+
+Note that in Example [eftInt], *neither* eftInt *nor* eftIntFB is
+chosen as a loop breaker, because their RHSs don't mention each other.
+And indeed both can be inlined safely.
+
+Note [inl_fvs]
+~~~~~~~~~~~~~~
+Note that the loop-breaker graph includes edges for occurrences in
+/both/ the RHS /and/ the stable unfolding.  Consider this, which actually
+occurred when compiling BooleanFormula.hs in GHC:
+
+  Rec { lvl1 = go
+      ; lvl2[StableUnf = go] = lvl1
+      ; go = ...go...lvl2... }
+
+From the point of view of infinite inlining, we need only these edges:
+   lvl1 :-> go
+   lvl2 :-> go       -- The RHS lvl1 will never be used for inlining
+   go   :-> go, lvl2
+
+But the danger is that, lacking any edge to lvl1, we'll put it at the
+end thus
+  Rec { lvl2[ StableUnf = go] = lvl1
+      ; go[LoopBreaker] = ...go...lvl2... }
+      ; lvl1[Occ=Once]  = go }
+
+And now the Simplifer will try to use PreInlineUnconditionally on lvl1
+(which occurs just once), but because it is last we won't actually
+substitute in lvl2.  Sigh.
+
+To avoid this possiblity, we include edges from lvl2 to /both/ its
+stable unfolding /and/ its RHS.  Hence the defn of inl_fvs in
+makeNode.  Maybe we could be more clever, but it's very much a corner
+case.
+
+Note [Weak loop breakers]
+~~~~~~~~~~~~~~~~~~~~~~~~~
+There is a last nasty wrinkle.  Suppose we have
+
+    Rec { f = f_rhs
+          RULE f [] = g
+
+          h = h_rhs
+          g = h
+          ...more...
+    }
+
+Remember that we simplify the RULES before any RHS (see Note
+[Rules are visible in their own rec group] above).
+
+So we must *not* postInlineUnconditionally 'g', even though
+its RHS turns out to be trivial.  (I'm assuming that 'g' is
+not chosen as a loop breaker.)  Why not?  Because then we
+drop the binding for 'g', which leaves it out of scope in the
+RULE!
+
+Here's a somewhat different example of the same thing
+    Rec { q = r
+        ; r = ...p...
+        ; p = p_rhs
+          RULE p [] = q }
+Here the RULE is "below" q, but we *still* can't postInlineUnconditionally
+q, because the RULE for p is active throughout.  So the RHS of r
+might rewrite to     r = ...q...
+So q must remain in scope in the output program!
+
+We "solve" this by:
+
+    Make q a "weak" loop breaker (OccInfo = IAmLoopBreaker True)
+    iff q is a mentioned in the RHS of an active RULE in the Rec group
 
-Fundamentally, we do SCC analysis on a graph.  For each recursive
-group we choose a loop breaker, delete all edges to that node,
-re-analyse the SCC, and iterate.
+A normal "strong" loop breaker has IAmLoopBreaker False.  So:
 
-But what is the graph?  NOT the same graph as was used for Note
-[Forming Rec groups]!  In particular, a RULE is like an equation for
+                                Inline  postInlineUnconditionally
+strong   IAmLoopBreaker False    no      no
+weak     IAmLoopBreaker True     yes     no
+         other                   yes     yes
+
+The **sole** reason for this kind of loop breaker is so that
+postInlineUnconditionally does not fire.  Ugh.
+
+Annoyingly, since we simplify the rules *first* we'll never inline
+q into p's RULE.  That trivial binding for q will hang around until
+we discard the rule.  Yuk.  But it's rare.
+
+ Note [Rules and loop breakers]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+When we form the loop-breaker graph (Step 4 in Note [Recursive
+bindings: the grand plan]), we must be careful about RULEs.
+
+For a start, we want a loop breaker to cut every cycle, so inactive
+rules play no part; we need only consider /active/ rules.
+See Note [Finding rule RHS free vars]
+
+The second point is more subtle.  A RULE is like an equation for
 'f' that is *always* inlined if it is applicable.  We do *not* disable
 rules for loop-breakers.  It's up to whoever makes the rules to make
 sure that the rules themselves always terminate.  See Note [Rules for
 recursive functions] in GHC.Core.Opt.Simplify
 
 Hence, if
-    f's RHS (or its INLINE template if it has one) mentions g, and
+    f's RHS (or its stable unfolding if it has one) mentions g, and
     g has a RULE that mentions h, and
     h has a RULE that mentions f
 
 then we *must* choose f to be a loop breaker.  Example: see Note
-[Specialisation rules].
-
-In general, take the free variables of f's RHS, and augment it with
-all the variables reachable by RULES from those starting points.  That
-is the whole reason for computing rule_fv_env in occAnalBind.  (Of
-course we only consider free vars that are also binders in this Rec
-group.)  See also Note [Finding rule RHS free vars]
+[Specialisation rules]. So out plan is this:
 
-Note that when we compute this rule_fv_env, we only consider variables
-free in the *RHS* of the rule, in contrast to the way we build the
-Rec group in the first place (Note [Rule dependency info])
+   Take the free variables of f's RHS, and augment it with all the
+   variables reachable by a transitive sequence RULES from those
+   starting points.
 
-Note that if 'g' has RHS that mentions 'w', we should add w to
-g's loop-breaker edges.  More concretely there is an edge from f -> g
-iff
-        (a) g is mentioned in f's RHS `xor` f's INLINE rhs
-            (see Note [Inline rules])
-        (b) or h is mentioned in f's RHS, and
-            g appears in the RHS of an active RULE of h
-            or a /transitive sequence/ of /active rules/ starting with h
+That is the whole reason for computing rule_fv_env in mkLoopBreakerNodes.
+Wrinkles:
 
-Why "active rules"?  See Note [Finding rule RHS free vars]
+* We only consider /active/ rules. See Note [Finding rule RHS free vars]
 
-Why "transitive sequence"?  Because active rules apply
-unconditionallly, without checking loop-breaker-ness.
-See Note [Loop breaker dependencies].
-
-Note that in Example [eftInt], *neither* eftInt *nor* eftIntFB is
-chosen as a loop breaker, because their RHSs don't mention each other.
-And indeed both can be inlined safely.
+* We need only consider free vars that are also binders in this Rec
+  group.  See also Note [Finding rule RHS free vars]
 
-Note again that the edges of the graph we use for computing loop breakers
-are not the same as the edges we use for computing the Rec blocks.
-That's why we use
-
-- makeNode             for the Rec block analysis
-- makeLoopBreakerNodes for the loop breaker analysis
-
-  * Note [Finding rule RHS free vars]
-    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    Consider this real example from Data Parallel Haskell
-         tagZero :: Array Int -> Array Tag
-         {-# INLINE [1] tagZeroes #-}
-         tagZero xs = pmap (\x -> fromBool (x==0)) xs
-
-         {-# RULES "tagZero" [~1] forall xs n.
-             pmap fromBool <blah blah> = tagZero xs #-}
-    So tagZero's RHS mentions pmap, and pmap's RULE mentions tagZero.
-    However, tagZero can only be inlined in phase 1 and later, while
-    the RULE is only active *before* phase 1.  So there's no problem.
-
-    To make this work, we look for the RHS free vars only for
-    *active* rules. That's the reason for the occ_rule_act field
-    of the OccEnv.
-
-  * Note [Weak loop breakers]
-    ~~~~~~~~~~~~~~~~~~~~~~~~~
-    There is a last nasty wrinkle.  Suppose we have
-
-        Rec { f = f_rhs
-              RULE f [] = g
-
-              h = h_rhs
-              g = h
-              ...more...
-        }
-
-    Remember that we simplify the RULES before any RHS (see Note
-    [Rules are visible in their own rec group] above).
-
-    So we must *not* postInlineUnconditionally 'g', even though
-    its RHS turns out to be trivial.  (I'm assuming that 'g' is
-    not chosen as a loop breaker.)  Why not?  Because then we
-    drop the binding for 'g', which leaves it out of scope in the
-    RULE!
-
-    Here's a somewhat different example of the same thing
-        Rec { g = h
-            ; h = ...f...
-            ; f = f_rhs
-              RULE f [] = g }
-    Here the RULE is "below" g, but we *still* can't postInlineUnconditionally
-    g, because the RULE for f is active throughout.  So the RHS of h
-    might rewrite to     h = ...g...
-    So g must remain in scope in the output program!
-
-    We "solve" this by:
-
-        Make g a "weak" loop breaker (OccInfo = IAmLoopBreaker True)
-        iff g is a "missing free variable" of the Rec group
-
-    A "missing free variable" x is one that is mentioned in an RHS or
-    INLINE or RULE of a binding in the Rec group, but where the
-    dependency on x may not show up in the loop_breaker_nodes (see
-    note [Choosing loop breakers} above).
-
-    A normal "strong" loop breaker has IAmLoopBreaker False.  So
-
-                                    Inline  postInlineUnconditionally
-   strong   IAmLoopBreaker False    no      no
-   weak     IAmLoopBreaker True     yes     no
-            other                   yes     yes
-
-    The **sole** reason for this kind of loop breaker is so that
-    postInlineUnconditionally does not fire.  Ugh.  (Typically it'll
-    inline via the usual callSiteInline stuff, so it'll be dead in the
-    next pass, so the main Ugh is the tiresome complication.)
--}
+* We only consider variables free in the *RHS* of the rule, in
+  contrast to the way we build the Rec group in the first place (Note
+  [Rule dependency info])
 
-type Binding = (Id,CoreExpr)
+* Why "transitive sequence of rules"?  Because active rules apply
+  unconditionally, without checking loop-breaker-ness.
+ See Note [Loop breaker dependencies].
 
-loopBreakNodes :: Int
-               -> VarSet        -- Binders whose dependencies may be "missing"
-                                -- See Note [Weak loop breakers]
-               -> [LetrecNode]
-               -> [Binding]             -- Append these to the end
-               -> [Binding]
-{-
+Note [Finding rule RHS free vars]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Consider this real example from Data Parallel Haskell
+     tagZero :: Array Int -> Array Tag
+     {-# INLINE [1] tagZeroes #-}
+     tagZero xs = pmap (\x -> fromBool (x==0)) xs
+
+     {-# RULES "tagZero" [~1] forall xs n.
+         pmap fromBool <blah blah> = tagZero xs #-}
+So tagZero's RHS mentions pmap, and pmap's RULE mentions tagZero.
+However, tagZero can only be inlined in phase 1 and later, while
+the RULE is only active *before* phase 1.  So there's no problem.
+
+To make this work, we look for the RHS free vars only for
+*active* rules. That's the reason for the occ_rule_act field
+of the OccEnv.
+
+Note [loopBreakNodes]
+~~~~~~~~~~~~~~~~~~~~~
 loopBreakNodes is applied to the list of nodes for a cyclic strongly
 connected component (there's guaranteed to be a cycle).  It returns
 the same nodes, but
@@ -1039,7 +1071,18 @@ that the simplifier will generally do a good job if it works from top bottom,
 recording inlinings for any Ids which aren't marked as "no-inline" as it goes.
 -}
 
+type Binding = (Id,CoreExpr)
+
+-- See Note [loopBreakNodes]
+loopBreakNodes :: Int
+               -> VarSet        -- Binders whose dependencies may be "missing"
+                                -- See Note [Weak loop breakers]
+               -> [LetrecNode]
+               -> [Binding]             -- Append these to the end
+               -> [Binding]
+
 -- Return the bindings sorted into a plausible order, and marked with loop breakers.
+-- See Note [loopBreakNodes]
 loopBreakNodes depth weak_fvs nodes binds
   = -- pprTrace "loopBreakNodes" (ppr nodes) $
     go (stronglyConnCompFromEdgedVerticesUniqR nodes)
@@ -1269,16 +1312,13 @@ data Details
                                   -- ignoring phase (ie assuming all are active)
                                   -- See Note [Forming Rec groups]
 
-       , nd_inl  :: IdSet       -- Free variables of
-                                --   the stable unfolding (if present and active)
-                                --   or the RHS (if not)
+       , nd_inl  :: IdSet       -- Free variables of the stable unfolding and the RHS
                                 -- but excluding any RULES
                                 -- This is the IdSet that may be used if the Id is inlined
 
-       , nd_weak :: IdSet       -- Binders of this Rec that are mentioned in nd_uds
-                                -- but are *not* in nd_inl.  These are the ones whose
-                                -- dependencies might not be respected by loop_breaker_nodes
-                                -- See Note [Weak loop breakers]
+       , nd_simple :: Bool      -- True iff this binding has no local RULES
+                                -- If all nodes are simple we don't need a loop-breaker
+                                -- dep-anal before reconstructing.
 
        , nd_active_rule_fvs :: IdSet    -- Variables bound in this Rec group that are free
                                         -- in the RHS of an active rule for this bndr
@@ -1291,8 +1331,8 @@ instance Outputable Details where
              (sep [ text "bndr =" <+> ppr (nd_bndr nd)
                   , text "uds =" <+> ppr (nd_uds nd)
                   , text "inl =" <+> ppr (nd_inl nd)
-                  , text "weak =" <+> ppr (nd_weak nd)
-                  , text "rule_rvs =" <+> ppr (nd_active_rule_fvs nd)
+                  , text "simple =" <+> ppr (nd_simple nd)
+                  , text "active_rule_fvs =" <+> ppr (nd_active_rule_fvs nd)
                   , text "score =" <+> ppr (nd_score nd)
              ])
 
@@ -1315,7 +1355,7 @@ makeNode :: OccEnv -> ImpRuleEdges -> VarSet
 makeNode env imp_rule_edges bndr_set (bndr, rhs)
   = DigraphNode { node_payload      = details
                 , node_key          = varUnique bndr
-                , node_dependencies = nonDetKeysUniqSet node_fvs }
+                , node_dependencies = nonDetKeysUniqSet scope_fvs }
     -- It's OK to use nonDetKeysUniqSet here as stronglyConnCompFromEdgedVerticesR
     -- is still deterministic with edges in nondeterministic order as
     -- explained in Note [Deterministic SCC] in GHC.Data.Graph.Directed.
@@ -1323,26 +1363,35 @@ makeNode env imp_rule_edges bndr_set (bndr, rhs)
     details = ND { nd_bndr            = bndr'
                  , nd_rhs             = rhs'
                  , nd_rhs_bndrs       = bndrs'
-                 , nd_uds             = rhs_usage
+                 , nd_uds             = scope_uds
                  , nd_inl             = inl_fvs
-                 , nd_weak            = node_fvs `minusVarSet` inl_fvs
+                 , nd_simple          = null rules_w_uds && null imp_rule_info
                  , nd_active_rule_fvs = active_rule_fvs
                  , nd_score           = pprPanic "makeNodeDetails" (ppr bndr) }
 
     bndr' = bndr `setIdUnfolding`      unf'
                  `setIdSpecialisation` mkRuleInfo rules'
 
-    rhs_usage = rhs_uds `andUDs` unf_uds `andUDs` rule_uds
+    inl_uds = rhs_uds `andUDs` unf_uds
+    scope_uds = inl_uds `andUDs` rule_uds
                    -- Note [Rules are extra RHSs]
                    -- Note [Rule dependency info]
-    node_fvs   = udFreeVars bndr_set rhs_usage
+    scope_fvs = udFreeVars bndr_set scope_uds
+    -- scope_fvs: all occurrences from this binder: RHS, unfolding,
+    --            and RULES, both LHS and RHS thereof, active or inactive
 
+    inl_fvs  = udFreeVars bndr_set inl_uds
+    -- inl_fvs: vars that would become free if the function was inlined.
+    -- We conservatively approximate that by thefree vars from the RHS
+    -- and the unfolding together.
+    -- See Note [inl_fvs]
+
+    mb_join_arity = isJoinId_maybe bndr
     -- Get join point info from the *current* decision
     -- We don't know what the new decision will be!
     -- Using the old decision at least allows us to
     -- preserve existing join point, even RULEs are added
     -- See Note [Join points and unfoldings/rules]
-    mb_join_arity = isJoinId_maybe bndr
 
     --------- Right hand side ---------
     -- Constructing the edges for the main Rec computation
@@ -1359,16 +1408,6 @@ makeNode env imp_rule_edges bndr_set (bndr, rhs)
     unf = realIdUnfolding bndr -- realIdUnfolding: Ignore loop-breaker-ness
                                -- here because that is what we are setting!
     (unf_uds, unf') = occAnalUnfolding rhs_env Recursive mb_join_arity unf
-    inl_uds | isStableUnfolding unf = unf_uds
-            | otherwise             = rhs_uds
-    inl_fvs = udFreeVars bndr_set inl_uds
-    -- inl_fvs: the vars that would become free if the function was inlined;
-    -- usually that means the RHS, unless the unfolding is a stable one.
-    -- Note: We could do this only for functions with an *active* unfolding
-    --       (returning emptyVarSet for an inactive one), but is_active
-    --       isn't the right thing (it tells about RULE activation),
-    --       so we'd need more plumbing
-
 
     --------- IMP-RULES --------
     is_active     = occ_rule_act env :: Activation -> Bool
@@ -1397,6 +1436,7 @@ mkLoopBreakerNodes :: OccEnv -> TopLevelFlag
                    -> [Details]
                    -> (UsageDetails, -- adjusted
                        [LetrecNode])
+-- See Note [Choosing loop breakers]
 -- This function primarily creates the Nodes for the
 -- loop-breaker SCC analysis.  More specifically:
 --   a) tag each binder with its occurrence info
@@ -1445,8 +1485,7 @@ The loop breaker dependencies of x in a recursive
 group { f1 = e1; ...; fn = en } are:
 
 - The "inline free variables" of f: the fi free in
-  either f's unfolding (if f has a stable unfolding)
-  of     f's RHS       (if not)
+  f's stable unfolding and RHS; see Note [inl_fvs]
 
 - Any fi reachable from those inline free variables by a sequence
   of RULE rewrites.  Remember, rule rewriting is not affected
diff --git a/compiler/GHC/Core/Opt/Simplify.hs b/compiler/GHC/Core/Opt/Simplify.hs
index da039a8e83c1..13f0fdc46c98 100644
--- a/compiler/GHC/Core/Opt/Simplify.hs
+++ b/compiler/GHC/Core/Opt/Simplify.hs
@@ -3996,14 +3996,17 @@ simplRules env mb_new_id rules mb_cont
       = return rule
 
     simpl_rule rule@(Rule { ru_bndrs = bndrs, ru_args = args
-                          , ru_fn = fn_name, ru_rhs = rhs })
+                          , ru_fn = fn_name, ru_rhs = rhs
+                          , ru_act = act })
       = do { (env', bndrs') <- simplBinders env bndrs
            ; let rhs_ty = substTy env' (exprType rhs)
                  rhs_cont = case mb_cont of  -- See Note [Rules and unfolding for join points]
                                 Nothing   -> mkBoringStop rhs_ty
                                 Just cont -> ASSERT2( join_ok, bad_join_msg )
                                              cont
-                 rule_env = updMode updModeForRules env'
+                 lhs_env = updMode updModeForRules env'
+                 rhs_env = updMode (updModeForStableUnfoldings act) env'
+                           -- See Note [Simplifying the RHS of a RULE]
                  fn_name' = case mb_new_id of
                               Just id -> idName id
                               Nothing -> fn_name
@@ -4018,9 +4021,18 @@ simplRules env mb_new_id rules mb_cont
                  bad_join_msg = vcat [ ppr mb_new_id, ppr rule
                                      , ppr (fmap isJoinId_maybe mb_new_id) ]
 
-           ; args' <- mapM (simplExpr rule_env) args
-           ; rhs'  <- simplExprC rule_env rhs rhs_cont
+           ; args' <- mapM (simplExpr lhs_env) args
+           ; rhs'  <- simplExprC rhs_env rhs rhs_cont
            ; return (rule { ru_bndrs = bndrs'
                           , ru_fn    = fn_name'
                           , ru_args  = args'
                           , ru_rhs   = rhs' }) }
+
+{- Note [Simplifying the RHS of a RULE]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+We can simplify the RHS of a RULE much as we do the RHS of a stable
+unfolding.  We used to use the much more conservative updModeForRules
+for the RHS as well as the LHS, but that seems more conservative
+than necesary.  Allowing some inlining might, for example, eliminate
+a binding.
+-}
\ No newline at end of file
diff --git a/compiler/GHC/Core/Opt/Simplify/Utils.hs b/compiler/GHC/Core/Opt/Simplify/Utils.hs
index 51dbc408d017..2e27466c55cd 100644
--- a/compiler/GHC/Core/Opt/Simplify/Utils.hs
+++ b/compiler/GHC/Core/Opt/Simplify/Utils.hs
@@ -883,12 +883,12 @@ simplEnvForGHCi logger dflags
 
 updModeForStableUnfoldings :: Activation -> SimplMode -> SimplMode
 -- See Note [Simplifying inside stable unfoldings]
-updModeForStableUnfoldings inline_rule_act current_mode
-  = current_mode { sm_phase      = phaseFromActivation inline_rule_act
+updModeForStableUnfoldings unf_act current_mode
+  = current_mode { sm_phase      = phaseFromActivation unf_act
                  , sm_inline     = True
                  , sm_eta_expand = False }
-                     -- sm_eta_expand: see Note [No eta expansion in stable unfoldings]
-       -- For sm_rules, just inherit; sm_rules might be "off"
+       -- sm_eta_expand: see Note [No eta expansion in stable unfoldings]
+       -- sm_rules: just inherit; sm_rules might be "off"
        -- because of -fno-enable-rewrite-rules
   where
     phaseFromActivation (ActiveAfter _ n) = Phase n
diff --git a/compiler/GHC/CoreToStg.hs b/compiler/GHC/CoreToStg.hs
index ee885eaacf4c..5a53cc933fa1 100644
--- a/compiler/GHC/CoreToStg.hs
+++ b/compiler/GHC/CoreToStg.hs
@@ -949,18 +949,33 @@ myCollectArgs expr
   where
     go (Var v)          as ts = (v, as, ts)
     go (App f a)        as ts = go f (a:as) ts
-    go (Tick t e)       as ts = ASSERT( all isTypeArg as )
+    go (Tick t e)       as ts = ASSERT2( not (tickishIsCode t) || all isTypeArg as
+                                       , ppr e $$ ppr as $$ ppr ts )
+                                -- See Note [Ticks in applications]
                                 go e as (t:ts) -- ticks can appear in type apps
     go (Cast e _)       as ts = go e as ts
     go (Lam b e)        as ts
        | isTyVar b            = go e as ts -- Note [Collect args]
     go _                _  _  = pprPanic "CoreToStg.myCollectArgs" (ppr expr)
 
--- Note [Collect args]
--- ~~~~~~~~~~~~~~~~~~~
---
--- This big-lambda case occurred following a rather obscure eta expansion.
--- It all seems a bit yukky to me.
+{- Note [Collect args]
+~~~~~~~~~~~~~~~~~~~~~~
+This big-lambda case occurred following a rather obscure eta expansion.
+It all seems a bit yukky to me.
+
+Note [Ticks in applications]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+We can get an application like
+   (tick t f) True False
+via inlining in the CorePrep pass; see Note [Inlining in CorePrep]
+in GHC.CoreToStg.Prep.  The tick does not satisfy tickishIsCode;
+the inlining-in-CorePrep happens for cpExprIsTrivial which tests
+tickishIsCode.
+
+So we test the same thing here, pushing any non-code ticks to
+the top (they don't generate any code, after all).  This showed
+up in the fallout from fixing #19360.
+-}
 
 stgArity :: Id -> HowBound -> Arity
 stgArity _ (LetBound _ arity) = arity
diff --git a/testsuite/tests/numeric/should_compile/T7116.stdout b/testsuite/tests/numeric/should_compile/T7116.stdout
index 996d391b44f1..8560ad4a1018 100644
--- a/testsuite/tests/numeric/should_compile/T7116.stdout
+++ b/testsuite/tests/numeric/should_compile/T7116.stdout
@@ -62,11 +62,9 @@ dl :: Double -> Double
  Arity=1,
  Str=<SP(U)>,
  Cpr=m1,
- Unf=Unf{Src=InlineStable, TopLvl=True, Value=True, ConLike=True,
+ Unf=Unf{Src=<vanilla>, TopLvl=True, Value=True, ConLike=True,
          WorkFree=True, Expandable=True,
-         Guidance=ALWAYS_IF(arity=1,unsat_ok=True,boring_ok=False)
-         Tmpl= \ (x [Occ=Once1!] :: Double) ->
-                 case x of { GHC.Types.D# y -> GHC.Types.D# (GHC.Prim.+## y y) }}]
+         Guidance=ALWAYS_IF(arity=0,unsat_ok=True,boring_ok=True)}]
 dl = dr
 
 -- RHS size: {terms: 8, types: 3, coercions: 0, joins: 0/0}
@@ -94,13 +92,9 @@ fl :: Float -> Float
  Arity=1,
  Str=<SP(U)>,
  Cpr=m1,
- Unf=Unf{Src=InlineStable, TopLvl=True, Value=True, ConLike=True,
+ Unf=Unf{Src=<vanilla>, TopLvl=True, Value=True, ConLike=True,
          WorkFree=True, Expandable=True,
-         Guidance=ALWAYS_IF(arity=1,unsat_ok=True,boring_ok=False)
-         Tmpl= \ (x [Occ=Once1!] :: Float) ->
-                 case x of { GHC.Types.F# y ->
-                 GHC.Types.F# (GHC.Prim.plusFloat# y y)
-                 }}]
+         Guidance=ALWAYS_IF(arity=0,unsat_ok=True,boring_ok=True)}]
 fl = fr
 
 
diff --git a/testsuite/tests/simplCore/should_compile/T19360.hs b/testsuite/tests/simplCore/should_compile/T19360.hs
new file mode 100644
index 000000000000..3998fb6d51d3
--- /dev/null
+++ b/testsuite/tests/simplCore/should_compile/T19360.hs
@@ -0,0 +1,29 @@
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE RankNTypes #-}
+
+module T19360 where
+
+import GHC.Exts (oneShot)
+import Data.Typeable (Typeable, cast)
+
+newtype SDoc = SDoc' (Bool -> Bool)
+
+data ApiAnn = ApiAnn
+
+showAstData :: D a => Bool -> a -> SDoc
+showAstData z = showAstData'
+  where
+    showAstData' :: D a => a -> SDoc
+    showAstData' x =
+      case cast x of
+         Just (a :: ApiAnn) ->
+           case z of
+             True -> showAstData' a
+             False -> SDoc' (oneShot (\_ -> False))
+         Nothing -> gmapQr showAstData' x
+
+class Typeable a => D a where
+  gmapQr :: forall r. (forall d. D d => d -> r) -> a -> r
+
+instance D ApiAnn where
+  gmapQr g ApiAnn = g ApiAnn
diff --git a/testsuite/tests/simplCore/should_compile/T8331.stderr b/testsuite/tests/simplCore/should_compile/T8331.stderr
index 030b1752985b..1c510374811c 100644
--- a/testsuite/tests/simplCore/should_compile/T8331.stderr
+++ b/testsuite/tests/simplCore/should_compile/T8331.stderr
@@ -4,6 +4,11 @@
     forall (@s)
            ($dMonadAbstractIOST :: MonadAbstractIOST (ReaderT Int (ST s))).
       useAbstractMonad @(ReaderT Int (ST s)) $dMonadAbstractIOST
-      = useAbstractMonad_$suseAbstractMonad @s
+      = (useAbstractMonad1 @s)
+        `cast` (<Int>_R
+                %<'Many>_N ->_R <Int>_R %<'Many>_N ->_R Sym (N:ST[0] <s>_N <Int>_R)
+                                ; Sym (N:ReaderT[0] <Int>_R <ST s>_R <Int>_N)
+                :: Coercible
+                     (Int -> Int -> STRep s Int) (Int -> ReaderT Int (ST s) Int))
 
 
diff --git a/testsuite/tests/simplCore/should_compile/all.T b/testsuite/tests/simplCore/should_compile/all.T
index b3936d11e6ee..a2783f33e1ab 100644
--- a/testsuite/tests/simplCore/should_compile/all.T
+++ b/testsuite/tests/simplCore/should_compile/all.T
@@ -346,3 +346,4 @@ test('T18668', normal, compile, ['-dsuppress-uniques'])
 test('T18995', [ grep_errmsg(r'print') ], compile, ['-O -ddump-simpl -dsuppress-uniques'])
 test('T19168', normal, compile, [''])
 test('T19246', only_ways(['optasm']), multimod_compile, ['T19246', '-v0 -ddump-rules'])
+test('T19360', only_ways(['optasm']), compile, [''])
-- 
GitLab