diff --git a/compiler/GHC/Core/Opt/Arity.hs b/compiler/GHC/Core/Opt/Arity.hs index 84b46860c4ff5ca477a9b294df836b1e89bd1e17..ea98b54d52d2279b82be6c78f62c4e337d30b7f9 100644 --- a/compiler/GHC/Core/Opt/Arity.hs +++ b/compiler/GHC/Core/Opt/Arity.hs @@ -1270,8 +1270,14 @@ arityLam id (AT oss div) floatIn :: Cost -> ArityType -> ArityType -- We have something like (let x = E in b), -- where b has the given arity type. -floatIn IsCheap at = at -floatIn IsExpensive at = addWork at +-- NB: be as lazy as possible in the Cost-of-E argument; +-- we can often get away without ever looking at it +-- See Note [Care with nested expressions] +floatIn ch at@(AT lams div) + = case lams of + [] -> at + (IsExpensive,_):_ -> at + (_,os):lams -> AT ((ch,os):lams) div addWork :: ArityType -> ArityType -- Add work to the outermost level of the arity type @@ -1354,6 +1360,25 @@ That gives \1.T (see Note [Combining case branches: andWithTail], first bullet). So 'go2' gets an arityType of \(C?)(C1).T, which is what we want. +Note [Care with nested expressions] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Consider + arityType (Just <big-expressions>) +We will take + arityType Just = AT [(IsCheap,os)] topDiv +and then do + arityApp (AT [(IsCheap os)] topDiv) (exprCost <big-expression>) +The result will be AT [] topDiv. It doesn't matter what <big-expresison> +is! The same is true of + arityType (let x = <rhs> in <body>) +where the cost of <rhs> doesn't matter unless <body> has a useful +arityType. + +TL;DR in `floatIn`, do not to look at the Cost argument until you have to. + +I found this when looking at #24471, although I don't think it was really +the main culprit. + Note [Combining case branches: andWithTail] ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ When combining the ArityTypes for two case branches (with andArityType) @@ -1576,7 +1601,7 @@ arityType env (Case scrut bndr _ alts) = alts_type | otherwise -- In the remaining cases we may not push - = addWork alts_type -- evaluation of the scrutinee in + = addWork alts_type -- evaluation of the scrutinee in where env' = delInScope env bndr arity_type_alt (Alt _con bndrs rhs) = arityType (delInScopeList env' bndrs) rhs diff --git a/compiler/GHC/Core/Opt/SetLevels.hs b/compiler/GHC/Core/Opt/SetLevels.hs index f089859678b7bff3f4c3a418997a08afd788a0ae..fe336e528632f549035ddb694b856a819979f5e4 100644 --- a/compiler/GHC/Core/Opt/SetLevels.hs +++ b/compiler/GHC/Core/Opt/SetLevels.hs @@ -643,7 +643,8 @@ lvlMFE env strict_ctxt e@(_, AnnCase {}) = lvlExpr env e -- See Note [Case MFEs] lvlMFE env strict_ctxt ann_expr - | floatTopLvlOnly env && not (isTopLvl dest_lvl) + | not float_me + || floatTopLvlOnly env && not (isTopLvl dest_lvl) -- Only floating to the top level is allowed. || hasFreeJoin env fvs -- If there is a free join, don't float -- See Note [Free join points] @@ -652,8 +653,9 @@ lvlMFE env strict_ctxt ann_expr -- how it will be represented at runtime. -- See Note [Representation polymorphism invariants] in GHC.Core || notWorthFloating expr abs_vars - || not float_me - = -- Don't float it out + -- Test notWorhtFloating last; + -- See Note [Large free-variable sets] + = -- Don't float it out lvlExpr env ann_expr | float_is_new_lam || exprIsTopLevelBindable expr expr_ty @@ -822,6 +824,28 @@ early loses opportunities for RULES which (needless to say) are important in some nofib programs (gcd is an example). [SPJ note: I think this is obsolete; the flag seems always on.] +Note [Large free-variable sets] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +In #24471 we had something like + x1 = I# 1 + ... + x1000 = I# 1000 + foo = f x1 (f x2 (f x3 ....)) +So every sub-expression in `foo` has lots and lots of free variables. But +none of these sub-expressions float anywhere; the entire float-out pass is a +no-op. + +In lvlMFE, we want to find out quickly if the MFE is not-floatable; that is +the common case. In #24471 it turned out that we were testing `abs_vars` (a +relatively complicated calculation that takes at least O(n-free-vars) time to +compute) for every sub-expression. + +Better instead to test `float_me` early. That still involves looking at +dest_lvl, which means looking at every free variable, but the constant factor +is a lot better. + +ToDo: find a way to fix the bad asymptotic complexity. + Note [Floating join point bindings] ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Mostly we only float a join point if it can /stay/ a join point. But diff --git a/compiler/GHC/CoreToStg/Prep.hs b/compiler/GHC/CoreToStg/Prep.hs index 558baac28ad56913622c61945222e81ed819a653..495ab54fb510fee0aacacd01d81cf0dfc78dd9a9 100644 --- a/compiler/GHC/CoreToStg/Prep.hs +++ b/compiler/GHC/CoreToStg/Prep.hs @@ -1469,8 +1469,7 @@ cpeArg env dmd arg = do { (floats1, arg1) <- cpeRhsE env arg -- arg1 can be a lambda ; let arg_ty = exprType arg1 is_unlifted = isUnliftedType arg_ty - dec = wantFloatLocal NonRecursive dmd is_unlifted - floats1 arg1 + dec = wantFloatLocal NonRecursive dmd is_unlifted floats1 arg1 ; (floats2, arg2) <- executeFloatDecision dec floats1 arg1 -- Else case: arg1 might have lambdas, and we can't -- put them inside a wrapBinds @@ -1482,23 +1481,29 @@ cpeArg env dmd arg then return (floats2, arg2) else do { v <- newVar arg_ty -- See Note [Eta expansion of arguments in CorePrep] - ; let arg3 = cpeEtaExpandArg env arg2 + ; let arity = cpeArgArity env dec arg2 + arg3 = cpeEtaExpand arity arg2 arg_float = mkNonRecFloat env dmd is_unlifted v arg3 ; return (snocFloat floats2 arg_float, varToCoreExpr v) } } -cpeEtaExpandArg :: CorePrepEnv -> CoreArg -> CoreArg +cpeArgArity :: CorePrepEnv -> FloatDecision -> CoreArg -> Arity -- ^ See Note [Eta expansion of arguments in CorePrep] -cpeEtaExpandArg env arg = cpeEtaExpand arity arg - where - arity | Just ao <- cp_arityOpts (cpe_config env) -- Just <=> -O1 or -O2 - , not (has_join_in_tail_context arg) +-- Returning 0 means "no eta-expansion"; see cpeEtaExpand +cpeArgArity env float_decision arg + | FloatNone <- float_decision + = 0 -- Crucial short-cut + -- See wrinkle (EA2) in Note [Eta expansion of arguments in CorePrep] + + | Just ao <- cp_arityOpts (cpe_config env) -- Just <=> -O1 or -O2 + , not (has_join_in_tail_context arg) -- See Wrinkle (EA1) of Note [Eta expansion of arguments in CorePrep] - = case exprEtaExpandArity ao arg of - Nothing -> 0 - Just at -> arityTypeArity at - | otherwise - = exprArity arg -- this is cheap enough for -O0 + = case exprEtaExpandArity ao arg of + Nothing -> 0 + Just at -> arityTypeArity at + + | otherwise + = exprArity arg -- this is cheap enough for -O0 has_join_in_tail_context :: CoreExpr -> Bool -- ^ Identify the cases where we'd generate invalid `CpeApp`s as described in @@ -1510,34 +1515,10 @@ has_join_in_tail_context (Tick _ e) = has_join_in_tail_context e has_join_in_tail_context (Case _ _ _ alts) = any has_join_in_tail_context (rhssOfAlts alts) has_join_in_tail_context _ = False -{- -Note [Eta expansion of arguments with join heads] -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -See Note [Eta expansion for join points] in GHC.Core.Opt.Arity -Eta expanding the join point would introduce crap that we can't -generate code for - ------------------------------------------------------------------------------- --- Building the saturated syntax --- --------------------------------------------------------------------------- - -Note [Eta expansion of hasNoBinding things in CorePrep] -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -maybeSaturate deals with eta expanding to saturate things that can't deal with -unsaturated applications (identified by 'hasNoBinding', currently -foreign calls, unboxed tuple/sum constructors, and representation-polymorphic -primitives such as 'coerce' and 'unsafeCoerce#'). - -Historical Note: Note that eta expansion in CorePrep used to be very fragile -due to the "prediction" of CAFfyness that we used to make during tidying. -We previously saturated primop -applications here as well but due to this fragility (see #16846) we now deal -with this another way, as described in Note [Primop wrappers] in GHC.Builtin.PrimOps. --} - maybeSaturate :: Id -> CpeApp -> Int -> [CoreTickish] -> UniqSM CpeRhs maybeSaturate fn expr n_args unsat_ticks | hasNoBinding fn -- There's no binding + -- See Note [Eta expansion of hasNoBinding things in CorePrep] = return $ wrapLamBody (\body -> foldr mkTick body unsat_ticks) sat_expr | mark_arity > 0 -- A call-by-value function. See Note [CBV Function Ids] @@ -1567,24 +1548,14 @@ maybeSaturate fn expr n_args unsat_ticks fn_arity = idArity fn excess_arity = (max fn_arity mark_arity) - n_args sat_expr = cpeEtaExpand excess_arity expr - applied_marks = n_args >= (length . dropWhile (not . isMarkedCbv) . reverse . expectJust "maybeSaturate" $ (idCbvMarks_maybe fn)) + applied_marks = n_args >= (length . dropWhile (not . isMarkedCbv) . + reverse . expectJust "maybeSaturate" $ (idCbvMarks_maybe fn)) -- For join points we never eta-expand (See Note [Do not eta-expand join points]) - -- so we assert all arguments that need to be passed cbv are visible so that the backend can evalaute them if required.. -{- -************************************************************************ -* * - Simple GHC.Core operations -* * -************************************************************************ --} + -- so we assert all arguments that need to be passed cbv are visible so that the + -- backend can evalaute them if required.. -{- --- ----------------------------------------------------------------------------- --- Eta reduction --- ----------------------------------------------------------------------------- - -Note [Eta expansion] -~~~~~~~~~~~~~~~~~~~~~ +{- Note [Eta expansion] +~~~~~~~~~~~~~~~~~~~~~~~ Eta expand to match the arity claimed by the binder Remember, CorePrep must not change arity @@ -1603,6 +1574,19 @@ NB2: we have to be careful that the result of etaExpand doesn't an SCC note - we're now careful in etaExpand to make sure the SCC is pushed inside any new lambdas that are generated. +Note [Eta expansion of hasNoBinding things in CorePrep] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +maybeSaturate deals with eta expanding to saturate things that can't deal +with unsaturated applications (identified by 'hasNoBinding', currently +foreign calls, unboxed tuple/sum constructors, and representation-polymorphic +primitives such as 'coerce' and 'unsafeCoerce#'). + +Historical Note: Note that eta expansion in CorePrep used to be very fragile +due to the "prediction" of CAFfyness that we used to make during tidying. We +previously saturated primop applications here as well but due to this +fragility (see #16846) we now deal with this another way, as described in +Note [Primop wrappers] in GHC.Builtin.PrimOps. + Note [Eta expansion and the CorePrep invariants] ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ It turns out to be much much easier to do eta expansion @@ -1685,6 +1669,22 @@ There is a nasty Wrinkle: This scenario occurs rarely; hence it's OK to generate sub-optimal code. The alternative would be to fix Note [Eta expansion for join points], but that's quite challenging due to unfoldings of (recursive) join points. + +(EA2) In cpeArgArity, if float_decision = FloatNone) the `arg` will look like + let <binds> in rhs + where <binds> is non-empty and can't be floated out of a lazy context (see + `wantFloatLocal`). So we can't eta-expand it anyway, so we can return 0 + forthwith. Without this short-cut we will call exprEtaExpandArity on the + `arg`, and <binds> might be enormous. exprEtaExpandArity be very expensive + on this: it uses arityType, and may look at <binds>. + + On the other hand, if float_decision = FloatAll, there will be no + let-bindings around 'arg'; they will have floated out. So + exprEtaExpandArity is cheap. + + This can make a huge difference on deeply nested expressions like + f (f (f (f (f ...)))) + #24471 is a good example, where Prep took 25% of compile time! -} cpeEtaExpand :: Arity -> CpeRhs -> CpeRhs @@ -1899,7 +1899,7 @@ instance Outputable FloatInfo where -- See Note [Floating in CorePrep] -- and Note [BindInfo and FloatInfo] data FloatingBind - = Float !CoreBind !BindInfo !FloatInfo + = Float !CoreBind !BindInfo !FloatInfo -- Never a join-point binding | UnsafeEqualityCase !CoreExpr !CoreBndr !AltCon ![CoreBndr] | FloatTick CoreTickish @@ -2126,19 +2126,16 @@ data FloatDecision | FloatAll executeFloatDecision :: FloatDecision -> Floats -> CpeRhs -> UniqSM (Floats, CpeRhs) -executeFloatDecision dec floats rhs = do - let (float,stay) = case dec of - _ | isEmptyFloats floats -> (emptyFloats,emptyFloats) - FloatNone -> (emptyFloats, floats) - FloatAll -> (floats, emptyFloats) - -- Wrap `stay` around `rhs`. - -- NB: `rhs` might have lambdas, and we can't - -- put them inside a wrapBinds, which expects a `CpeBody`. - if isEmptyFloats stay -- Fast path where we don't need to call `rhsToBody` - then return (float, rhs) - else do - (floats', body) <- rhsToBody rhs - return (float, wrapBinds stay $ wrapBinds floats' body) +executeFloatDecision dec floats rhs + = case dec of + FloatAll -> return (floats, rhs) + FloatNone + | isEmptyFloats floats -> return (emptyFloats, rhs) + | otherwise -> do { (floats', body) <- rhsToBody rhs + ; return (emptyFloats, wrapBinds floats $ + wrapBinds floats' body) } + -- FloatNone case: `rhs` might have lambdas, and we can't + -- put them inside a wrapBinds, which expects a `CpeBody`. wantFloatTop :: Floats -> FloatDecision wantFloatTop fs diff --git a/testsuite/tests/perf/compiler/T24471.hs b/testsuite/tests/perf/compiler/T24471.hs new file mode 100644 index 0000000000000000000000000000000000000000..ca77dd72aa4e5cbc847e74b930c7caafdb752b14 --- /dev/null +++ b/testsuite/tests/perf/compiler/T24471.hs @@ -0,0 +1,8 @@ +{-# LANGUAGE TemplateHaskell #-} +module T24471 where + +import T24471a + +{-# OPAQUE foo #-} +foo :: (List_ Int a -> a) -> a +foo alg = $$(between [|| alg ||] 0 1000) diff --git a/testsuite/tests/perf/compiler/T24471a.hs b/testsuite/tests/perf/compiler/T24471a.hs new file mode 100644 index 0000000000000000000000000000000000000000..418df13675766bc0107d96cc456e6fe394723432 --- /dev/null +++ b/testsuite/tests/perf/compiler/T24471a.hs @@ -0,0 +1,8 @@ +{-# LANGUAGE TemplateHaskell #-} +module T24471a where + +data List_ a f = Nil_ | Cons_ a f deriving Functor + +between alg a b + | a == b = [|| $$alg Nil_ ||] + | otherwise = [|| $$alg (Cons_ a $$(between alg (a + 1) b)) ||] diff --git a/testsuite/tests/perf/compiler/all.T b/testsuite/tests/perf/compiler/all.T index 4c19400b8899abc24e3050247f43f6119c49ada3..60c59481989c659e00fa48ccd387b9a13353bd89 100644 --- a/testsuite/tests/perf/compiler/all.T +++ b/testsuite/tests/perf/compiler/all.T @@ -712,3 +712,7 @@ test ('LookupFusion', [collect_stats('bytes allocated',2), when(wordsize(32), skip)], compile_and_run, ['-O2 -package base']) + +test('T24471', + [req_th, collect_compiler_stats('all', 5)], + multimod_compile, ['T24471', '-v0 -O'])