From 9b4129a580e6c1d18197ef2ed3a8b89d52a2b133 Mon Sep 17 00:00:00 2001 From: Andreas Klebinger <klebinger.andreas@gmx.at> Date: Thu, 25 Apr 2024 13:08:21 +0200 Subject: [PATCH] -fprof-late: Only insert cost centres on functions/non-workfree cafs. They are usually useless and doing so for data values comes with a large compile time/code size overhead. Fixes #24103 --- compiler/GHC/Core/LateCC.hs | 8 +++-- compiler/GHC/Core/LateCC/TopLevelBinds.hs | 40 ++++++++++++++++++----- compiler/GHC/Core/LateCC/Types.hs | 2 +- compiler/GHC/Core/Type.hs | 2 ++ compiler/GHC/Driver/Main.hs | 2 +- compiler/GHC/Types/RepType.hs | 3 ++ docs/users_guide/9.12.1-notes.rst | 10 ++++++ docs/users_guide/profiling.rst | 6 ++-- 8 files changed, 58 insertions(+), 15 deletions(-) diff --git a/compiler/GHC/Core/LateCC.hs b/compiler/GHC/Core/LateCC.hs index f7dbb7f05c1b..8b3bcc65fd34 100644 --- a/compiler/GHC/Core/LateCC.hs +++ b/compiler/GHC/Core/LateCC.hs @@ -21,6 +21,7 @@ import GHC.Types.SrcLoc import GHC.Utils.Error import GHC.Utils.Logger import GHC.Utils.Outputable +import GHC.Types.RepType (mightBeFunTy) -- | Late cost center insertion logic used by the driver addLateCostCenters :: @@ -78,8 +79,11 @@ addLateCostCenters logger LateCCConfig{..} core_binds = do top_level_cc_pred :: CoreExpr -> Bool top_level_cc_pred = case lateCCConfig_whichBinds of - LateCCAllBinds -> - const True + LateCCBinds -> \rhs -> + -- Make sure we record any functions. Even if it's something like `f = g`. + mightBeFunTy (exprType rhs) || + -- If the RHS is a CAF doing work also insert a CC. + not (exprIsWorkFree rhs) LateCCOverloadedBinds -> isOverloadedTy . exprType LateCCNone -> diff --git a/compiler/GHC/Core/LateCC/TopLevelBinds.hs b/compiler/GHC/Core/LateCC/TopLevelBinds.hs index 299cf4040e8e..6c215b78b1c5 100644 --- a/compiler/GHC/Core/LateCC/TopLevelBinds.hs +++ b/compiler/GHC/Core/LateCC/TopLevelBinds.hs @@ -3,16 +3,18 @@ module GHC.Core.LateCC.TopLevelBinds where import GHC.Prelude -import GHC.Core --- import GHC.Core.LateCC import GHC.Core.LateCC.Types import GHC.Core.LateCC.Utils + +import GHC.Core import GHC.Core.Opt.Monad import GHC.Driver.DynFlags import GHC.Types.Id import GHC.Types.Name import GHC.Unit.Module.ModGuts +import Data.Maybe + {- Note [Collecting late cost centres] ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Usually cost centres defined by a module are collected @@ -26,7 +28,7 @@ us from collecting them here when we run this pass before tidy. Note [Adding late cost centres to top level bindings] ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The basic idea is very simple. For every top level binder +The basic idea is very simple. For a top level binder `f = rhs` we compile it as if the user had written `f = {-# SCC f #-} rhs`. @@ -37,6 +39,21 @@ might inhibit optimizations at the call site. For this reason we provide flags for both approaches as they have different tradeoffs. +To reduce overhead we ignore workfree bindings because they don't contribute +meaningfully to a performance profile. This reduces code size massively as it +allows us to allocate definitions like `val = Just 32` at compile time instead +of turning them into a CAF of the form `val = <scc val> let x = Just 32 in x` which +would be the alternative. + +We make an exception for rhss with function types. This allows us to get +cost centres on eta-reduced definitions like `f = g`. By putting a tick onto +`f`s rhs we end up with + + f = \eta1 eta2 ... etan -> + <scc f> g eta1 ... etan + +Which can make it easier to understand call graphs of an application. + We also don't add a cost centre for any binder that is a constructor worker or wrapper. These will never meaningfully enrich the resulting profile so we improve efficiency by omitting those. @@ -89,15 +106,20 @@ topLevelBindsCC pred core_bind = doBndr :: Id -> CoreExpr -> LateCCM s CoreExpr doBndr bndr rhs - -- Cost centres on constructor workers are pretty much useless - -- so we don't emit them if we are looking at the rhs of a constructor - -- binding. - | Just _ <- isDataConId_maybe bndr = pure rhs - | otherwise = if pred rhs then addCC bndr rhs else pure rhs + -- Not a constructor worker. + -- Cost centres on constructor workers are pretty much useless so we don't emit them + -- if we are looking at the rhs of a constructor binding. + | isNothing (isDataConId_maybe bndr) + , pred rhs + = addCC bndr rhs + | otherwise = pure rhs -- We want to put the cost centre below the lambda as we only care about - -- executions of the RHS. + -- executions of the RHS. Note that the lambdas might be hidden under ticks + -- or casts. So look through these as well. addCC :: Id -> CoreExpr -> LateCCM s CoreExpr + addCC bndr (Cast rhs co) = pure Cast <*> addCC bndr rhs <*> pure co + addCC bndr (Tick t rhs) = (Tick t) <$> addCC bndr rhs addCC bndr (Lam b rhs) = Lam b <$> addCC bndr rhs addCC bndr rhs = do let name = idName bndr diff --git a/compiler/GHC/Core/LateCC/Types.hs b/compiler/GHC/Core/LateCC/Types.hs index ca9ccc7b29c3..9956df7a2b11 100644 --- a/compiler/GHC/Core/LateCC/Types.hs +++ b/compiler/GHC/Core/LateCC/Types.hs @@ -34,7 +34,7 @@ data LateCCConfig = -- | The types of top-level bindings we support adding cost centers to. data LateCCBindSpec = LateCCNone - | LateCCAllBinds + | LateCCBinds | LateCCOverloadedBinds -- | Late cost centre insertion environment diff --git a/compiler/GHC/Core/Type.hs b/compiler/GHC/Core/Type.hs index 8f8d9d554e5a..405ecb7b3532 100644 --- a/compiler/GHC/Core/Type.hs +++ b/compiler/GHC/Core/Type.hs @@ -1978,6 +1978,8 @@ isPiTy ty = case coreFullView ty of _ -> False -- | Is this a function? +-- Note: `forall {b}. Show b => b -> IO b` will not be considered a function by this function. +-- It would merely be a forall wrapping a function type. isFunTy :: Type -> Bool isFunTy ty | FunTy {} <- coreFullView ty = True diff --git a/compiler/GHC/Driver/Main.hs b/compiler/GHC/Driver/Main.hs index eacebf2fbe23..fb7df9f04145 100644 --- a/compiler/GHC/Driver/Main.hs +++ b/compiler/GHC/Driver/Main.hs @@ -1805,7 +1805,7 @@ hscGenHardCode hsc_env cgguts location output_filename = do if gopt Opt_ProfLateInlineCcs dflags then LateCCNone else if gopt Opt_ProfLateCcs dflags then - LateCCAllBinds + LateCCBinds else if gopt Opt_ProfLateOverloadedCcs dflags then LateCCOverloadedBinds else diff --git a/compiler/GHC/Types/RepType.hs b/compiler/GHC/Types/RepType.hs index d1749c76f386..91b1a475c2e8 100644 --- a/compiler/GHC/Types/RepType.hs +++ b/compiler/GHC/Types/RepType.hs @@ -693,6 +693,9 @@ mightBeFunTy :: Type -> Bool -- AK: It would be nice to figure out and document the difference -- between this and isFunTy at some point. mightBeFunTy ty + -- Currently ghc has no unlifted functions. + | definitelyUnliftedType ty + = False | [BoxedRep _] <- typePrimRep ty , Just tc <- tyConAppTyCon_maybe (unwrapType ty) , isDataTyCon tc diff --git a/docs/users_guide/9.12.1-notes.rst b/docs/users_guide/9.12.1-notes.rst index 5a3d40b2f477..fb3ee09d41c1 100644 --- a/docs/users_guide/9.12.1-notes.rst +++ b/docs/users_guide/9.12.1-notes.rst @@ -34,6 +34,16 @@ See ``Note [The importance of tracking free coercion variables]`` in ``GHC.Core.TyCo.Rep``, :ref:`constraint-solving-with-plugins` and the migration guide. +- The flag `-fprof-late` will no longer prevent top level constructors from being statically allocated. + + It used to be the case that we would add a cost centre for bindings like `foo = Just bar`. + This turned the binding into a caf that would allocate the constructor on first evaluation. + + However without the cost centre `foo` can be allocated at compile time. This reduces code-bloat and + reduces overhead for short-running applications. + + The tradeoff is that calling `whoCreated` on top level value definitions like `foo` will be less informative. + GHCi ~~~~ diff --git a/docs/users_guide/profiling.rst b/docs/users_guide/profiling.rst index bf10204df665..c0c8d70f1ebb 100644 --- a/docs/users_guide/profiling.rst +++ b/docs/users_guide/profiling.rst @@ -483,10 +483,12 @@ of your profiled program will be different to that of the unprofiled one. :since: 9.4.1 - Adds an automatic ``SCC`` annotation to all top level bindings late in the compilation pipeline after - the optimizer has run and unfoldings have been created. This means these cost centres will not interfere with core-level optimizations + Adds an automatic ``SCC`` annotation to all top level bindings which might perform work. + This is done late in the compilation pipeline after the optimizer has run and unfoldings have been created. + This means these cost centres will not interfere with core-level optimizations and the resulting profile will be closer to the performance profile of an optimized non-profiled executable. + While the results of this are generally informative, some of the compiler internal names will leak into the profile. Further if a function is inlined into a use site it's costs will be counted against the caller's cost center. -- GitLab