diff --git a/compiler/GHC/Core/LateCC.hs b/compiler/GHC/Core/LateCC.hs index 32db6c4b540e2b71b14dc5c81b19da8b8c81fa50..f7dbb7f05c1b104a37265b2b29c3272c539f2c33 100644 --- a/compiler/GHC/Core/LateCC.hs +++ b/compiler/GHC/Core/LateCC.hs @@ -1,164 +1,90 @@ -{-# LANGUAGE DerivingStrategies #-} -{-# LANGUAGE TupleSections #-} +{-# LANGUAGE RecordWildCards #-} --- | Adds cost-centers after the core piple has run. +-- | Adds cost-centers after the core pipline has run. module GHC.Core.LateCC - ( addLateCostCentresMG - , addLateCostCentresPgm - , addLateCostCentres -- Might be useful for API users - , Env(..) + ( -- * Inserting cost centres + addLateCostCenters ) where -import Control.Applicative -import Control.Monad -import qualified Data.Set as S - import GHC.Prelude -import GHC.Types.CostCentre -import GHC.Types.CostCentre.State -import GHC.Types.Name hiding (varName) -import GHC.Types.Tickish -import GHC.Unit.Module.ModGuts -import GHC.Types.Var -import GHC.Unit.Types -import GHC.Data.FastString -import GHC.Core -import GHC.Core.Opt.Monad -import GHC.Core.Utils (mkTick) -import GHC.Types.Id -import GHC.Driver.DynFlags +import GHC.Core +import GHC.Core.LateCC.OverloadedCalls +import GHC.Core.LateCC.TopLevelBinds +import GHC.Core.LateCC.Types +import GHC.Core.LateCC.Utils +import GHC.Core.Seq +import qualified GHC.Data.Strict as Strict +import GHC.Core.Utils +import GHC.Tc.Utils.TcType +import GHC.Types.SrcLoc +import GHC.Utils.Error import GHC.Utils.Logger import GHC.Utils.Outputable -import GHC.Utils.Misc -import GHC.Utils.Error (withTiming) -import GHC.Utils.Monad.State.Strict - - -{- Note [Collecting late cost centres] -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Usually cost centres defined by a module are collected -during tidy by collectCostCentres. However with `-fprof-late` -we insert cost centres after inlining. So we keep a list of -all the cost centres we inserted and combine that with the list -of cost centres found during tidy. - -To avoid overhead when using -fprof-inline there is a flag to stop -us from collecting them here when we run this pass before tidy. - -Note [Adding late cost centres] -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The basic idea is very simple. For every top level binder -`f = rhs` we compile it as if the user had written -`f = {-# SCC f #-} rhs`. - -If we do this after unfoldings for `f` have been created this -doesn't impact core-level optimizations at all. If we do it -before the cost centre will be included in the unfolding and -might inhibit optimizations at the call site. For this reason -we provide flags for both approaches as they have different -tradeoffs. - -We also don't add a cost centre for any binder that is a constructor -worker or wrapper. These will never meaningfully enrich the resulting -profile so we improve efficiency by omitting those. - --} - -addLateCostCentresMG :: ModGuts -> CoreM ModGuts -addLateCostCentresMG guts = do - dflags <- getDynFlags - let env :: Env - env = Env - { thisModule = mg_module guts - , countEntries = gopt Opt_ProfCountEntries dflags - , collectCCs = False -- See Note [Collecting late cost centres] - } - let guts' = guts { mg_binds = fstOf3 (addLateCostCentres env (mg_binds guts)) - } - return guts' - -addLateCostCentresPgm :: DynFlags -> Logger -> Module -> CoreProgram -> IO (CoreProgram, S.Set CostCentre, CostCentreState) -addLateCostCentresPgm dflags logger mod binds = - withTiming logger - (text "LateCC"<+>brackets (ppr mod)) - (\(a,b,c) -> a `seqList` (b `seq` (c `seq` ()))) $ do - let env = Env - { thisModule = mod - , countEntries = gopt Opt_ProfCountEntries dflags - , collectCCs = True -- See Note [Collecting late cost centres] - } - (binds', ccs, cc_state) = addLateCostCentres env binds - when (dopt Opt_D_dump_late_cc dflags || dopt Opt_D_verbose_core2core dflags) $ - putDumpFileMaybe logger Opt_D_dump_late_cc "LateCC" FormatCore (vcat (map ppr binds')) - return (binds', ccs, cc_state) -addLateCostCentres :: Env -> CoreProgram -> (CoreProgram, S.Set CostCentre, CostCentreState) -addLateCostCentres env binds = - let (binds', state) = runState (mapM (doBind env) binds) initLateCCState - in (binds', lcs_ccs state, lcs_state state) - - -doBind :: Env -> CoreBind -> M CoreBind -doBind env (NonRec b rhs) = NonRec b <$> doBndr env b rhs -doBind env (Rec bs) = Rec <$> mapM doPair bs +-- | Late cost center insertion logic used by the driver +addLateCostCenters :: + Logger + -- ^ Logger + -> LateCCConfig + -- ^ Late cost center configuration + -> CoreProgram + -- ^ The program + -> IO (CoreProgram, LateCCState (Strict.Maybe SrcSpan)) +addLateCostCenters logger LateCCConfig{..} core_binds = do + + -- If top-level late CCs are enabled via either -fprof-late or + -- -fprof-late-overloaded, add them + (top_level_cc_binds, top_level_late_cc_state) <- + case lateCCConfig_whichBinds of + LateCCNone -> + return (core_binds, initLateCCState ()) + _ -> + withTiming + logger + (text "LateTopLevelCCs" <+> brackets (ppr this_mod)) + (\(binds, late_cc_state) -> seqBinds binds `seq` late_cc_state `seq` ()) + $ {-# SCC lateTopLevelCCs #-} do + pure $ + doLateCostCenters + lateCCConfig_env + (initLateCCState ()) + (topLevelBindsCC top_level_cc_pred) + core_binds + + -- If overloaded call CCs are enabled via -fprof-late-overloaded-calls, add + -- them + (late_cc_binds, late_cc_state) <- + if lateCCConfig_overloadedCalls then + withTiming + logger + (text "LateOverloadedCallsCCs" <+> brackets (ppr this_mod)) + (\(binds, late_cc_state) -> seqBinds binds `seq` late_cc_state `seq` ()) + $ {-# SCC lateoverloadedCallsCCs #-} do + pure $ + doLateCostCenters + lateCCConfig_env + (top_level_late_cc_state { lateCCState_extra = Strict.Nothing }) + overloadedCallsCC + top_level_cc_binds + else + return + ( top_level_cc_binds + , top_level_late_cc_state { lateCCState_extra = Strict.Nothing } + ) + + return (late_cc_binds, late_cc_state) where - doPair :: ((Id, CoreExpr) -> M (Id, CoreExpr)) - doPair (b,rhs) = (b,) <$> doBndr env b rhs - -doBndr :: Env -> Id -> CoreExpr -> M CoreExpr -doBndr env bndr rhs - -- Cost centres on constructor workers are pretty much useless - -- so we don't emit them if we are looking at the rhs of a constructor - -- binding. - | Just _ <- isDataConId_maybe bndr = pure rhs - | otherwise = doBndr' env bndr rhs - - --- We want to put the cost centre below the lambda as we only care about executions of the RHS. -doBndr' :: Env -> Id -> CoreExpr -> State LateCCState CoreExpr -doBndr' env bndr (Lam b rhs) = Lam b <$> doBndr' env bndr rhs -doBndr' env bndr rhs = do - let name = idName bndr - name_loc = nameSrcSpan name - cc_name = getOccFS name - count = countEntries env - cc_flavour <- getCCFlavour cc_name - let cc_mod = thisModule env - bndrCC = NormalCC cc_flavour cc_name cc_mod name_loc - note = ProfNote bndrCC count True - addCC env bndrCC - return $ mkTick note rhs - -data LateCCState = LateCCState - { lcs_state :: !CostCentreState - , lcs_ccs :: S.Set CostCentre - } -type M = State LateCCState - -initLateCCState :: LateCCState -initLateCCState = LateCCState newCostCentreState mempty - -getCCFlavour :: FastString -> M CCFlavour -getCCFlavour name = mkLateCCFlavour <$> getCCIndex' name - -getCCIndex' :: FastString -> M CostCentreIndex -getCCIndex' name = do - state <- get - let (index,cc_state') = getCCIndex name (lcs_state state) - put (state { lcs_state = cc_state'}) - return index - -addCC :: Env -> CostCentre -> M () -addCC !env cc = do - state <- get - when (collectCCs env) $ do - let ccs' = S.insert cc (lcs_ccs state) - put (state { lcs_ccs = ccs'}) - -data Env = Env - { thisModule :: !Module - , countEntries:: !Bool - , collectCCs :: !Bool - } - + top_level_cc_pred :: CoreExpr -> Bool + top_level_cc_pred = + case lateCCConfig_whichBinds of + LateCCAllBinds -> + const True + LateCCOverloadedBinds -> + isOverloadedTy . exprType + LateCCNone -> + -- This is here for completeness, we won't actually use this + -- predicate in this case since we'll shortcut. + const False + + this_mod = lateCCEnv_module lateCCConfig_env diff --git a/compiler/GHC/Core/LateCC/OverloadedCalls.hs b/compiler/GHC/Core/LateCC/OverloadedCalls.hs new file mode 100644 index 0000000000000000000000000000000000000000..4a804ff0ed40b1fdba2beeb25db663605bc5db0d --- /dev/null +++ b/compiler/GHC/Core/LateCC/OverloadedCalls.hs @@ -0,0 +1,204 @@ +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE TupleSections #-} + +module GHC.Core.LateCC.OverloadedCalls + ( overloadedCallsCC + ) where + +import GHC.Prelude + +import Control.Monad.Trans.Class +import Control.Monad.Trans.Reader +import Control.Monad.Trans.State.Strict +import qualified GHC.Data.Strict as Strict + +import GHC.Data.FastString +import GHC.Core +import GHC.Core.LateCC.Utils +import GHC.Core.LateCC.Types +import GHC.Core.Make +import GHC.Core.Predicate +import GHC.Core.Type +import GHC.Core.Utils +import GHC.Tc.Utils.TcType +import GHC.Types.Id +import GHC.Types.Name +import GHC.Types.SrcLoc +import GHC.Types.Tickish +import GHC.Types.Var +import GHC.Utils.Outputable + +type OverloadedCallsCCState = Strict.Maybe SrcSpan + +-- | Insert cost centres on function applications with dictionary arguments. The +-- source locations attached to the cost centres is approximated based on the +-- "closest" source note encountered in the traversal. +overloadedCallsCC :: CoreBind -> LateCCM OverloadedCallsCCState CoreBind +overloadedCallsCC = + processBind + where + processBind :: CoreBind -> LateCCM OverloadedCallsCCState CoreBind + processBind core_bind = + case core_bind of + NonRec b e -> + NonRec b <$> wrap_if_join b (processExpr e) + Rec es -> + Rec <$> mapM (\(b,e) -> (b,) <$> wrap_if_join b (processExpr e)) es + where + -- If an overloaded function is turned into a join point, we won't add + -- SCCs directly to calls since it makes them non-tail calls. Instead, + -- we look for join points here and add an SCC to their RHS if they are + -- overloaded. + wrap_if_join :: + CoreBndr + -> LateCCM OverloadedCallsCCState CoreExpr + -> LateCCM OverloadedCallsCCState CoreExpr + wrap_if_join b pexpr = do + expr <- pexpr + if isJoinId b && isOverloadedTy (exprType expr) then do + let + cc_name :: FastString + cc_name = fsLit "join-rhs-" `appendFS` getOccFS b + + cc_srcspan <- + fmap (Strict.fromMaybe (UnhelpfulSpan UnhelpfulNoLocationInfo)) $ + lift $ gets lateCCState_extra + + insertCC cc_name cc_srcspan expr + else + return expr + + + processExpr :: CoreExpr -> LateCCM OverloadedCallsCCState CoreExpr + processExpr expr = + case expr of + -- The case we care about: Application + app@App{} -> do + -- Here we have some application like `f v1 ... vN`, where v1 ... vN + -- should be the function's type arguments followed by the value + -- arguments. To determine if the `f` is an overloaded function, we + -- check if any of the arguments v1 ... vN are dictionaries. + let + (f, xs) = collectArgs app + resultTy = applyTypeToArgs empty (exprType f) xs + + -- Recursively process the arguments first for no particular reason + args <- mapM processExpr xs + let app' = mkCoreApps f args + + if + -- Check if any of the arguments are dictionaries + any isDictExpr args + + -- Avoid instrumenting dictionary functions, which may be + -- overloaded if there are superclasses, by checking if the result + -- type of the function is a dictionary type. + && not (isDictTy resultTy) + + -- Avoid instrumenting constraint selectors like eq_sel + && (typeTypeOrConstraint resultTy /= ConstraintLike) + + -- Avoid instrumenting join points. + -- (See comment in processBind above) + && not (isJoinVarExpr f) + then do + -- Extract a name and source location from the function being + -- applied + let + cc_name :: FastString + cc_name = + fsLit $ maybe "<no name available>" getOccString (exprName app) + + cc_srcspan <- + fmap (Strict.fromMaybe (UnhelpfulSpan UnhelpfulNoLocationInfo)) $ + lift $ gets lateCCState_extra + + insertCC cc_name cc_srcspan app' + else + return app' + + -- For recursive constructors of Expr, we traverse the nested Exprs + Lam b e -> + mkCoreLams [b] <$> processExpr e + Let b e -> + mkCoreLet <$> processBind b <*> processExpr e + Case e b t alts -> + Case + <$> processExpr e + <*> pure b + <*> pure t + <*> mapM processAlt alts + Cast e co -> + mkCast <$> processExpr e <*> pure co + Tick t e -> do + trackSourceNote t $ + mkTick t <$> processExpr e + + -- For non-recursive constructors of Expr, we do nothing + x -> return x + + processAlt :: CoreAlt -> LateCCM OverloadedCallsCCState CoreAlt + processAlt (Alt c bs e) = Alt c bs <$> processExpr e + + trackSourceNote :: CoreTickish -> LateCCM OverloadedCallsCCState a -> LateCCM OverloadedCallsCCState a + trackSourceNote tick act = + case tick of + SourceNote rss _ -> do + -- Prefer source notes from the current file + in_current_file <- + maybe False ((== EQ) . lexicalCompareFS (srcSpanFile rss)) <$> + asks lateCCEnv_file + if not in_current_file then + act + else do + loc <- lift $ gets lateCCState_extra + lift . modify $ \s -> + s { lateCCState_extra = + Strict.Just $ RealSrcSpan rss mempty + } + x <- act + lift . modify $ \s -> + s { lateCCState_extra = loc + } + return x + _ -> + act + + -- Utility functions + + -- Extract a Name from an expression. If it is an application, attempt to + -- extract a name from the applied function. If it is a variable, return the + -- Name of the variable. If it is a tick/cast, attempt to extract a Name + -- from the expression held in the tick/cast. Otherwise return Nothing. + exprName :: CoreExpr -> Maybe Name + exprName = + \case + App f _ -> + exprName f + Var f -> + Just (idName f) + Tick _ e -> + exprName e + Cast e _ -> + exprName e + _ -> + Nothing + + -- Determine whether an expression is a dictionary + isDictExpr :: CoreExpr -> Bool + isDictExpr = + maybe False isDictTy . exprType' + where + exprType' :: CoreExpr -> Maybe Type + exprType' = \case + Type{} -> Nothing + expr -> Just $ exprType expr + + -- Determine whether an expression is a join variable + isJoinVarExpr :: CoreExpr -> Bool + isJoinVarExpr = + \case + Var var -> isJoinId var + Tick _ e -> isJoinVarExpr e + Cast e _ -> isJoinVarExpr e + _ -> False diff --git a/compiler/GHC/Core/LateCC/TopLevelBinds.hs b/compiler/GHC/Core/LateCC/TopLevelBinds.hs new file mode 100644 index 0000000000000000000000000000000000000000..299cf4040e8e708af059ebd7ed448487180507f6 --- /dev/null +++ b/compiler/GHC/Core/LateCC/TopLevelBinds.hs @@ -0,0 +1,106 @@ +{-# LANGUAGE TupleSections #-} +module GHC.Core.LateCC.TopLevelBinds where + +import GHC.Prelude + +import GHC.Core +-- import GHC.Core.LateCC +import GHC.Core.LateCC.Types +import GHC.Core.LateCC.Utils +import GHC.Core.Opt.Monad +import GHC.Driver.DynFlags +import GHC.Types.Id +import GHC.Types.Name +import GHC.Unit.Module.ModGuts + +{- Note [Collecting late cost centres] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Usually cost centres defined by a module are collected +during tidy by collectCostCentres. However with `-fprof-late` +we insert cost centres after inlining. So we keep a list of +all the cost centres we inserted and combine that with the list +of cost centres found during tidy. + +To avoid overhead when using -fprof-inline there is a flag to stop +us from collecting them here when we run this pass before tidy. + +Note [Adding late cost centres to top level bindings] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The basic idea is very simple. For every top level binder +`f = rhs` we compile it as if the user had written +`f = {-# SCC f #-} rhs`. + +If we do this after unfoldings for `f` have been created this +doesn't impact core-level optimizations at all. If we do it +before the cost centre will be included in the unfolding and +might inhibit optimizations at the call site. For this reason +we provide flags for both approaches as they have different +tradeoffs. + +We also don't add a cost centre for any binder that is a constructor +worker or wrapper. These will never meaningfully enrich the resulting +profile so we improve efficiency by omitting those. + +-} + +-- | Add late cost centres directly to the 'ModGuts'. This is used inside the +-- core pipeline with the -fprof-late-inline flag. It should not be used after +-- tidy, since it does not manually track inserted cost centers. See +-- Note [Collecting late cost centres]. +topLevelBindsCCMG :: ModGuts -> CoreM ModGuts +topLevelBindsCCMG guts = do + dflags <- getDynFlags + let + env = + LateCCEnv + { lateCCEnv_module = mg_module guts + + -- We don't use this for topLevelBindsCC, so Nothing is okay + , lateCCEnv_file = Nothing + + , lateCCEnv_countEntries= gopt Opt_ProfCountEntries dflags + , lateCCEnv_collectCCs = False + } + guts' = + guts + { mg_binds = + fst + ( doLateCostCenters + env + (initLateCCState ()) + (topLevelBindsCC (const True)) + (mg_binds guts) + ) + } + return guts' + +-- | Insert cost centres on top-level bindings in the module, depending on +-- whether or not they satisfy the given predicate. +topLevelBindsCC :: (CoreExpr -> Bool) -> CoreBind -> LateCCM s CoreBind +topLevelBindsCC pred core_bind = + case core_bind of + NonRec b rhs -> + NonRec b <$> doBndr b rhs + Rec bs -> + Rec <$> mapM doPair bs + where + doPair :: ((Id, CoreExpr) -> LateCCM s (Id, CoreExpr)) + doPair (b,rhs) = (b,) <$> doBndr b rhs + + doBndr :: Id -> CoreExpr -> LateCCM s CoreExpr + doBndr bndr rhs + -- Cost centres on constructor workers are pretty much useless + -- so we don't emit them if we are looking at the rhs of a constructor + -- binding. + | Just _ <- isDataConId_maybe bndr = pure rhs + | otherwise = if pred rhs then addCC bndr rhs else pure rhs + + -- We want to put the cost centre below the lambda as we only care about + -- executions of the RHS. + addCC :: Id -> CoreExpr -> LateCCM s CoreExpr + addCC bndr (Lam b rhs) = Lam b <$> addCC bndr rhs + addCC bndr rhs = do + let name = idName bndr + cc_loc = nameSrcSpan name + cc_name = getOccFS name + insertCC cc_name cc_loc rhs \ No newline at end of file diff --git a/compiler/GHC/Core/LateCC/Types.hs b/compiler/GHC/Core/LateCC/Types.hs new file mode 100644 index 0000000000000000000000000000000000000000..ca9ccc7b29c318add99cf601b4d48b5ffd83ca13 --- /dev/null +++ b/compiler/GHC/Core/LateCC/Types.hs @@ -0,0 +1,74 @@ +-- | Types related to late cost center insertion +module GHC.Core.LateCC.Types + ( LateCCConfig(..) + , LateCCBindSpec(..) + , LateCCEnv(..) + , LateCCState(..) + , initLateCCState + , LateCCM + ) where + +import GHC.Prelude + +import Control.Monad.Trans.Reader +import Control.Monad.Trans.State.Strict +import qualified Data.Set as S + +import GHC.Data.FastString +import GHC.Types.CostCentre +import GHC.Types.CostCentre.State +import GHC.Unit.Types + +-- | Late cost center insertion configuration. +-- +-- Specifies whether cost centers are added to overloaded function call sites +-- and/or top-level bindings, and which top-level bindings they are added to. +-- Also holds the cost center insertion environment. +data LateCCConfig = + LateCCConfig + { lateCCConfig_whichBinds :: !LateCCBindSpec + , lateCCConfig_overloadedCalls :: !Bool + , lateCCConfig_env :: !LateCCEnv + } + +-- | The types of top-level bindings we support adding cost centers to. +data LateCCBindSpec = + LateCCNone + | LateCCAllBinds + | LateCCOverloadedBinds + +-- | Late cost centre insertion environment +data LateCCEnv = LateCCEnv + { lateCCEnv_module :: !Module + -- ^ Current module + , lateCCEnv_file :: Maybe FastString + -- ^ Current file, if we have one + , lateCCEnv_countEntries:: !Bool + -- ^ Whether the inserted cost centers should count entries + , lateCCEnv_collectCCs :: !Bool + -- ^ Whether to collect the cost centres we insert. See + -- Note [Collecting late cost centres] + + } + +-- | Late cost centre insertion state, indexed by some extra state type that an +-- insertion method may require. +data LateCCState s = LateCCState + { lateCCState_ccs :: !(S.Set CostCentre) + -- ^ Cost centres that have been inserted + , lateCCState_ccState :: !CostCentreState + -- ^ Per-module state tracking for cost centre indices + , lateCCState_extra :: !s + } + +-- | The empty late cost centre insertion state +initLateCCState :: s -> LateCCState s +initLateCCState s = + LateCCState + { lateCCState_ccState = newCostCentreState + , lateCCState_ccs = mempty + , lateCCState_extra = s + } + +-- | Late cost centre insertion monad +type LateCCM s = ReaderT LateCCEnv (State (LateCCState s)) diff --git a/compiler/GHC/Core/LateCC/Utils.hs b/compiler/GHC/Core/LateCC/Utils.hs new file mode 100644 index 0000000000000000000000000000000000000000..3ad673ac7c67a3f98c3bdfc1c2691e35b1a620ce --- /dev/null +++ b/compiler/GHC/Core/LateCC/Utils.hs @@ -0,0 +1,80 @@ +module GHC.Core.LateCC.Utils + ( -- * Inserting cost centres + doLateCostCenters -- Might be useful for API users + + -- ** Helpers for defining insertion methods + , getCCFlavour + , insertCC + ) where + +import GHC.Prelude + +import Control.Monad +import Control.Monad.Trans.Class +import Control.Monad.Trans.Reader +import Control.Monad.Trans.State.Strict +import qualified Data.Set as S + +import GHC.Core +import GHC.Core.LateCC.Types +import GHC.Core.Utils +import GHC.Data.FastString +import GHC.Types.CostCentre +import GHC.Types.CostCentre.State +import GHC.Types.SrcLoc +import GHC.Types.Tickish + +-- | Insert cost centres into the 'CoreProgram' using the provided environment, +-- initial state, and insertion method. +doLateCostCenters + :: LateCCEnv + -- ^ Environment to run the insertion in + -> LateCCState s + -- ^ Initial state to run the insertion with + -> (CoreBind -> LateCCM s CoreBind) + -- ^ Insertion method + -> CoreProgram + -- ^ Bindings to consider + -> (CoreProgram, LateCCState s) +doLateCostCenters env state method binds = + runLateCC env state $ mapM method binds + +-- | Evaluate late cost centre insertion +runLateCC :: LateCCEnv -> LateCCState s -> LateCCM s a -> (a, LateCCState s) +runLateCC env state = (`runState` state) . (`runReaderT` env) + +-- | Given the name of a cost centre, get its flavour +getCCFlavour :: FastString -> LateCCM s CCFlavour +getCCFlavour name = mkLateCCFlavour <$> getCCIndex' name + where + getCCIndex' :: FastString -> LateCCM s CostCentreIndex + getCCIndex' name = do + cc_state <- lift $ gets lateCCState_ccState + let (index, cc_state') = getCCIndex name cc_state + lift . modify $ \s -> s { lateCCState_ccState = cc_state'} + return index + +-- | Insert a cost centre with the specified name and source span on the given +-- expression. The inserted cost centre will be appropriately tracked in the +-- late cost centre state. +insertCC + :: FastString + -- ^ Name of the cost centre to insert + -> SrcSpan + -- ^ Source location to associate with the cost centre + -> CoreExpr + -- ^ Expression to wrap in the cost centre + -> LateCCM s CoreExpr +insertCC cc_name cc_loc expr = do + cc_flavour <- getCCFlavour cc_name + env <- ask + let + cc_mod = lateCCEnv_module env + cc = NormalCC cc_flavour cc_name cc_mod cc_loc + note = ProfNote cc (lateCCEnv_countEntries env) True + when (lateCCEnv_collectCCs env) $ do + lift . modify $ \s -> + s { lateCCState_ccs = S.insert cc (lateCCState_ccs s) + } + return $ mkTick note expr + diff --git a/compiler/GHC/Core/Opt/Pipeline.hs b/compiler/GHC/Core/Opt/Pipeline.hs index d86cf24d4bd334da590b2b604d31126a9c380cfa..17b14a75b79620958ba1fe85637af69d820a57cd 100644 --- a/compiler/GHC/Core/Opt/Pipeline.hs +++ b/compiler/GHC/Core/Opt/Pipeline.hs @@ -43,7 +43,7 @@ import GHC.Core.Opt.CallArity ( callArityAnalProgram ) import GHC.Core.Opt.Exitify ( exitifyProgram ) import GHC.Core.Opt.WorkWrap ( wwTopBinds ) import GHC.Core.Opt.CallerCC ( addCallerCostCentres ) -import GHC.Core.LateCC (addLateCostCentresMG) +import GHC.Core.LateCC.TopLevelBinds (topLevelBindsCCMG) import GHC.Core.Seq (seqBinds) import GHC.Core.FamInstEnv @@ -520,7 +520,7 @@ doCorePass pass guts = do addCallerCostCentres guts CoreAddLateCcs -> {-# SCC "AddLateCcs" #-} - addLateCostCentresMG guts + topLevelBindsCCMG guts CoreDoPrintCore -> {-# SCC "PrintCore" #-} liftIO $ printCore logger (mg_binds guts) >> return guts diff --git a/compiler/GHC/Driver/Flags.hs b/compiler/GHC/Driver/Flags.hs index 59647631f45452c905b00887003ced14f5b2b4b5..070a09394473c802021ab8ede108060dc2d36bfa 100644 --- a/compiler/GHC/Driver/Flags.hs +++ b/compiler/GHC/Driver/Flags.hs @@ -341,6 +341,8 @@ data GeneralFlag | Opt_ProfCountEntries | Opt_ProfLateInlineCcs | Opt_ProfLateCcs + | Opt_ProfLateOverloadedCcs + | Opt_ProfLateoverloadedCallsCCs | Opt_ProfManualCcs -- ^ Ignore manual SCC annotations -- misc opts diff --git a/compiler/GHC/Driver/Main.hs b/compiler/GHC/Driver/Main.hs index cc5d192cf127f614362bef6922f78869a6608e59..1d597014270c3e95ec9b425352c4afbc9819c222 100644 --- a/compiler/GHC/Driver/Main.hs +++ b/compiler/GHC/Driver/Main.hs @@ -175,7 +175,6 @@ import GHC.Iface.Ext.Debug ( diffFile, validateScopes ) import GHC.Core import GHC.Core.Lint.Interactive ( interactiveInScope ) import GHC.Core.Tidy ( tidyExpr ) -import GHC.Core.Type ( Type, Kind ) import GHC.Core.Utils ( exprType ) import GHC.Core.ConLike import GHC.Core.Opt.Pipeline @@ -185,7 +184,8 @@ import GHC.Core.InstEnv import GHC.Core.FamInstEnv import GHC.Core.Rules import GHC.Core.Stats -import GHC.Core.LateCC (addLateCostCentresPgm) +import GHC.Core.LateCC +import GHC.Core.LateCC.Types import GHC.CoreToStg.Prep @@ -197,6 +197,7 @@ import GHC.Parser.Lexer as Lexer import GHC.Tc.Module import GHC.Tc.Utils.Monad +import GHC.Tc.Utils.TcType import GHC.Tc.Zonk.Env ( ZonkFlexi (DefaultFlexi) ) import GHC.Stg.Syntax @@ -297,7 +298,6 @@ import GHC.StgToCmm.Utils (IPEStats) import GHC.Types.Unique.FM import GHC.Types.Unique.DFM import GHC.Cmm.Config (CmmConfig) -import GHC.Types.CostCentre.State (newCostCentreState) {- ********************************************************************** @@ -1791,22 +1791,41 @@ hscGenHardCode hsc_env cgguts location output_filename = do ------------------- - -- Insert late cost centres if enabled. - -- If `-fprof-late-inline` is enabled we can skip this, as it will have added - -- a superset of cost centres we would add here already. - - (late_cc_binds, late_local_ccs, cc_state) <- - if gopt Opt_ProfLateCcs dflags && not (gopt Opt_ProfLateInlineCcs dflags) - then - withTiming - logger - (text "LateCCs"<+>brackets (ppr this_mod)) - (const ()) - $ {-# SCC lateCC #-} do - (binds, late_ccs, cc_state) <- addLateCostCentresPgm dflags logger this_mod core_binds - return ( binds, (S.toList late_ccs `mappend` local_ccs ), cc_state) + -- Insert late cost centres based on the provided flags. + -- + -- If -fprof-late-inline is enabled, we will skip adding CCs on any + -- top-level bindings here (via shortcut in `addLateCostCenters`), + -- since it will have already added a superset of the CCs we would add + -- here. + let + late_cc_config :: LateCCConfig + late_cc_config = + LateCCConfig + { lateCCConfig_whichBinds = + if gopt Opt_ProfLateInlineCcs dflags then + LateCCNone + else if gopt Opt_ProfLateCcs dflags then + LateCCAllBinds + else if gopt Opt_ProfLateOverloadedCcs dflags then + LateCCOverloadedBinds else - return (core_binds, local_ccs, newCostCentreState) + LateCCNone + , lateCCConfig_overloadedCalls = + gopt Opt_ProfLateoverloadedCallsCCs dflags + , lateCCConfig_env = + LateCCEnv + { lateCCEnv_module = this_mod + , lateCCEnv_file = fsLit <$> ml_hs_file location + , lateCCEnv_countEntries= gopt Opt_ProfCountEntries dflags + , lateCCEnv_collectCCs = True + } + } + + (late_cc_binds, late_cc_state) <- + addLateCostCenters logger late_cc_config core_binds + + when (dopt Opt_D_dump_late_cc dflags || dopt Opt_D_verbose_core2core dflags) $ + putDumpFileMaybe logger Opt_D_dump_late_cc "LateCC" FormatCore (vcat (map ppr late_cc_binds)) ------------------- -- Run late plugins @@ -1820,7 +1839,7 @@ hscGenHardCode hsc_env cgguts location output_filename = do cg_hpc_info = hpc_info, cg_spt_entries = spt_entries, cg_binds = late_binds, - cg_ccs = late_local_ccs' + cg_ccs = late_local_ccs } , _ ) <- @@ -1833,9 +1852,9 @@ hscGenHardCode hsc_env cgguts location output_filename = do (($ hsc_env) . latePlugin) ( cgguts { cg_binds = late_cc_binds - , cg_ccs = late_local_ccs + , cg_ccs = S.toList (lateCCState_ccs late_cc_state) ++ local_ccs } - , cc_state + , lateCCState_ccState late_cc_state ) let @@ -1876,7 +1895,7 @@ hscGenHardCode hsc_env cgguts location output_filename = do let (stg_binds,_stg_deps) = unzip stg_binds_with_deps let cost_centre_info = - (late_local_ccs' ++ caf_ccs, caf_cc_stacks) + (late_local_ccs ++ caf_ccs, caf_cc_stacks) platform = targetPlatform dflags prof_init | sccProfilingEnabled dflags = profilingInitCode platform this_mod cost_centre_info diff --git a/compiler/GHC/Driver/Session.hs b/compiler/GHC/Driver/Session.hs index 7cc7a562c4b9c38b35d5ba2e9996549c956f8a38..2289184525437152bab1778c5ddc49f30631ec70 100644 --- a/compiler/GHC/Driver/Session.hs +++ b/compiler/GHC/Driver/Session.hs @@ -2444,6 +2444,8 @@ fFlagsDeps = [ flagSpec "prof-cafs" Opt_AutoSccsOnIndividualCafs, flagSpec "prof-count-entries" Opt_ProfCountEntries, flagSpec "prof-late" Opt_ProfLateCcs, + flagSpec "prof-late-overloaded" Opt_ProfLateOverloadedCcs, + flagSpec "prof-late-overloaded-calls" Opt_ProfLateoverloadedCallsCCs, flagSpec "prof-manual" Opt_ProfManualCcs, flagSpec "prof-late-inline" Opt_ProfLateInlineCcs, flagSpec "regs-graph" Opt_RegsGraph, @@ -3763,6 +3765,10 @@ needSourceNotes :: DynFlags -> Bool needSourceNotes dflags = debugLevel dflags > 0 || gopt Opt_InfoTableMap dflags + -- Source ticks are used to approximate the location of + -- overloaded call cost centers + || gopt Opt_ProfLateoverloadedCallsCCs dflags + -- ----------------------------------------------------------------------------- -- Linker/compiler information diff --git a/compiler/GHC/Tc/Utils/TcType.hs b/compiler/GHC/Tc/Utils/TcType.hs index 40da653b1d8e5199eb6de43e5f8ce5b4a1701100..fc042d1a6687787c4c38d6fb9ee0aafb779ae5e7 100644 --- a/compiler/GHC/Tc/Utils/TcType.hs +++ b/compiler/GHC/Tc/Utils/TcType.hs @@ -1907,7 +1907,7 @@ isRhoExpTy (Infer {}) = True isOverloadedTy :: Type -> Bool -- Yes for a type of a function that might require evidence-passing --- Used only by bindLocalMethods +-- Used by bindLocalMethods and for -fprof-late-overloaded isOverloadedTy ty | Just ty' <- coreView ty = isOverloadedTy ty' isOverloadedTy (ForAllTy _ ty) = isOverloadedTy ty isOverloadedTy (FunTy { ft_af = af }) = isInvisibleFunArg af diff --git a/compiler/ghc.cabal.in b/compiler/ghc.cabal.in index f6bbb9c758ef1beb494c687bdb6acfa6d47b6b1c..523f2d7bd197950a4fb90bae4ee715530cfd875a 100644 --- a/compiler/ghc.cabal.in +++ b/compiler/ghc.cabal.in @@ -336,6 +336,10 @@ Library GHC.Core.Lint GHC.Core.Lint.Interactive GHC.Core.LateCC + GHC.Core.LateCC.Types + GHC.Core.LateCC.TopLevelBinds + GHC.Core.LateCC.Utils + GHC.Core.LateCC.OverloadedCalls GHC.Core.Make GHC.Core.Map.Expr GHC.Core.Map.Type diff --git a/docs/users_guide/9.10.1-notes.rst b/docs/users_guide/9.10.1-notes.rst index 18af67c4a37c9f6154ab4c4213c70f125ad2f3e3..a72874e12f3599686b40037bb79c573018cfba5d 100644 --- a/docs/users_guide/9.10.1-notes.rst +++ b/docs/users_guide/9.10.1-notes.rst @@ -186,6 +186,15 @@ Compiler This means that if you are using ``-fllvm`` you now need ``llc``, ``opt`` and ``clang`` available. +- The :ghc-flag:`-fprof-late-overloaded` flag has been introduced. It causes + cost centres to be added to *overloaded* top level bindings, unlike + :ghc-flag:`-fprof-late` which adds cost centres to all top level bindings. + +- The :ghc-flag:`-fprof-late-overloaded-calls` flag has been introduced. It + causes cost centres to be inserted at call sites including instance dictionary + arguments. This may be preferred over :ghc-flag:`-fprof-late-overloaded` since + it may reveal whether imported functions are called overloaded. + JavaScript backend ~~~~~~~~~~~~~~~~~~ diff --git a/docs/users_guide/profiling.rst b/docs/users_guide/profiling.rst index c8c54ca7523ba7d54d0e4157236bf33f8cda9eae..bf10204df665f710bf9866d9e3b053294a7b21d0 100644 --- a/docs/users_guide/profiling.rst +++ b/docs/users_guide/profiling.rst @@ -518,6 +518,49 @@ of your profiled program will be different to that of the unprofiled one. You can try this mode if :ghc-flag:`-fprof-late` results in a profile that's too hard to interpret. +.. ghc-flag:: -fprof-late-overloaded + :shortdesc: Auto-add ``SCC``\\ s to all top level overloaded bindings *after* the core pipeline has run. + :type: dynamic + :reverse: -fno-prof-late-overloaded + :category: + + :since: 9.10.1 + + Adds an automatic ``SCC`` annotation to all *overloaded* top level bindings + late in the compilation pipeline after the optimizer has run and unfoldings + have been created. This means these cost centres will not interfere with + core-level optimizations and the resulting profile will be closer to the + performance profile of an optimized non-profiled executable. + + This flag can help determine which top level bindings encountered during a + program's execution are still overloaded after inlining and specialization. + +.. ghc-flag:: -fprof-late-overloaded-calls + :shortdesc: Auto-add ``SCC``\\ s to all call sites that include dictionary arguments *after* the core pipeline has run. + :type: dynamic + :reverse: -fno-prof-late-overloaded-calls + :category: + + :since: 9.10.1 + + Adds an automatic ``SCC`` annotation to all call sites that include + dictionary arguments late in the compilation pipeline after the optimizer + has run and unfoldings have been created. This means these cost centres will + not interfere with core-level optimizations and the resulting profile will + be closer to the performance profile of an optimized non-profiled + executable. + + This flag is potentially more useful than :ghc-flag:`-fprof-late-overloaded` + since it will also add ``SCC`` annotations to call sites of imported + overloaded functions. + + Some overloaded calls may not be annotated, specifically in cases where the + optimizer turns an overloaded function into a join point. Calls to such + functions will not be wrapped in ``SCC`` annotations, since it would make + them non-tail calls, which is a requirement for join points. Instead, + ``SCC`` annotations are added around the body of overloaded join variables + and given distinct names (``join-rhs-<var>``) to avoid confusion. + .. ghc-flag:: -fprof-cafs :shortdesc: Auto-add ``SCC``\\ s to all CAFs :type: dynamic diff --git a/testsuite/tests/profiling/should_run/all.T b/testsuite/tests/profiling/should_run/all.T index 11e6aa643d2a01f300737a959fd17e472df58a71..b343a9cd4495fa6a76c26032938ec8adc6db1d39 100644 --- a/testsuite/tests/profiling/should_run/all.T +++ b/testsuite/tests/profiling/should_run/all.T @@ -195,3 +195,30 @@ test('ignore_scc', [], compile_and_run, ['-fno-prof-manual']) test('T21446', [], makefile_test, ['T21446']) + + +test('scc-prof-overloaded001', + [], + compile_and_run, + ['-fno-prof-auto -fno-full-laziness -fprof-late-overloaded'] # See Note [consistent stacks] +) + +test('scc-prof-overloaded002', + [], + compile_and_run, + ['-fno-prof-auto -fno-full-laziness -fprof-late-overloaded'] # See Note [consistent stacks] +) + +test('scc-prof-overloaded-calls001', + [], + compile_and_run, + # Need optimizations to get rid of unwanted overloaded calls + ['-O -fno-prof-auto -fno-full-laziness -fprof-late-overloaded-calls'] # See Note [consistent stacks] +) + +test('scc-prof-overloaded-calls002', + [], + compile_and_run, + # Need optimizations to get rid of unwanted overloaded calls + ['-O -fno-prof-auto -fprof-late-overloaded-calls'] +) diff --git a/testsuite/tests/profiling/should_run/scc-prof-overloaded-calls001.hs b/testsuite/tests/profiling/should_run/scc-prof-overloaded-calls001.hs new file mode 100644 index 0000000000000000000000000000000000000000..558856b7c1a709c75998f31d31789db90eb6c6de --- /dev/null +++ b/testsuite/tests/profiling/should_run/scc-prof-overloaded-calls001.hs @@ -0,0 +1,24 @@ +-- Running this program should result in two calls to overloaded functions: One +-- with the $fShowX dictionary, the next with the $fShowList dictionary +-- constructor for X. +-- +-- Note that although the `$fShowList` dictionary constructor is itself +-- overloaded, it should not get an SCC since we avoid instrumenting overloaded +-- calls that result in dictionaries. +-- +-- With just -fprof-late-overloaded, only `invoke` should get an SCC, since it +-- is the only overloaded top level binding. With +-- `-fprof-late-overloaded-calls`, the calls to both `invoke` and `f` (in the +-- body of invoke) should get SCCs. + +module Main where + +{-# NOINLINE invoke #-} +invoke :: Show a => (Show [a] => [a] -> String) -> a -> String +invoke f x = f [x] + +data X = X + deriving Show + +main :: IO () +main = putStrLn (invoke show X) diff --git a/testsuite/tests/profiling/should_run/scc-prof-overloaded-calls001.prof.sample b/testsuite/tests/profiling/should_run/scc-prof-overloaded-calls001.prof.sample new file mode 100644 index 0000000000000000000000000000000000000000..e91d9477070766a68ad74710b9a91dfa10e2b2f9 --- /dev/null +++ b/testsuite/tests/profiling/should_run/scc-prof-overloaded-calls001.prof.sample @@ -0,0 +1,26 @@ + Thu Jan 4 11:49 2024 Time and Allocation Profiling Report (Final) + + scc-prof-overloaded-calls001 +RTS -hc -p -RTS + + total time = 0.00 secs (0 ticks @ 1000 us, 1 processor) + total alloc = 48,320 bytes (excludes profiling overheads) + +COST CENTRE MODULE SRC %time %alloc + +MAIN MAIN <built-in> 0.0 20.5 +CAF GHC.IO.Handle.FD <entire-module> 0.0 71.9 +CAF GHC.IO.Encoding <entire-module> 0.0 5.1 +CAF GHC.Conc.Signal <entire-module> 0.0 1.3 + + + individual inherited +COST CENTRE MODULE SRC no. entries %time %alloc %time %alloc + +MAIN MAIN <built-in> 128 0 0.0 20.5 0.0 100.0 + CAF Main <entire-module> 255 0 0.0 0.0 0.0 0.8 + invoke Main scc-prof-overloaded-calls001.hs:24:1-31 256 1 0.0 0.3 0.0 0.8 + f Main scc-prof-overloaded-calls001.hs:18:1-18 257 1 0.0 0.6 0.0 0.6 + CAF GHC.Conc.Signal <entire-module> 238 0 0.0 1.3 0.0 1.3 + CAF GHC.IO.Encoding <entire-module> 219 0 0.0 5.1 0.0 5.1 + CAF GHC.IO.Encoding.Iconv <entire-module> 217 0 0.0 0.4 0.0 0.4 + CAF GHC.IO.Handle.FD <entire-module> 208 0 0.0 71.9 0.0 71.9 diff --git a/testsuite/tests/profiling/should_run/scc-prof-overloaded-calls001.stdout b/testsuite/tests/profiling/should_run/scc-prof-overloaded-calls001.stdout new file mode 100644 index 0000000000000000000000000000000000000000..b6bf4fe8dcb70a3358fb828f325e29b9773d6e47 --- /dev/null +++ b/testsuite/tests/profiling/should_run/scc-prof-overloaded-calls001.stdout @@ -0,0 +1 @@ +[X] diff --git a/testsuite/tests/profiling/should_run/scc-prof-overloaded-calls002.hs b/testsuite/tests/profiling/should_run/scc-prof-overloaded-calls002.hs new file mode 100644 index 0000000000000000000000000000000000000000..819a68a00c3bf1827725e5358f611063a0b2bc24 --- /dev/null +++ b/testsuite/tests/profiling/should_run/scc-prof-overloaded-calls002.hs @@ -0,0 +1,65 @@ +-- Running this program should result in seven calls to overloaded functions +-- with increasing numbers of dictionary arguments. +-- +-- With just -fprof-late-overloaded, no SCCs should be added, since none of the +-- overloaded functions are top level. With `-fprof-late-overloaded-calls`, all +-- seven calls should get *distinct* SCCs with separate source locations even +-- though the overloaded functions share an OccName (`f`). + +module Main where + +data X = X + +instance Show X where +instance Num X where +instance Eq X where +instance Enum X where +instance Ord X where +instance Real X where +instance Integral X where + +-- No overloaded call +{-# NOINLINE invoke0 #-} +invoke0 :: (forall a. a -> a -> String) -> X -> String +invoke0 f val = f val val + +{-# NOINLINE invoke1 #-} +invoke1 :: (forall a. Show a => a -> a -> String) -> X -> String +invoke1 f val = f val val + +{-# NOINLINE invoke2 #-} +invoke2 :: (forall a. (Show a, Num a) => a -> a -> String) -> X -> String +invoke2 f val = f val val + +{-# NOINLINE invoke3 #-} +invoke3 :: (forall a. (Show a, Num a, Eq a) => a -> a -> String) -> X -> String +invoke3 f val = f val val + +{-# NOINLINE invoke4 #-} +invoke4 :: (forall a. (Show a, Num a, Eq a, Enum a) => a -> a -> String) -> X -> String +invoke4 f val = f val val + +{-# NOINLINE invoke5 #-} +invoke5 :: (forall a. (Show a, Num a, Eq a, Enum a, Ord a) => a -> a -> String) -> X -> String +invoke5 f val = f val val + +{-# NOINLINE invoke6 #-} +invoke6 :: (forall a. (Show a, Num a, Eq a, Enum a, Ord a, Real a) => a -> a -> String) -> X -> String +invoke6 f val = f val val + +{-# NOINLINE invoke7 #-} +invoke7 :: (forall a. (Show a, Num a, Eq a, Enum a, Ord a, Real a, Integral a) => a -> a -> String) -> X -> String +invoke7 f val = f val val + +main :: IO () +main = do + putStrLn $ invoke0 (\_ _ -> s) X + putStrLn $ invoke1 (\_ _ -> s) X + putStrLn $ invoke2 (\_ _ -> s) X + putStrLn $ invoke3 (\_ _ -> s) X + putStrLn $ invoke4 (\_ _ -> s) X + putStrLn $ invoke5 (\_ _ -> s) X + putStrLn $ invoke6 (\_ _ -> s) X + putStrLn $ invoke7 (\_ _ -> s) X + where + s = "wibbly" diff --git a/testsuite/tests/profiling/should_run/scc-prof-overloaded-calls002.prof.sample b/testsuite/tests/profiling/should_run/scc-prof-overloaded-calls002.prof.sample new file mode 100644 index 0000000000000000000000000000000000000000..e9b412240920991b7c8c01a77ed117b34788f363 --- /dev/null +++ b/testsuite/tests/profiling/should_run/scc-prof-overloaded-calls002.prof.sample @@ -0,0 +1,31 @@ + Fri Jan 5 11:06 2024 Time and Allocation Profiling Report (Final) + + scc-prof-overloaded-calls002 +RTS -p -RTS + + total time = 0.00 secs (0 ticks @ 1000 us, 1 processor) + total alloc = 59,152 bytes (excludes profiling overheads) + +COST CENTRE MODULE SRC %time %alloc + +MAIN MAIN <built-in> 0.0 34.8 +CAF GHC.IO.Handle.FD <entire-module> 0.0 58.7 +CAF GHC.IO.Encoding <entire-module> 0.0 4.1 +CAF GHC.Conc.Signal <entire-module> 0.0 1.1 + + + individual inherited +COST CENTRE MODULE SRC no. entries %time %alloc %time %alloc + +MAIN MAIN <built-in> 128 0 0.0 34.8 0.0 100.0 + CAF Main <entire-module> 255 0 0.0 0.6 0.0 0.9 + f Main scc-prof-overloaded-calls002.hs:52:1-25 262 1 0.0 0.1 0.0 0.1 + f Main scc-prof-overloaded-calls002.hs:48:1-25 261 1 0.0 0.1 0.0 0.1 + f Main scc-prof-overloaded-calls002.hs:44:1-25 260 1 0.0 0.1 0.0 0.1 + f Main scc-prof-overloaded-calls002.hs:40:1-25 259 1 0.0 0.0 0.0 0.0 + f Main scc-prof-overloaded-calls002.hs:36:1-25 258 1 0.0 0.0 0.0 0.0 + f Main scc-prof-overloaded-calls002.hs:32:1-25 257 1 0.0 0.0 0.0 0.0 + f Main scc-prof-overloaded-calls002.hs:28:1-25 256 1 0.0 0.0 0.0 0.0 + CAF GHC.Conc.Signal <entire-module> 238 0 0.0 1.1 0.0 1.1 + CAF GHC.IO.Encoding <entire-module> 219 0 0.0 4.1 0.0 4.1 + CAF GHC.IO.Encoding.Iconv <entire-module> 217 0 0.0 0.3 0.0 0.3 + CAF GHC.IO.Handle.FD <entire-module> 208 0 0.0 58.7 0.0 58.7 diff --git a/testsuite/tests/profiling/should_run/scc-prof-overloaded-calls002.stdout b/testsuite/tests/profiling/should_run/scc-prof-overloaded-calls002.stdout new file mode 100644 index 0000000000000000000000000000000000000000..7d7651234f393886a920ada34ff317131fa1259e --- /dev/null +++ b/testsuite/tests/profiling/should_run/scc-prof-overloaded-calls002.stdout @@ -0,0 +1,8 @@ +wibbly +wibbly +wibbly +wibbly +wibbly +wibbly +wibbly +wibbly diff --git a/testsuite/tests/profiling/should_run/scc-prof-overloaded001.hs b/testsuite/tests/profiling/should_run/scc-prof-overloaded001.hs new file mode 100644 index 0000000000000000000000000000000000000000..558856b7c1a709c75998f31d31789db90eb6c6de --- /dev/null +++ b/testsuite/tests/profiling/should_run/scc-prof-overloaded001.hs @@ -0,0 +1,24 @@ +-- Running this program should result in two calls to overloaded functions: One +-- with the $fShowX dictionary, the next with the $fShowList dictionary +-- constructor for X. +-- +-- Note that although the `$fShowList` dictionary constructor is itself +-- overloaded, it should not get an SCC since we avoid instrumenting overloaded +-- calls that result in dictionaries. +-- +-- With just -fprof-late-overloaded, only `invoke` should get an SCC, since it +-- is the only overloaded top level binding. With +-- `-fprof-late-overloaded-calls`, the calls to both `invoke` and `f` (in the +-- body of invoke) should get SCCs. + +module Main where + +{-# NOINLINE invoke #-} +invoke :: Show a => (Show [a] => [a] -> String) -> a -> String +invoke f x = f [x] + +data X = X + deriving Show + +main :: IO () +main = putStrLn (invoke show X) diff --git a/testsuite/tests/profiling/should_run/scc-prof-overloaded001.prof.sample b/testsuite/tests/profiling/should_run/scc-prof-overloaded001.prof.sample new file mode 100644 index 0000000000000000000000000000000000000000..f8804f0ad9f97ba8a0c7382a35e1254582ade982 --- /dev/null +++ b/testsuite/tests/profiling/should_run/scc-prof-overloaded001.prof.sample @@ -0,0 +1,25 @@ + Thu Jan 4 11:26 2024 Time and Allocation Profiling Report (Final) + + scc-prof-overloaded001 +RTS -hc -p -RTS + + total time = 0.00 secs (0 ticks @ 1000 us, 1 processor) + total alloc = 48,304 bytes (excludes profiling overheads) + +COST CENTRE MODULE SRC %time %alloc + +MAIN MAIN <built-in> 0.0 20.5 +CAF GHC.IO.Handle.FD <entire-module> 0.0 71.9 +CAF GHC.IO.Encoding <entire-module> 0.0 5.1 +CAF GHC.Conc.Signal <entire-module> 0.0 1.3 + + + individual inherited +COST CENTRE MODULE SRC no. entries %time %alloc %time %alloc + +MAIN MAIN <built-in> 128 0 0.0 20.5 0.0 100.0 + CAF Main <entire-module> 255 0 0.0 0.0 0.0 0.8 + invoke Main scc-prof-overloaded001.hs:18:1-6 256 1 0.0 0.8 0.0 0.8 + CAF GHC.Conc.Signal <entire-module> 238 0 0.0 1.3 0.0 1.3 + CAF GHC.IO.Encoding <entire-module> 219 0 0.0 5.1 0.0 5.1 + CAF GHC.IO.Encoding.Iconv <entire-module> 217 0 0.0 0.4 0.0 0.4 + CAF GHC.IO.Handle.FD <entire-module> 208 0 0.0 71.9 0.0 71.9 diff --git a/testsuite/tests/profiling/should_run/scc-prof-overloaded001.stdout b/testsuite/tests/profiling/should_run/scc-prof-overloaded001.stdout new file mode 100644 index 0000000000000000000000000000000000000000..b6bf4fe8dcb70a3358fb828f325e29b9773d6e47 --- /dev/null +++ b/testsuite/tests/profiling/should_run/scc-prof-overloaded001.stdout @@ -0,0 +1 @@ +[X] diff --git a/testsuite/tests/profiling/should_run/scc-prof-overloaded002.hs b/testsuite/tests/profiling/should_run/scc-prof-overloaded002.hs new file mode 100644 index 0000000000000000000000000000000000000000..819a68a00c3bf1827725e5358f611063a0b2bc24 --- /dev/null +++ b/testsuite/tests/profiling/should_run/scc-prof-overloaded002.hs @@ -0,0 +1,65 @@ +-- Running this program should result in seven calls to overloaded functions +-- with increasing numbers of dictionary arguments. +-- +-- With just -fprof-late-overloaded, no SCCs should be added, since none of the +-- overloaded functions are top level. With `-fprof-late-overloaded-calls`, all +-- seven calls should get *distinct* SCCs with separate source locations even +-- though the overloaded functions share an OccName (`f`). + +module Main where + +data X = X + +instance Show X where +instance Num X where +instance Eq X where +instance Enum X where +instance Ord X where +instance Real X where +instance Integral X where + +-- No overloaded call +{-# NOINLINE invoke0 #-} +invoke0 :: (forall a. a -> a -> String) -> X -> String +invoke0 f val = f val val + +{-# NOINLINE invoke1 #-} +invoke1 :: (forall a. Show a => a -> a -> String) -> X -> String +invoke1 f val = f val val + +{-# NOINLINE invoke2 #-} +invoke2 :: (forall a. (Show a, Num a) => a -> a -> String) -> X -> String +invoke2 f val = f val val + +{-# NOINLINE invoke3 #-} +invoke3 :: (forall a. (Show a, Num a, Eq a) => a -> a -> String) -> X -> String +invoke3 f val = f val val + +{-# NOINLINE invoke4 #-} +invoke4 :: (forall a. (Show a, Num a, Eq a, Enum a) => a -> a -> String) -> X -> String +invoke4 f val = f val val + +{-# NOINLINE invoke5 #-} +invoke5 :: (forall a. (Show a, Num a, Eq a, Enum a, Ord a) => a -> a -> String) -> X -> String +invoke5 f val = f val val + +{-# NOINLINE invoke6 #-} +invoke6 :: (forall a. (Show a, Num a, Eq a, Enum a, Ord a, Real a) => a -> a -> String) -> X -> String +invoke6 f val = f val val + +{-# NOINLINE invoke7 #-} +invoke7 :: (forall a. (Show a, Num a, Eq a, Enum a, Ord a, Real a, Integral a) => a -> a -> String) -> X -> String +invoke7 f val = f val val + +main :: IO () +main = do + putStrLn $ invoke0 (\_ _ -> s) X + putStrLn $ invoke1 (\_ _ -> s) X + putStrLn $ invoke2 (\_ _ -> s) X + putStrLn $ invoke3 (\_ _ -> s) X + putStrLn $ invoke4 (\_ _ -> s) X + putStrLn $ invoke5 (\_ _ -> s) X + putStrLn $ invoke6 (\_ _ -> s) X + putStrLn $ invoke7 (\_ _ -> s) X + where + s = "wibbly" diff --git a/testsuite/tests/profiling/should_run/scc-prof-overloaded002.prof.sample b/testsuite/tests/profiling/should_run/scc-prof-overloaded002.prof.sample new file mode 100644 index 0000000000000000000000000000000000000000..e28b4b04f62a8dccc2c828551cabc398e911582b --- /dev/null +++ b/testsuite/tests/profiling/should_run/scc-prof-overloaded002.prof.sample @@ -0,0 +1,23 @@ + Thu Jan 4 11:55 2024 Time and Allocation Profiling Report (Final) + + scc-prof-overloaded002 +RTS -hc -p -RTS + + total time = 0.00 secs (0 ticks @ 1000 us, 1 processor) + total alloc = 56,472 bytes (excludes profiling overheads) + +COST CENTRE MODULE SRC %time %alloc + +MAIN MAIN <built-in> 0.0 32.7 +CAF GHC.IO.Handle.FD <entire-module> 0.0 61.5 +CAF GHC.IO.Encoding <entire-module> 0.0 4.3 +CAF GHC.Conc.Signal <entire-module> 0.0 1.1 + + + individual inherited +COST CENTRE MODULE SRC no. entries %time %alloc %time %alloc + +MAIN MAIN <built-in> 128 0 0.0 32.7 0.0 100.0 + CAF GHC.Conc.Signal <entire-module> 238 0 0.0 1.1 0.0 1.1 + CAF GHC.IO.Encoding <entire-module> 219 0 0.0 4.3 0.0 4.3 + CAF GHC.IO.Encoding.Iconv <entire-module> 217 0 0.0 0.4 0.0 0.4 + CAF GHC.IO.Handle.FD <entire-module> 208 0 0.0 61.5 0.0 61.5 diff --git a/testsuite/tests/profiling/should_run/scc-prof-overloaded002.stdout b/testsuite/tests/profiling/should_run/scc-prof-overloaded002.stdout new file mode 100644 index 0000000000000000000000000000000000000000..7d7651234f393886a920ada34ff317131fa1259e --- /dev/null +++ b/testsuite/tests/profiling/should_run/scc-prof-overloaded002.stdout @@ -0,0 +1,8 @@ +wibbly +wibbly +wibbly +wibbly +wibbly +wibbly +wibbly +wibbly