From bc4d67b72b600a1ccdedaa67066f1ac85260b98d Mon Sep 17 00:00:00 2001 From: Matthew Craven <5086-clyring@users.noreply.gitlab.haskell.org> Date: Sat, 16 Dec 2023 09:24:20 -0500 Subject: [PATCH] StgToCmm: Detect some no-op case-continuations ...and generate no code for them. Fixes #24264. --- compiler/GHC/StgToCmm/Expr.hs | 52 +++++++ .../tests/codeGen/should_compile/T24264.hs | 42 ++++++ .../codeGen/should_compile/T24264.stderr | 142 ++++++++++++++++++ testsuite/tests/codeGen/should_compile/all.T | 2 + .../tests/codeGen/should_run/T24264run.hs | 32 ++++ testsuite/tests/codeGen/should_run/all.T | 1 + 6 files changed, 271 insertions(+) create mode 100644 testsuite/tests/codeGen/should_compile/T24264.hs create mode 100644 testsuite/tests/codeGen/should_compile/T24264.stderr create mode 100644 testsuite/tests/codeGen/should_run/T24264run.hs diff --git a/compiler/GHC/StgToCmm/Expr.hs b/compiler/GHC/StgToCmm/Expr.hs index 3e39d3d6e87f..ca5b4b43fdcf 100644 --- a/compiler/GHC/StgToCmm/Expr.hs +++ b/compiler/GHC/StgToCmm/Expr.hs @@ -570,6 +570,58 @@ cgCase (StgOpApp (StgPrimOp SeqOp) [StgVarArg a, _] _) bndr alt_type alts -- Use the same return convention as vanilla 'a'. cgCase (StgApp a []) bndr alt_type alts +{- +Note [Eliminate trivial Solo# continuations] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +If we have code like this: + + case scrut of bndr { + alt -> Solo# bndr + } + +The RHS of the only branch does nothing except wrap the case-binder +returned by 'scrut' in a unary unboxed tuple. But unboxed tuples +don't exist at run-time, i.e. the branch is a no-op! So we can +generate code as if we just had 'scrut' instead of a case-expression. + +This situation can easily arise for IO or ST code, where the last +operation a function performs is commonly 'pure $! someExpr'. +See also #24264 and !11778. More concretely, as of December 2023, +when building a stage2 "perf+no_profiled_libs" ghc: + + * The special case is reached 398 times. + * Of these, 158 have scrutinees that call a function or enter a + potential thunk, and would need to push a useless stack frame if + not for this optimisation. + +We might consider rewriting such case expressions in GHC.Stg.CSE as a +slight extension of Note [All alternatives are the binder]. But the +RuntimeReps of 'bndr' and 'Solo# bndr' are not exactly the same, and +per Note [Typing the STG language] in GHC.Stg.Lint, we do expect Stg +code to remain RuntimeRep-correct. So we just detect the situation in +StgToCmm instead. + +Crucially, the return conventions for 'ty' and '(# ty #)' are compatible: +The returned value is passed in the same register(s) or stack slot in +both conventions, and the set of allowed return values for 'ty' +is a subset of the allowed return values for '(# ty #)': + + * For a lifted type 'ty', the return convention for 'ty' promises to + return an evaluated-properly-tagged heap pointer, while a return + type '(# ty #)' only promises to return a heap pointer to an object + that can be evaluated later if need be. + + * If 'ty' is unlifted, the allowed return + values for 'ty' and '(# ty #)' are identical. +-} + +cgCase scrut bndr _alt_type [GenStgAlt { alt_rhs = rhs}] + -- see Note [Eliminate trivial Solo# continuations] + | StgConApp dc _ [StgVarArg v] _ <- rhs + , isUnboxedTupleDataCon dc + , v == bndr + = cgExpr scrut + cgCase scrut bndr alt_type alts = -- the general case do { platform <- getPlatform diff --git a/testsuite/tests/codeGen/should_compile/T24264.hs b/testsuite/tests/codeGen/should_compile/T24264.hs new file mode 100644 index 000000000000..a52ca4dc747b --- /dev/null +++ b/testsuite/tests/codeGen/should_compile/T24264.hs @@ -0,0 +1,42 @@ +{-# LANGUAGE MagicHash #-} + +module T24264 where + +import Control.Exception (evaluate) +import GHC.Exts (seq#, noinline) +import GHC.IO (IO(..)) + +fun1 :: a -> IO a +{-# OPAQUE fun1 #-} +fun1 x = do + pure () + pure $! x + -- This should not push a continuation to the stack before entering 'x' + +fun2 :: a -> IO a +{-# OPAQUE fun2 #-} +fun2 x = do + pure () + evaluate x + -- This should not push a continuation to the stack before entering 'x' + +fun3 :: a -> IO a +{-# OPAQUE fun3 #-} +fun3 x = do + pure () + -- "evaluate $! x" + case x of !x' -> IO (noinline seq# x') + -- noinline to work around the bogus seqRule + -- This ideally also should not push a continuation to the stack + -- before entering 'x'. It currently does, but let's wait for + -- !11515 to land before worrying about that. + +funPair :: a -> IO (a, a) +{-# OPAQUE funPair #-} +funPair x = do + pure () + x' <- pure $! x + -- This should push a continuation to the stack before entering 'x', + -- so the pair can be returned instead. (It's here to make sure + -- that the 'returns to' detection continues working correctly.) + pure (x', x') diff --git a/testsuite/tests/codeGen/should_compile/T24264.stderr b/testsuite/tests/codeGen/should_compile/T24264.stderr new file mode 100644 index 000000000000..c2b5b10f6c23 --- /dev/null +++ b/testsuite/tests/codeGen/should_compile/T24264.stderr @@ -0,0 +1,142 @@ + +==================== Cmm produced by codegen ==================== +[T24264.fun1_entry() { // [R2] + { info_tbls: [(cQO, + label: T24264.fun1_info + rep: HeapRep static { Fun {arity: 2 fun_type: ArgSpec 5} } + srt: Nothing)] + stack_info: arg_space: 8 + } + {offset + cQO: // global + _sQw::P64 = R2; // CmmAssign + goto cQM; // CmmBranch + cQM: // global + if ((old + 0) - <highSp> < SpLim) (likely: False) goto cQP; else goto cQQ; // CmmCondBranch + cQP: // global + R2 = _sQw::P64; // CmmAssign + R1 = T24264.fun1_closure; // CmmAssign + call (stg_gc_fun)(R2, R1) args: 8, res: 0, upd: 8; // CmmCall + cQQ: // global + goto cQL; // CmmBranch + cQL: // global + // slowCall + R1 = _sQw::P64; // CmmAssign + call stg_ap_0_fast(R1) args: 8, res: 0, upd: 8; // CmmCall + } + }] + + + +==================== Cmm produced by codegen ==================== +[T24264.fun2_entry() { // [R2] + { info_tbls: [(cQY, + label: T24264.fun2_info + rep: HeapRep static { Fun {arity: 2 fun_type: ArgSpec 5} } + srt: Nothing)] + stack_info: arg_space: 8 + } + {offset + cQY: // global + _sQz::P64 = R2; // CmmAssign + goto cQW; // CmmBranch + cQW: // global + if ((old + 0) - <highSp> < SpLim) (likely: False) goto cQZ; else goto cR0; // CmmCondBranch + cQZ: // global + R2 = _sQz::P64; // CmmAssign + R1 = T24264.fun2_closure; // CmmAssign + call (stg_gc_fun)(R2, R1) args: 8, res: 0, upd: 8; // CmmCall + cR0: // global + goto cQV; // CmmBranch + cQV: // global + // slowCall + R1 = _sQz::P64; // CmmAssign + call stg_ap_0_fast(R1) args: 8, res: 0, upd: 8; // CmmCall + } + }] + + + +==================== Cmm produced by codegen ==================== +[T24264.fun3_entry() { // [R2] + { info_tbls: [(cRb, + label: T24264.fun3_info + rep: HeapRep static { Fun {arity: 2 fun_type: ArgSpec 5} } + srt: Nothing)] + stack_info: arg_space: 8 + } + {offset + cRb: // global + _sQB::P64 = R2; // CmmAssign + goto cR6; // CmmBranch + cR6: // global + if ((old + 0) - <highSp> < SpLim) (likely: False) goto cRc; else goto cRd; // CmmCondBranch + cRc: // global + R2 = _sQB::P64; // CmmAssign + R1 = T24264.fun3_closure; // CmmAssign + call (stg_gc_fun)(R2, R1) args: 8, res: 0, upd: 8; // CmmCall + cRd: // global + goto cR5; // CmmBranch + cR5: // global + // slowCall + I64[(young<cR8> + 8)] = cR8; // CmmStore + R1 = _sQB::P64; // CmmAssign + call stg_ap_0_fast(R1) returns to cR8, args: 8, res: 8, upd: 8; // CmmCall + cR8: // global + _sQD::P64 = R1; // CmmAssign + // slow_call for _sQB::P64 with pat stg_ap_0 + R1 = _sQD::P64; // CmmAssign + call (P64[(old + 8)])(R1) args: 8, res: 0, upd: 8; // CmmCall + } + }] + + + +==================== Cmm produced by codegen ==================== +[T24264.funPair_entry() { // [R2] + { info_tbls: [(cRq, + label: T24264.funPair_info + rep: HeapRep static { Fun {arity: 2 fun_type: ArgSpec 5} } + srt: Nothing)] + stack_info: arg_space: 8 + } + {offset + cRq: // global + _sQE::P64 = R2; // CmmAssign + goto cRl; // CmmBranch + cRl: // global + if ((old + 0) - <highSp> < SpLim) (likely: False) goto cRr; else goto cRs; // CmmCondBranch + cRr: // global + R2 = _sQE::P64; // CmmAssign + R1 = T24264.funPair_closure; // CmmAssign + call (stg_gc_fun)(R2, R1) args: 8, res: 0, upd: 8; // CmmCall + cRs: // global + goto cRk; // CmmBranch + cRk: // global + // slowCall + I64[(young<cRn> + 8)] = cRn; // CmmStore + R1 = _sQE::P64; // CmmAssign + call stg_ap_0_fast(R1) returns to cRn, args: 8, res: 8, upd: 8; // CmmCall + cRn: // global + _sQG::P64 = R1; // CmmAssign + // slow_call for _sQE::P64 with pat stg_ap_0 + Hp = Hp + 24; // CmmAssign + if (Hp > HpLim) (likely: False) goto cRv; else goto cRu; // CmmCondBranch + cRv: // global + HpAlloc = 24; // CmmAssign + goto cRt; // CmmBranch + cRt: // global + R1 = _sQG::P64; // CmmAssign + call stg_gc_unpt_r1(R1) returns to cRn, args: 8, res: 8, upd: 8; // CmmCall + cRu: // global + // allocHeapClosure + I64[Hp - 16] = (,)_con_info; // CmmStore + P64[Hp - 8] = _sQG::P64; // CmmStore + P64[Hp] = _sQG::P64; // CmmStore + _cRp::P64 = Hp - 15; // CmmAssign + R1 = _cRp::P64; // CmmAssign + call (P64[(old + 8)])(R1) args: 8, res: 0, upd: 8; // CmmCall + } + }] + + diff --git a/testsuite/tests/codeGen/should_compile/all.T b/testsuite/tests/codeGen/should_compile/all.T index f2eefa113507..64c20d26047c 100644 --- a/testsuite/tests/codeGen/should_compile/all.T +++ b/testsuite/tests/codeGen/should_compile/all.T @@ -128,3 +128,5 @@ test('T21710a', [ unless(tables_next_to_code(), skip) , when(wordsize(32), skip) , grep_errmsg('(call)',[1]) ] , compile, ['-ddump-cmm -dno-typeable-binds']) test('T23002', normal, compile, ['-fregs-graph']) +test('T24264', [req_cmm, grep_errmsg(r'(.*\().*(\) returns to)', [1,2])], + compile, ['-O -ddump-cmm-from-stg -dno-typeable-binds']) diff --git a/testsuite/tests/codeGen/should_run/T24264run.hs b/testsuite/tests/codeGen/should_run/T24264run.hs new file mode 100644 index 000000000000..8a8e05c00a95 --- /dev/null +++ b/testsuite/tests/codeGen/should_run/T24264run.hs @@ -0,0 +1,32 @@ +module Main where + +import Control.Exception (evaluate) +import GHC.Exts (lazy, noinline) + +data StrictPair a b = !a :*: !b + +tailEval1 :: a -> IO a +{-# OPAQUE tailEval1 #-} +tailEval1 = lazy $ \x -> do + pure () + pure $! x + +tailEval2 :: a -> IO a +{-# OPAQUE tailEval2 #-} +tailEval2 x = evaluate x + +go :: [a] -> IO () +go = noinline mapM_ $ \x -> do + y1 <- tailEval1 x + y2 <- tailEval2 x + evaluate (y1 :*: y2) + +main :: IO () +main = do + let ints :: [Int] + ints = take 1000 $ noinline iterate (\x -> x * 35) 1 + go ints + go [LT, EQ, GT] + go $ noinline map (toEnum @Ordering . flip mod 3) ints + go $ noinline map Left ints + go $ noinline map (+) ints diff --git a/testsuite/tests/codeGen/should_run/all.T b/testsuite/tests/codeGen/should_run/all.T index 3a26b8677c80..31a2a2c67e70 100644 --- a/testsuite/tests/codeGen/should_run/all.T +++ b/testsuite/tests/codeGen/should_run/all.T @@ -240,3 +240,4 @@ test('MulMayOflo_full', ignore_stdout], multi_compile_and_run, ['MulMayOflo', [('MulMayOflo_full.cmm', '')], '']) +test('T24264run', normal, compile_and_run, ['']) -- GitLab