Skip to content
Snippets Groups Projects
Commit bc4d67b7 authored by Matthew Craven's avatar Matthew Craven Committed by Marge Bot
Browse files

StgToCmm: Detect some no-op case-continuations

...and generate no code for them. Fixes #24264.
parent 685b467c
No related branches found
No related tags found
No related merge requests found
......@@ -570,6 +570,58 @@ cgCase (StgOpApp (StgPrimOp SeqOp) [StgVarArg a, _] _) bndr alt_type alts
-- Use the same return convention as vanilla 'a'.
cgCase (StgApp a []) bndr alt_type alts
{-
Note [Eliminate trivial Solo# continuations]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If we have code like this:
case scrut of bndr {
alt -> Solo# bndr
}
The RHS of the only branch does nothing except wrap the case-binder
returned by 'scrut' in a unary unboxed tuple. But unboxed tuples
don't exist at run-time, i.e. the branch is a no-op! So we can
generate code as if we just had 'scrut' instead of a case-expression.
This situation can easily arise for IO or ST code, where the last
operation a function performs is commonly 'pure $! someExpr'.
See also #24264 and !11778. More concretely, as of December 2023,
when building a stage2 "perf+no_profiled_libs" ghc:
* The special case is reached 398 times.
* Of these, 158 have scrutinees that call a function or enter a
potential thunk, and would need to push a useless stack frame if
not for this optimisation.
We might consider rewriting such case expressions in GHC.Stg.CSE as a
slight extension of Note [All alternatives are the binder]. But the
RuntimeReps of 'bndr' and 'Solo# bndr' are not exactly the same, and
per Note [Typing the STG language] in GHC.Stg.Lint, we do expect Stg
code to remain RuntimeRep-correct. So we just detect the situation in
StgToCmm instead.
Crucially, the return conventions for 'ty' and '(# ty #)' are compatible:
The returned value is passed in the same register(s) or stack slot in
both conventions, and the set of allowed return values for 'ty'
is a subset of the allowed return values for '(# ty #)':
* For a lifted type 'ty', the return convention for 'ty' promises to
return an evaluated-properly-tagged heap pointer, while a return
type '(# ty #)' only promises to return a heap pointer to an object
that can be evaluated later if need be.
* If 'ty' is unlifted, the allowed return
values for 'ty' and '(# ty #)' are identical.
-}
cgCase scrut bndr _alt_type [GenStgAlt { alt_rhs = rhs}]
-- see Note [Eliminate trivial Solo# continuations]
| StgConApp dc _ [StgVarArg v] _ <- rhs
, isUnboxedTupleDataCon dc
, v == bndr
= cgExpr scrut
cgCase scrut bndr alt_type alts
= -- the general case
do { platform <- getPlatform
......
{-# LANGUAGE MagicHash #-}
module T24264 where
import Control.Exception (evaluate)
import GHC.Exts (seq#, noinline)
import GHC.IO (IO(..))
fun1 :: a -> IO a
{-# OPAQUE fun1 #-}
fun1 x = do
pure ()
pure $! x
-- This should not push a continuation to the stack before entering 'x'
fun2 :: a -> IO a
{-# OPAQUE fun2 #-}
fun2 x = do
pure ()
evaluate x
-- This should not push a continuation to the stack before entering 'x'
fun3 :: a -> IO a
{-# OPAQUE fun3 #-}
fun3 x = do
pure ()
-- "evaluate $! x"
case x of !x' -> IO (noinline seq# x')
-- noinline to work around the bogus seqRule
-- This ideally also should not push a continuation to the stack
-- before entering 'x'. It currently does, but let's wait for
-- !11515 to land before worrying about that.
funPair :: a -> IO (a, a)
{-# OPAQUE funPair #-}
funPair x = do
pure ()
x' <- pure $! x
-- This should push a continuation to the stack before entering 'x',
-- so the pair can be returned instead. (It's here to make sure
-- that the 'returns to' detection continues working correctly.)
pure (x', x')
==================== Cmm produced by codegen ====================
[T24264.fun1_entry() { // [R2]
{ info_tbls: [(cQO,
label: T24264.fun1_info
rep: HeapRep static { Fun {arity: 2 fun_type: ArgSpec 5} }
srt: Nothing)]
stack_info: arg_space: 8
}
{offset
cQO: // global
_sQw::P64 = R2; // CmmAssign
goto cQM; // CmmBranch
cQM: // global
if ((old + 0) - <highSp> < SpLim) (likely: False) goto cQP; else goto cQQ; // CmmCondBranch
cQP: // global
R2 = _sQw::P64; // CmmAssign
R1 = T24264.fun1_closure; // CmmAssign
call (stg_gc_fun)(R2, R1) args: 8, res: 0, upd: 8; // CmmCall
cQQ: // global
goto cQL; // CmmBranch
cQL: // global
// slowCall
R1 = _sQw::P64; // CmmAssign
call stg_ap_0_fast(R1) args: 8, res: 0, upd: 8; // CmmCall
}
}]
==================== Cmm produced by codegen ====================
[T24264.fun2_entry() { // [R2]
{ info_tbls: [(cQY,
label: T24264.fun2_info
rep: HeapRep static { Fun {arity: 2 fun_type: ArgSpec 5} }
srt: Nothing)]
stack_info: arg_space: 8
}
{offset
cQY: // global
_sQz::P64 = R2; // CmmAssign
goto cQW; // CmmBranch
cQW: // global
if ((old + 0) - <highSp> < SpLim) (likely: False) goto cQZ; else goto cR0; // CmmCondBranch
cQZ: // global
R2 = _sQz::P64; // CmmAssign
R1 = T24264.fun2_closure; // CmmAssign
call (stg_gc_fun)(R2, R1) args: 8, res: 0, upd: 8; // CmmCall
cR0: // global
goto cQV; // CmmBranch
cQV: // global
// slowCall
R1 = _sQz::P64; // CmmAssign
call stg_ap_0_fast(R1) args: 8, res: 0, upd: 8; // CmmCall
}
}]
==================== Cmm produced by codegen ====================
[T24264.fun3_entry() { // [R2]
{ info_tbls: [(cRb,
label: T24264.fun3_info
rep: HeapRep static { Fun {arity: 2 fun_type: ArgSpec 5} }
srt: Nothing)]
stack_info: arg_space: 8
}
{offset
cRb: // global
_sQB::P64 = R2; // CmmAssign
goto cR6; // CmmBranch
cR6: // global
if ((old + 0) - <highSp> < SpLim) (likely: False) goto cRc; else goto cRd; // CmmCondBranch
cRc: // global
R2 = _sQB::P64; // CmmAssign
R1 = T24264.fun3_closure; // CmmAssign
call (stg_gc_fun)(R2, R1) args: 8, res: 0, upd: 8; // CmmCall
cRd: // global
goto cR5; // CmmBranch
cR5: // global
// slowCall
I64[(young<cR8> + 8)] = cR8; // CmmStore
R1 = _sQB::P64; // CmmAssign
call stg_ap_0_fast(R1) returns to cR8, args: 8, res: 8, upd: 8; // CmmCall
cR8: // global
_sQD::P64 = R1; // CmmAssign
// slow_call for _sQB::P64 with pat stg_ap_0
R1 = _sQD::P64; // CmmAssign
call (P64[(old + 8)])(R1) args: 8, res: 0, upd: 8; // CmmCall
}
}]
==================== Cmm produced by codegen ====================
[T24264.funPair_entry() { // [R2]
{ info_tbls: [(cRq,
label: T24264.funPair_info
rep: HeapRep static { Fun {arity: 2 fun_type: ArgSpec 5} }
srt: Nothing)]
stack_info: arg_space: 8
}
{offset
cRq: // global
_sQE::P64 = R2; // CmmAssign
goto cRl; // CmmBranch
cRl: // global
if ((old + 0) - <highSp> < SpLim) (likely: False) goto cRr; else goto cRs; // CmmCondBranch
cRr: // global
R2 = _sQE::P64; // CmmAssign
R1 = T24264.funPair_closure; // CmmAssign
call (stg_gc_fun)(R2, R1) args: 8, res: 0, upd: 8; // CmmCall
cRs: // global
goto cRk; // CmmBranch
cRk: // global
// slowCall
I64[(young<cRn> + 8)] = cRn; // CmmStore
R1 = _sQE::P64; // CmmAssign
call stg_ap_0_fast(R1) returns to cRn, args: 8, res: 8, upd: 8; // CmmCall
cRn: // global
_sQG::P64 = R1; // CmmAssign
// slow_call for _sQE::P64 with pat stg_ap_0
Hp = Hp + 24; // CmmAssign
if (Hp > HpLim) (likely: False) goto cRv; else goto cRu; // CmmCondBranch
cRv: // global
HpAlloc = 24; // CmmAssign
goto cRt; // CmmBranch
cRt: // global
R1 = _sQG::P64; // CmmAssign
call stg_gc_unpt_r1(R1) returns to cRn, args: 8, res: 8, upd: 8; // CmmCall
cRu: // global
// allocHeapClosure
I64[Hp - 16] = (,)_con_info; // CmmStore
P64[Hp - 8] = _sQG::P64; // CmmStore
P64[Hp] = _sQG::P64; // CmmStore
_cRp::P64 = Hp - 15; // CmmAssign
R1 = _cRp::P64; // CmmAssign
call (P64[(old + 8)])(R1) args: 8, res: 0, upd: 8; // CmmCall
}
}]
......@@ -128,3 +128,5 @@ test('T21710a', [ unless(tables_next_to_code(), skip) , when(wordsize(32), skip)
, grep_errmsg('(call)',[1]) ]
, compile, ['-ddump-cmm -dno-typeable-binds'])
test('T23002', normal, compile, ['-fregs-graph'])
test('T24264', [req_cmm, grep_errmsg(r'(.*\().*(\) returns to)', [1,2])],
compile, ['-O -ddump-cmm-from-stg -dno-typeable-binds'])
module Main where
import Control.Exception (evaluate)
import GHC.Exts (lazy, noinline)
data StrictPair a b = !a :*: !b
tailEval1 :: a -> IO a
{-# OPAQUE tailEval1 #-}
tailEval1 = lazy $ \x -> do
pure ()
pure $! x
tailEval2 :: a -> IO a
{-# OPAQUE tailEval2 #-}
tailEval2 x = evaluate x
go :: [a] -> IO ()
go = noinline mapM_ $ \x -> do
y1 <- tailEval1 x
y2 <- tailEval2 x
evaluate (y1 :*: y2)
main :: IO ()
main = do
let ints :: [Int]
ints = take 1000 $ noinline iterate (\x -> x * 35) 1
go ints
go [LT, EQ, GT]
go $ noinline map (toEnum @Ordering . flip mod 3) ints
go $ noinline map Left ints
go $ noinline map (+) ints
......@@ -240,3 +240,4 @@ test('MulMayOflo_full',
ignore_stdout],
multi_compile_and_run,
['MulMayOflo', [('MulMayOflo_full.cmm', '')], ''])
test('T24264run', normal, compile_and_run, [''])
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment