Commit 394231b3 authored by Simon Marlow's avatar Simon Marlow Committed by Ben Gamari

Fix cost-centre-stacks bug (#5654)

This fixes some cases of wrong stacks being generated by the profiler.
For background and details on the fix see
`Note [Evaluating functions with profiling]` in `rts/Apply.cmm`.

This does have an impact on allocations for some programs when
profiling.  nofib results:

```
   k-nucleotide          +0.0%     +8.8%    +11.0%    +11.0%      0.0%
         puzzle          +0.0%    +12.5%     0.244     0.246      0.0%
      typecheck           0.0%     +8.7%    +16.1%    +16.2%      0.0%
------------------------------------------------------------------------
--------
            Min          -0.0%     -0.0%    -34.4%    -35.5%    -25.0%
            Max          +0.0%    +12.5%    +48.9%    +49.4%    +10.6%
 Geometric Mean          +0.0%     +0.6%     +2.0%     +1.8%     -0.3%

```

But runtimes don't seem to be affected much, and the examples I looked
at were completely legitimate.  For example, in puzzle we have this:

```
position :: ItemType -> StateType ->  BankType
position Bono = bonoPos
position Edge = edgePos
position Larry = larryPos
position Adam = adamPos
```

where the identifiers on the rhs are all record selectors.  Previously
the profiler gave a stack that looked like

```
  position
  bonoPos
  ...
```

i.e. `bonoPos` was at the same level of the call stack as `position`,
but now it looks like

```
  position
   bonoPos
   ...
```

I used the normaliser from the testsuite to diff the profiling output
from other nofib programs and they all looked better.

Test Plan:
* the broken test passes
* validate
* compiled and ran all of nofib, measured perf, diff'd several .prof
files

Reviewers: niteria, erikd, austin, scpmw, bgamari

Reviewed By: bgamari

Subscribers: thomie

Differential Revision: https://phabricator.haskell.org/D2804

GHC Trac Issues: #5654, #10007
parent 5c76f834
......@@ -565,8 +565,10 @@ getCallMethod dflags _ id _ n_args v_args _cg_loc
getCallMethod dflags name id (LFReEntrant _ _ arity _ _) n_args _v_args _cg_loc
_self_loop_info
| n_args == 0 = ASSERT( arity /= 0 )
ReturnIt -- No args at all
| n_args == 0 -- No args at all
&& not (gopt Opt_SccProfilingOn dflags)
-- See Note [Evaluating functions with profiling] in rts/Apply.cmm
= ASSERT( arity /= 0 ) ReturnIt
| n_args < arity = SlowCall -- Not enough args
| otherwise = DirectEntry (enterIdLabel dflags name (idCafInfo id)) arity
......
......@@ -288,7 +288,13 @@
// explicit jumps, for use when we are doing the stack management
// ourselves.
#if defined(PROFILING)
// See Note [Evaluating functions with profiling] in rts/Apply.cmm
#define ENTER(x) jump stg_ap_0_fast(x);
#else
#define ENTER(x) ENTER_(return,x)
#endif
#define ENTER_R1() ENTER_(RET_R1,R1)
#define RET_R1(x) jump %ENTRY_CODE(Sp(0)) [R1]
......
......@@ -30,7 +30,114 @@ stg_ap_0_fast ( P_ fun )
IF_DEBUG(sanity,
ccall checkStackFrame(Sp "ptr"));
#if !defined(PROFILING)
ENTER(fun);
#else
/*
Note [Evaluating functions with profiling]
If we evaluate something like
let f = {-# SCC "f" #-} g
where g is a function, then updating the thunk for f to point to g
would be incorrect: we've lost the SCC annotation. In general, when
we evaluate a function and the current CCS is different from the one
stored in the function, we need to return a function with the
correct CCS in it.
The mechanism we use to wrap the function is to create a
zero-argument PAP as a proxy object to hold the new CCS, and return
that.
*/
again:
W_ info;
W_ untaggedfun;
untaggedfun = UNTAG(fun);
info = %INFO_PTR(untaggedfun);
switch [INVALID_OBJECT .. N_CLOSURE_TYPES]
(TO_W_( %INFO_TYPE(%STD_INFO(info)) )) {
case
IND,
IND_STATIC:
{
fun = StgInd_indirectee(fun);
goto again;
}
case
FUN,
FUN_1_0,
FUN_0_1,
FUN_2_0,
FUN_1_1,
FUN_0_2,
FUN_STATIC,
BCO:
{
if (CCCS == StgHeader_ccs(untaggedfun)) {
return (fun);
} else {
// We're going to build a new PAP, with zero extra
// arguments and therefore the same arity as the
// original function. In other words, we're using a
// zero-argument PAP as an indirection to the
// function, so that we can attach a different CCS to
// it.
HP_CHK_GEN(SIZEOF_StgPAP);
TICK_ALLOC_PAP(SIZEOF_StgPAP, 0);
// attribute this allocation to the "overhead of profiling"
CCS_ALLOC(BYTES_TO_WDS(SIZEOF_StgPAP), CCS_OVERHEAD);
P_ pap;
W_ arity;
pap = Hp - SIZEOF_StgPAP + WDS(1);
SET_HDR(pap, stg_PAP_info, CCCS);
arity = TO_W_(StgFunInfoExtra_arity(%FUN_INFO(info)));
StgPAP_arity(pap) = arity;
StgPAP_fun(pap) = fun;
StgPAP_n_args(pap) = 0;
return (pap);
}
}
case PAP:
{
if (CCCS == StgHeader_ccs(untaggedfun)) {
return (fun);
} else {
// We're going to copy this PAP, and put the new CCS in it
fun = untaggedfun;
W_ size;
size = SIZEOF_StgPAP + WDS(TO_W_(StgPAP_n_args(fun)));
HP_CHK_GEN(size);
TICK_ALLOC_PAP(size, 0);
// attribute this allocation to the "overhead of profiling"
CCS_ALLOC(BYTES_TO_WDS(SIZEOF_StgPAP), CCS_OVERHEAD);
P_ pap;
pap = Hp - size + WDS(1);
SET_HDR(pap, stg_PAP_info, CCCS);
StgPAP_arity(pap) = StgPAP_arity(fun);
StgPAP_n_args(pap) = StgPAP_n_args(fun);
StgPAP_fun(pap) = StgPAP_fun(fun);
W_ i;
i = TO_W_(StgPAP_n_args(fun));
loop:
if (i == 0) {
return (pap);
}
i = i - 1;
StgPAP_payload(pap,i) = StgPAP_payload(fun,i);
goto loop;
}
}
default:
{
jump %ENTRY_CODE(info) (UNTAG(fun));
}
}
#endif
}
/* -----------------------------------------------------------------------------
......
Thu Dec 8 11:35 2016 Time and Allocation Profiling Report (Final)
T5654-O0 +RTS -p -RTS
total time = 0.00 secs (0 ticks @ 1000 us, 1 processor)
total alloc = 39,184 bytes (excludes profiling overheads)
COST CENTRE MODULE SRC %time %alloc
MAIN MAIN <built-in> 0.0 1.0
CAF GHC.IO.Handle.FD <entire-module> 0.0 88.3
CAF GHC.IO.Encoding <entire-module> 0.0 7.1
CAF GHC.Conc.Signal <entire-module> 0.0 1.6
individual inherited
COST CENTRE MODULE SRC no. entries %time %alloc %time %alloc
MAIN MAIN <built-in> 104 0 0.0 1.0 0.0 100.0
CAF Main <entire-module> 207 0 0.0 0.9 0.0 1.4
f Main T5654-O0.hs:7:1-5 209 1 0.0 0.1 0.0 0.1
main Main T5654-O0.hs:13:1-21 208 1 0.0 0.3 0.0 0.4
f Main T5654-O0.hs:7:1-5 210 0 0.0 0.0 0.0 0.1
g Main T5654-O0.hs:11:1-11 211 1 0.0 0.1 0.0 0.1
CAF GHC.Conc.Signal <entire-module> 201 0 0.0 1.6 0.0 1.6
CAF GHC.IO.Encoding <entire-module> 191 0 0.0 7.1 0.0 7.1
CAF GHC.IO.Encoding.Iconv <entire-module> 189 0 0.0 0.6 0.0 0.6
CAF GHC.IO.Handle.FD <entire-module> 181 0 0.0 88.3 0.0 88.3
-- Tests for a bug in the handling of cost-centre stacks in the
-- runtime, where we lose the current cost-centre stack when
-- evaluating a function.
{-# NOINLINE f #-}
f :: Int -> Int
f = g -- here we should remember the stack under which g was evaluated
{-# NOINLINE g #-}
g :: Int -> Int
g x = x + 1
main = return $! f 3
Thu Dec 8 11:34 2016 Time and Allocation Profiling Report (Final)
T5654-O1 +RTS -p -RTS
total time = 0.00 secs (0 ticks @ 1000 us, 1 processor)
total alloc = 39,064 bytes (excludes profiling overheads)
COST CENTRE MODULE SRC %time %alloc
MAIN MAIN <built-in> 0.0 1.9
CAF GHC.IO.Handle.FD <entire-module> 0.0 88.6
CAF GHC.IO.Encoding <entire-module> 0.0 7.1
CAF GHC.Conc.Signal <entire-module> 0.0 1.6
individual inherited
COST CENTRE MODULE SRC no. entries %time %alloc %time %alloc
MAIN MAIN <built-in> 104 0 0.0 1.9 0.0 100.0
CAF Main <entire-module> 207 0 0.0 0.0 0.0 0.2
main Main T5654-O1.hs:13:1-21 208 1 0.0 0.1 0.0 0.2
f Main T5654-O1.hs:7:1-5 209 1 0.0 0.0 0.0 0.0
g Main T5654-O1.hs:11:1-11 210 1 0.0 0.0 0.0 0.0
CAF GHC.Conc.Signal <entire-module> 201 0 0.0 1.6 0.0 1.6
CAF GHC.IO.Encoding <entire-module> 191 0 0.0 7.1 0.0 7.1
CAF GHC.IO.Encoding.Iconv <entire-module> 189 0 0.0 0.6 0.0 0.6
CAF GHC.IO.Handle.FD <entire-module> 181 0 0.0 88.6 0.0 88.6
Wed Nov 23 11:52 2011 Time and Allocation Profiling Report (Final)
scc004 +RTS -p -RTS
total time = 0.00 secs (0 ticks @ 20 ms)
total alloc = 39,400 bytes (excludes profiling overheads)
COST CENTRE MODULE %time %alloc
MAIN MAIN 0.0 1.7
CAF GHC.Conc.Signal 0.0 1.7
CAF GHC.IO.Encoding 0.0 7.1
CAF GHC.IO.Handle.FD 0.0 88.7
individual inherited
COST CENTRE MODULE no. entries %time %alloc %time %alloc
MAIN MAIN 98 0 0.0 1.7 0.0 100.0
CAF GHC.IO.Handle.FD 124 0 0.0 88.7 0.0 88.7
CAF GHC.IO.Encoding.Iconv 116 0 0.0 0.6 0.0 0.6
CAF GHC.IO.Encoding 114 0 0.0 7.1 0.0 7.1
CAF GHC.Conc.Signal 106 0 0.0 1.7 0.0 1.7
CAF Main 104 0 0.0 0.1 0.0 0.2
f Main 197 1 0.0 0.0 0.0 0.0
main Main 196 1 0.0 0.0 0.0 0.1
f Main 198 0 0.0 0.0 0.0 0.0
g Main 199 1 0.0 0.0 0.0 0.0
Sat Jun 4 11:59 2016 Time and Allocation Profiling Report (Final)
Thu Dec 8 15:23 2016 Time and Allocation Profiling Report (Final)
T680 +RTS -hc -p -RTS
total time = 0.00 secs (0 ticks @ 1000 us, 1 processor)
total alloc = 752,952 bytes (excludes profiling overheads)
total alloc = 753,032 bytes (excludes profiling overheads)
COST CENTRE MODULE SRC %time %alloc
......@@ -17,26 +17,26 @@ foo.bar.\ Main T680.hs:(8,11)-(9,38) 0.0 38.2
individual inherited
COST CENTRE MODULE SRC no. entries %time %alloc %time %alloc
MAIN MAIN <built-in> 46 0 0.0 0.1 0.0 100.0
CAF Main <entire-module> 91 0 0.0 0.0 0.0 93.6
k Main T680.hs:12:1-17 95 1 0.0 0.0 0.0 0.0
foo Main T680.hs:(2,1)-(9,38) 96 1 0.0 0.0 0.0 0.0
foo.bar Main T680.hs:(5,3)-(9,38) 100 1 0.0 0.0 0.0 0.0
foo.bar.k' Main T680.hs:6:9-34 104 1 0.0 0.0 0.0 0.0
k.\ Main T680.hs:12:16 105 1 0.0 0.0 0.0 0.0
main Main T680.hs:20:1-14 92 1 0.0 0.0 0.0 0.0
r Main T680.hs:18:1-26 94 1 0.0 0.0 0.0 93.5
k Main T680.hs:12:1-17 97 0 0.0 0.0 0.0 93.5
foo Main T680.hs:(2,1)-(9,38) 98 0 0.0 0.0 0.0 93.5
foo.\ Main T680.hs:3:12-40 99 4001 0.0 25.5 0.0 25.5
foo.bar Main T680.hs:(5,3)-(9,38) 101 0 0.0 29.8 0.0 68.0
foo.bar.\ Main T680.hs:(8,11)-(9,38) 102 4001 0.0 38.2 0.0 38.2
foo.bar.\.k'' Main T680.hs:8:15-27 103 4000 0.0 0.0 0.0 0.0
foo.bar.k' Main T680.hs:6:9-34 106 0 0.0 0.0 0.0 0.0
CAF GHC.IO.Handle.FD <entire-module> 84 0 0.0 4.6 0.0 4.6
CAF GHC.IO.Handle.Text <entire-module> 83 0 0.0 0.0 0.0 0.0
CAF GHC.Conc.Signal <entire-module> 80 0 0.0 0.1 0.0 0.1
CAF GHC.IO.Encoding <entire-module> 78 0 0.0 0.4 0.0 0.4
CAF GHC.Show <entire-module> 76 0 0.0 0.0 0.0 0.0
CAF GHC.IO.Encoding.Iconv <entire-module> 62 0 0.0 0.0 0.0 0.0
main Main T680.hs:20:1-14 93 0 0.0 1.2 0.0 1.2
MAIN MAIN <built-in> 104 0 0.0 0.1 0.0 100.0
CAF Main <entire-module> 207 0 0.0 0.0 0.0 93.5
k Main T680.hs:12:1-17 211 1 0.0 0.0 0.0 0.0
foo Main T680.hs:(2,1)-(9,38) 212 1 0.0 0.0 0.0 0.0
foo.bar Main T680.hs:(5,3)-(9,38) 216 1 0.0 0.0 0.0 0.0
foo.bar.k' Main T680.hs:6:9-34 220 1 0.0 0.0 0.0 0.0
k.\ Main T680.hs:12:16 221 1 0.0 0.0 0.0 0.0
main Main T680.hs:20:1-14 208 1 0.0 0.0 0.0 0.0
r Main T680.hs:18:1-26 210 1 0.0 0.0 0.0 93.5
k Main T680.hs:12:1-17 213 0 0.0 0.0 0.0 93.5
foo Main T680.hs:(2,1)-(9,38) 214 0 0.0 0.0 0.0 93.5
foo.\ Main T680.hs:3:12-40 215 4001 0.0 25.5 0.0 25.5
foo.bar Main T680.hs:(5,3)-(9,38) 217 0 0.0 29.8 0.0 68.0
foo.bar.\ Main T680.hs:(8,11)-(9,38) 218 4001 0.0 38.2 0.0 38.2
foo.bar.\.k'' Main T680.hs:8:15-27 219 4000 0.0 0.0 0.0 0.0
foo.bar.k' Main T680.hs:6:9-34 222 0 0.0 0.0 0.0 0.0
CAF GHC.Conc.Signal <entire-module> 201 0 0.0 0.1 0.0 0.1
CAF GHC.IO.Encoding <entire-module> 191 0 0.0 0.4 0.0 0.4
CAF GHC.IO.Encoding.Iconv <entire-module> 189 0 0.0 0.0 0.0 0.0
CAF GHC.IO.Handle.FD <entire-module> 181 0 0.0 4.6 0.0 4.6
CAF GHC.IO.Handle.Text <entire-module> 179 0 0.0 0.0 0.0 0.0
CAF GHC.Show <entire-module> 165 0 0.0 0.0 0.0 0.0
main Main T680.hs:20:1-14 209 0 0.0 1.2 0.0 1.2
......@@ -47,7 +47,9 @@ test('scc002', [], compile_and_run, [''])
test('scc003', [], compile_and_run,
['-fno-state-hack']) # Note [consistent stacks]
test('T5654', [expect_broken(5654)], compile_and_run, [''])
test('T5654-O0', [only_ways(['prof'])], compile_and_run, [''])
test('T5654-O1', [only_ways(['profasm'])], compile_and_run, [''])
test('T5654b-O0', [only_ways(['prof'])], compile_and_run, [''])
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment