Commit 19670bc3 authored by Ömer Sinan Ağacan's avatar Ömer Sinan Ağacan

Fix negative mutator time in GC stats in prof builds

Because garbage collector calls `retainerProfile()` and `heapCensus()`,
GC times normally include some of PROF times too. To fix this we have
these lines:

    // heapCensus() is called by the GC, so RP and HC time are
    // included in the GC stats.  We therefore subtract them to
    // obtain the actual GC cpu time.
    stats.gc_cpu_ns      -=  prof_cpu;
    stats.gc_elapsed_ns  -=  prof_elapsed;

These variables are later used for calculating GC time excluding the
final GC (which should be attributed to EXIT).

    exit_gc_elapsed      = stats.gc_elapsed_ns - start_exit_gc_elapsed;

The problem is if we subtract PROF times from `gc_elapsed_ns` and then
subtract `start_exit_gc_elapsed` from the result, we end up subtracting
PROF times twice, because `start_exit_gc_elapsed` also includes PROF
times.

We now subtract PROF times from GC after the calculations for EXIT and
MUT times. The existing assertion that checks

    INIT + MUT + GC + EXIT = TOTAL

now holds. When we subtract PROF numbers from GC, and a new assertion

    INIT + MUT + GC + PROF + EXIT = TOTAL

also holds.

Fixes #15897. New assertions added in this commit also revealed #16102,
which is also fixed by this commit.
parent 74cd4ec5
......@@ -2018,6 +2018,10 @@ forkProcess(HsStablePtr *entry
} else { // child
// Current process times reset in the child process, so we should reset
// the stats too. See #16102.
resetChildProcessStats();
#if defined(THREADED_RTS)
initMutex(&sched_mutex);
initMutex(&sm_mutex);
......
......@@ -43,12 +43,6 @@ static Time HC_start_time, HC_tot_time = 0; // heap census prof user time
static Time HCe_start_time, HCe_tot_time = 0; // heap census prof elap time
#endif
#if defined(PROFILING)
#define PROF_VAL(x) (x)
#else
#define PROF_VAL(x) 0
#endif
#if defined(PROF_SPIN)
volatile StgWord64 whitehole_lockClosure_spin = 0;
volatile StgWord64 whitehole_lockClosure_yield = 0;
......@@ -204,11 +198,11 @@ initStats0(void)
initStats1() can be called after setupRtsFlags()
------------------------------------------------------------------------ */
void initGenerationStats(void);
void
initStats1 (void)
{
uint32_t i;
if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
statsPrintf(" Alloc Copied Live GC GC TOT TOT Page Flts\n");
statsPrintf(" bytes bytes bytes user elap user elap\n");
......@@ -225,13 +219,29 @@ initStats1 (void)
(Time *)stgMallocBytes(
sizeof(Time)*RtsFlags.GcFlags.generations,
"initStats");
for (i = 0; i < RtsFlags.GcFlags.generations; i++) {
initGenerationStats();
}
void
initGenerationStats()
{
for (uint32_t i = 0; i < RtsFlags.GcFlags.generations; i++) {
GC_coll_cpu[i] = 0;
GC_coll_elapsed[i] = 0;
GC_coll_max_pause[i] = 0;
}
}
/* ---------------------------------------------------------------------------
Reset stats of child process after fork()
------------------------------------------------------------------------ */
void resetChildProcessStats()
{
initStats0();
initGenerationStats();
}
/* -----------------------------------------------------------------------------
Initialisation time...
-------------------------------------------------------------------------- */
......@@ -517,14 +527,14 @@ stat_endRP(
RP_tot_time += user - RP_start_time;
RPe_tot_time += elapsed - RPe_start_time;
fprintf(prof_file, "Retainer Profiling: %d, at %f seconds\n",
retainerGeneration, mut_user_time_during_RP());
fprintf(prof_file, "Retainer Profiling: %d, at %f seconds\n",
retainerGeneration, mut_user_time_during_RP());
#if defined(DEBUG_RETAINER)
fprintf(prof_file, "\tMax C stack size = %u\n", maxCStackSize);
fprintf(prof_file, "\tMax auxiliary stack size = %u\n", maxStackSize);
fprintf(prof_file, "\tMax C stack size = %u\n", maxCStackSize);
fprintf(prof_file, "\tMax auxiliary stack size = %u\n", maxStackSize);
#endif
fprintf(prof_file, "\tAverage number of visits per object = %f\n",
averageNumVisit);
fprintf(prof_file, "\tAverage number of visits per object = %f\n",
averageNumVisit);
}
#endif /* PROFILING */
......@@ -1049,21 +1059,15 @@ void
stat_exit (void)
{
RTSSummaryStats sum;
uint32_t g;
init_RTSSummaryStats(&sum);
if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) {
// First we tidy the times in stats, and populate the times in sum.
// In particular, we adjust the gc_* time in stats to remove
// profiling times.
{
Time now_cpu_ns, now_elapsed_ns;
Time exit_gc_cpu = 0;
Time exit_gc_elapsed = 0;
Time prof_cpu = 0;
Time prof_elapsed = 0;
getProcessTimes( &now_cpu_ns, &now_elapsed_ns);
getProcessTimes(&now_cpu_ns, &now_elapsed_ns);
stats.cpu_ns = now_cpu_ns - start_init_cpu;
stats.elapsed_ns = now_elapsed_ns - start_init_elapsed;
......@@ -1072,15 +1076,6 @@ stat_exit (void)
if (stats.cpu_ns <= 0) { stats.cpu_ns = 1; }
if (stats.elapsed_ns <= 0) { stats.elapsed_ns = 1; }
prof_cpu = PROF_VAL(RP_tot_time + HC_tot_time);
prof_elapsed = PROF_VAL(RPe_tot_time + HCe_tot_time);
// heapCensus() is called by the GC, so RP and HC time are
// included in the GC stats. We therefore subtract them to
// obtain the actual GC cpu time.
stats.gc_cpu_ns -= prof_cpu;
stats.gc_elapsed_ns -= prof_elapsed;
#if defined(PROFILING)
sum.rp_cpu_ns = RP_tot_time;
sum.rp_elapsed_ns = RPe_tot_time;
......@@ -1091,8 +1086,11 @@ stat_exit (void)
// We do a GC during the EXIT phase. We'll attribute the cost of
// that to GC instead of EXIT, so carefully subtract it from the
// EXIT time.
exit_gc_cpu = stats.gc_cpu_ns - start_exit_gc_cpu;
exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed;
// Note that exit_gc includes RP and HC for the exit GC too.
Time exit_gc_cpu = stats.gc_cpu_ns - start_exit_gc_cpu;
Time exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed;
ASSERT(exit_gc_elapsed > 0);
sum.exit_cpu_ns = end_exit_cpu
- start_exit_cpu
......@@ -1101,14 +1099,16 @@ stat_exit (void)
- start_exit_elapsed
- exit_gc_elapsed;
ASSERT(sum.exit_elapsed_ns >= 0);
stats.mutator_cpu_ns = start_exit_cpu
- end_init_cpu
- (stats.gc_cpu_ns - exit_gc_cpu)
- prof_cpu;
- (stats.gc_cpu_ns - exit_gc_cpu);
stats.mutator_elapsed_ns = start_exit_elapsed
- end_init_elapsed
- (stats.gc_elapsed_ns - exit_gc_elapsed)
- prof_elapsed;
- (stats.gc_elapsed_ns - exit_gc_elapsed);
ASSERT(stats.mutator_elapsed_ns >= 0);
if (stats.mutator_cpu_ns < 0) { stats.mutator_cpu_ns = 0; }
......@@ -1116,12 +1116,29 @@ stat_exit (void)
// and subtracting, so the parts should add up to the total exactly.
// Note that stats->total_ns is captured a tiny bit later than
// end_exit_elapsed, so we don't use it here.
ASSERT(stats.init_elapsed_ns \
+ stats.mutator_elapsed_ns \
+ stats.gc_elapsed_ns \
+ sum.exit_elapsed_ns \
ASSERT(stats.init_elapsed_ns // INIT
+ stats.mutator_elapsed_ns // MUT
+ stats.gc_elapsed_ns // GC
+ sum.exit_elapsed_ns // EXIT
== end_exit_elapsed - start_init_elapsed);
// heapCensus() is called by the GC, so RP and HC time are
// included in the GC stats. We therefore subtract them to
// obtain the actual GC cpu time.
Time prof_cpu = sum.rp_cpu_ns + sum.hc_cpu_ns;
Time prof_elapsed = sum.rp_elapsed_ns + sum.hc_elapsed_ns;
stats.gc_cpu_ns -= prof_cpu;
stats.gc_elapsed_ns -= prof_elapsed;
// This assertion is probably not necessary; make sure the
// subdivision with PROF also makes sense
ASSERT(stats.init_elapsed_ns // INIT
+ stats.mutator_elapsed_ns // MUT
+ stats.gc_elapsed_ns // GC
+ sum.exit_elapsed_ns // EXIT
+ (sum.rp_elapsed_ns + sum.hc_elapsed_ns) // PROF
== end_exit_elapsed - start_init_elapsed);
}
// REVIEWERS: it's not clear to me why the following isn't done in
......@@ -1142,10 +1159,9 @@ stat_exit (void)
// We populate the remainder (non-time elements) of sum
{
#if defined(THREADED_RTS)
uint32_t i;
sum.bound_task_count = taskCount - workerCount;
for (i = 0; i < n_capabilities; i++) {
for (uint32_t i = 0; i < n_capabilities; i++) {
sum.sparks.created += capabilities[i]->spark_stats.created;
sum.sparks.dud += capabilities[i]->spark_stats.dud;
sum.sparks.overflowed+=
......@@ -1195,22 +1211,22 @@ stat_exit (void)
sum.productivity_cpu_percent =
TimeToSecondsDbl(stats.cpu_ns
- stats.gc_cpu_ns
- sum.rp_cpu_ns
- sum.hc_cpu_ns
- stats.init_cpu_ns
- sum.exit_cpu_ns)
/ TimeToSecondsDbl(stats.cpu_ns);
ASSERT(sum.productivity_cpu_percent >= 0);
sum.productivity_elapsed_percent =
TimeToSecondsDbl(stats.elapsed_ns
- stats.gc_elapsed_ns
- sum.rp_elapsed_ns
- sum.hc_elapsed_ns
- stats.init_elapsed_ns
- sum.exit_elapsed_ns)
/ TimeToSecondsDbl(stats.elapsed_ns);
for(g = 0; g < RtsFlags.GcFlags.generations; ++g) {
ASSERT(sum.productivity_elapsed_percent >= 0);
for(uint32_t g = 0; g < RtsFlags.GcFlags.generations; ++g) {
const generation* gen = &generations[g];
GenerationSummaryStats* gen_stats = &sum.gc_summary_stats[g];
gen_stats->collections = gen->collections;
......
......@@ -59,6 +59,7 @@ void stat_workerStop(void);
void initStats0(void);
void initStats1(void);
void resetChildProcessStats(void);
double mut_user_time_until(Time t);
double mut_user_time(void);
......
......@@ -22,3 +22,11 @@ T14257:
./T14257 +RTS -hc
# Make sure that samples are monotonically increasing
awk 'BEGIN{t=0} /BEGIN_SAMPLE/{if ($$2 < t) print "uh oh", $$t, $$0; t=$$2;}' T14257.hp
.PHONY: T15897
T15897:
# The bug is caught by an assertion so we run the tests with debug runtime
# and ignore the outputs
"$(TEST_HC)" -prof -fprof-auto -debug -v0 T15897.hs
./T15897 10000000 +RTS -s -hc 2>/dev/null
./T15897 10000000 +RTS -s -hr 2>/dev/null
import Control.Monad
import Data.IORef
import System.Environment
main = do
[n] <- getArgs
replicateM (read n) (newIORef [1,2,3])
......@@ -145,3 +145,8 @@ test('toplevel_scc_1',
test('T12962', [], compile_and_run, [''])
test('T14257', [], run_command, ['$MAKE -s --no-print-directory T14257'])
test('T15897',
[extra_ways(['profasm']), only_ways(['profasm']), run_timeout_multiplier(2)],
run_command,
['$MAKE -s --no-print-directory T15897'])
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment