Commit 7c9e356d authored by duog's avatar duog Committed by Ben Gamari

Fix Work Balance computation in RTS stats

An additional stat is tracked per gc: par_balanced_copied This is the
the number of bytes copied by each gc thread under the balanced lmit,
which is simply (copied_bytes / num_gc_threads).  The stat is added to
all the appropriate GC structures, so is visible in the eventlog and in
GHC.Stats.

A note is added explaining how work balance is computed.

Remove some end of line whitespace

Test Plan:
./validate
experiment with the program attached to the ticket
examine code changes carefully

Reviewers: simonmar, austin, hvr, bgamari, erikd

Reviewed By: simonmar

Subscribers: Phyx, rwbarton, thomie

GHC Trac Issues: #13830

Differential Revision: https://phabricator.haskell.org/D3658
parent 905dc8bc
......@@ -135,6 +135,8 @@ typedef struct GCDetails_ {
uint64_t copied_bytes;
// In parallel GC, the max amount of data copied by any one thread
uint64_t par_max_copied_bytes;
// In parallel GC, the amount of balanced data copied by all threads
uint64_t par_balanced_copied_bytes;
// The time elapsed during synchronisation before GC
Time sync_elapsed_ns;
// The CPU time used during GC itself
......@@ -176,6 +178,8 @@ typedef struct _RTSStats {
uint64_t par_copied_bytes;
// Sum of par_max_copied_bytes across all parallel GCs
uint64_t cumulative_par_max_copied_bytes;
// Sum of par_balanced_copied_byes across all parallel GCs.
uint64_t cumulative_par_balanced_copied_bytes;
// -----------------------------------
// Cumulative stats about time use
......
......@@ -3,16 +3,16 @@
* (c) The GHC Team, 2008-2009
*
* Event log format
*
*
* The log format is designed to be extensible: old tools should be
* able to parse (but not necessarily understand all of) new versions
* of the format, and new tools will be able to understand old log
* files.
*
*
* Each event has a specific format. If you add new events, give them
* new numbers: we never re-use old event numbers.
*
* - The format is endian-independent: all values are represented in
* - The format is endian-independent: all values are represented in
* bigendian order.
*
* - The format is extensible:
......@@ -51,7 +51,7 @@
* Word8* -- extra info (for future extensions)
* EVENT_ET_END
*
* Event :
* Event :
* Word16 -- event_type
* Word64 -- time (nanosecs)
* [Word16] -- length of the rest (for variable-sized events only)
......@@ -154,7 +154,8 @@
#define EVENT_GC_STATS_GHC 53 /* (heap_capset, generation,
copied_bytes, slop_bytes, frag_bytes,
par_n_threads,
par_max_copied, par_tot_copied) */
par_max_copied,
par_tot_copied, par_balanced_copied) */
#define EVENT_GC_GLOBAL_SYNC 54 /* () */
#define EVENT_TASK_CREATE 55 /* (taskID, cap, tid) */
#define EVENT_TASK_MIGRATE 56 /* (taskID, cap, new_cap) */
......
......@@ -81,8 +81,10 @@ data RTSStats = RTSStats {
, copied_bytes :: Word64
-- | Sum of copied_bytes across all parallel GCs
, par_copied_bytes :: Word64
-- | Sum of par_max_copied_bytes across all parallel GCs
-- | Sum of par_max_copied_bytes across all parallel GCs. Deprecated.
, cumulative_par_max_copied_bytes :: Word64
-- | Sum of par_balanced_copied bytes across all parallel GCs
, cumulative_par_balanced_copied_bytes :: Word64
-- -----------------------------------
-- Cumulative stats about time use
......@@ -130,8 +132,11 @@ data GCDetails = GCDetails {
, gcdetails_mem_in_use_bytes :: Word64
-- | Total amount of data copied during this GC
, gcdetails_copied_bytes :: Word64
-- | In parallel GC, the max amount of data copied by any one thread
-- | In parallel GC, the max amount of data copied by any one thread.
-- Deprecated.
, gcdetails_par_max_copied_bytes :: Word64
-- | In parallel GC, the amount of balanced data copied by all threads
, gcdetails_par_balanced_copied_bytes :: Word64
-- | The time elapsed during synchronisation before GC
, gcdetails_sync_elapsed_ns :: RtsTime
-- | The CPU time used during GC itself
......@@ -170,6 +175,8 @@ getRTSStats = do
par_copied_bytes <- (# peek RTSStats, par_copied_bytes) p
cumulative_par_max_copied_bytes <-
(# peek RTSStats, cumulative_par_max_copied_bytes) p
cumulative_par_balanced_copied_bytes <-
(# peek RTSStats, cumulative_par_balanced_copied_bytes) p
mutator_cpu_ns <- (# peek RTSStats, mutator_cpu_ns) p
mutator_elapsed_ns <- (# peek RTSStats, mutator_elapsed_ns) p
gc_cpu_ns <- (# peek RTSStats, gc_cpu_ns) p
......@@ -190,6 +197,8 @@ getRTSStats = do
gcdetails_copied_bytes <- (# peek GCDetails, copied_bytes) pgc
gcdetails_par_max_copied_bytes <-
(# peek GCDetails, par_max_copied_bytes) pgc
gcdetails_par_balanced_copied_bytes <-
(# peek GCDetails, par_balanced_copied_bytes) pgc
gcdetails_sync_elapsed_ns <- (# peek GCDetails, sync_elapsed_ns) pgc
gcdetails_cpu_ns <- (# peek GCDetails, cpu_ns) pgc
gcdetails_elapsed_ns <- (# peek GCDetails, elapsed_ns) pgc
......@@ -259,8 +268,19 @@ data GCStats = GCStats
-- thread each GC. The ratio of 'parTotBytesCopied' divided by
-- 'parMaxBytesCopied' approaches 1 for a maximally sequential
-- run and approaches the number of threads (set by the RTS flag
-- @-N@) for a maximally parallel run.
-- @-N@) for a maximally parallel run. This is included for
-- backwards compatibility; to compute work balance use
-- `parBalancedBytesCopied`.
, parMaxBytesCopied :: !Int64
-- | Sum of number of balanced bytes copied on each thread of each GC.
-- Balanced bytes are those up to a
-- limit = (parTotBytesCopied / num_gc_threads).
-- This number is normalized so that when balance is perfect
-- @parBalancedBytesCopied = parTotBytesCopied@ and when all
-- gc is done by a single thread @parBalancedBytesCopied = 0@.
, parBalancedBytesCopied :: !Int64
} deriving (Show, Read)
-- | Retrieves garbage collection and memory statistics as of the last
......@@ -306,6 +326,8 @@ getGCStats = do
wallSeconds <- nsToSecs <$> (# peek RTSStats, elapsed_ns) p
parTotBytesCopied <- (# peek RTSStats, par_copied_bytes) p
parMaxBytesCopied <- (# peek RTSStats, cumulative_par_max_copied_bytes) p
parBalancedBytesCopied <-
(# peek RTSStats, cumulative_par_balanced_copied_bytes) p
return GCStats { .. }
nsToSecs :: Int64 -> Double
......
......@@ -62,7 +62,7 @@ provider HaskellEvent {
probe gc__work (EventCapNo);
probe gc__done (EventCapNo);
probe gc__global__sync (EventCapNo);
probe gc__stats (EventCapsetID, StgWord, StgWord, StgWord, StgWord, StgWord, StgWord, StgWord);
probe gc__stats (EventCapsetID, StgWord, StgWord, StgWord, StgWord, StgWord, StgWord, StgWord, StgWord);
probe heap__info (EventCapsetID, StgWord, StgWord, StgWord, StgWord, StgWord);
probe heap__allocated (EventCapNo, EventCapsetID, StgWord64);
probe heap__size (EventCapsetID, StgWord);
......
......@@ -149,6 +149,7 @@ initStats0(void)
.copied_bytes = 0,
.par_copied_bytes = 0,
.cumulative_par_max_copied_bytes = 0,
.cumulative_par_balanced_copied_bytes = 0,
.mutator_cpu_ns = 0,
.mutator_elapsed_ns = 0,
.gc_cpu_ns = 0,
......@@ -166,6 +167,7 @@ initStats0(void)
.mem_in_use_bytes = 0,
.copied_bytes = 0,
.par_max_copied_bytes = 0,
.par_balanced_copied_bytes = 0,
.sync_elapsed_ns = 0,
.cpu_ns = 0,
.elapsed_ns = 0
......@@ -283,7 +285,8 @@ stat_startGC (Capability *cap, gc_thread *gct)
void
stat_endGC (Capability *cap, gc_thread *gct,
W_ live, W_ copied, W_ slop, uint32_t gen,
uint32_t par_n_threads, W_ par_max_copied)
uint32_t par_n_threads, W_ par_max_copied,
W_ par_balanced_copied)
{
// -------------------------------------------------
// Collect all the stats about this GC in stats.gc. We always do this since
......@@ -305,6 +308,7 @@ stat_endGC (Capability *cap, gc_thread *gct,
stats.gc.mem_in_use_bytes = mblocks_allocated * MBLOCK_SIZE;
stats.gc.copied_bytes = copied * sizeof(W_);
stats.gc.par_max_copied_bytes = par_max_copied * sizeof(W_);
stats.gc.par_balanced_copied_bytes = par_balanced_copied * sizeof(W_);
// -------------------------------------------------
// Update the cumulative stats
......@@ -324,6 +328,8 @@ stat_endGC (Capability *cap, gc_thread *gct,
stats.par_copied_bytes += stats.gc.copied_bytes;
stats.cumulative_par_max_copied_bytes +=
stats.gc.par_max_copied_bytes;
stats.cumulative_par_balanced_copied_bytes +=
stats.gc.par_balanced_copied_bytes;
}
stats.gc_cpu_ns += stats.gc.cpu_ns;
stats.gc_elapsed_ns += stats.gc.elapsed_ns;
......@@ -385,7 +391,8 @@ stat_endGC (Capability *cap, gc_thread *gct,
* BLOCK_SIZE,
par_n_threads,
stats.gc.par_max_copied_bytes,
stats.gc.copied_bytes);
stats.gc.copied_bytes,
stats.gc.par_balanced_copied_bytes);
// Post EVENT_GC_END with the same timestamp as used for stats
// (though converted from Time=StgInt64 to EventTimestamp=StgWord64).
......@@ -672,11 +679,11 @@ stat_exit (void)
}
#if defined(THREADED_RTS)
if (RtsFlags.ParFlags.parGcEnabled && n_capabilities > 1) {
if (RtsFlags.ParFlags.parGcEnabled && stats.par_copied_bytes > 0) {
// See Note [Work Balance]
statsPrintf("\n Parallel GC work balance: %.2f%% (serial 0%%, perfect 100%%)\n",
100 * (((double)stats.par_copied_bytes / (double)stats.cumulative_par_max_copied_bytes) - 1)
/ (n_capabilities - 1)
);
100 * (double)stats.cumulative_par_balanced_copied_bytes /
(double)stats.par_copied_bytes);
}
#endif
statsPrintf("\n");
......@@ -825,6 +832,84 @@ stat_exit (void)
}
}
/* Note [Work Balance]
----------------------
Work balance is a measure of how evenly the work done during parallel garbage
collection is spread across threads. To compute work balance we must take care
to account for the number of GC threads changing between GCs. The statistics we
track must have the number of GC threads "integrated out".
We accumulate two values from each garbage collection:
* par_copied: is a measure of the total work done during parallel garbage
collection
* par_balanced_copied: is a measure of the balanced work done
during parallel garbage collection.
par_copied is simple to compute, but par_balanced_copied_bytes is somewhat more
complicated:
For a given garbage collection:
Let gc_copied := total copies during the gc
gc_copied_i := copies by the ith thread during the gc
num_gc_threads := the number of threads participating in the gc
balance_limit := (gc_copied / num_gc_threads)
If we were to graph gc_copied_i, sorted from largest to smallest we would see
something like:
|X
^ |X X
| |X X X X: unbalanced copies
copies |----------- Y: balanced copies by the busiest GC thread
|Y Z Z Z: other balanced copies
|Y Z Z Z
|Y Z Z Z Z
|Y Z Z Z Z Z
|===========
|1 2 3 4 5 6
i ->
where the --- line is at balance_limit. Balanced copies are those under the ---
line, i.e. the area of the Ys and Zs. Note that the area occupied by the Ys will
always equal balance_limit. Completely balanced gc has every thread copying
balance_limit and a completely unbalanced gc has a single thread copying
gc_copied.
One could define par_balance_copied as the areas of the Ys and Zs in the graph
above, however we would like the ratio of (par_balance_copied / gc_copied) to
range from 0 to 1, so that work_balance will be a nice percentage, also ranging
from 0 to 1. We therefore define par_balanced_copied as:
( num_gc_threads )
{Sum[Min(gc_copied_i,balance_limit)] - balance_limit} * (------------------)
i (num_gc_threads - 1)
vvv vvv
S T
Where the S and T terms serve to remove the area of the Ys, and
to normalize the result to lie between 0 and gc_copied.
Note that the implementation orders these operations differently to minimize
error due to integer rounding.
Then cumulative work balance is computed as
(cumulative_par_balanced_copied_bytes / par_copied_byes)
Previously, cumulative work balance was computed as:
(cumulative_par_max_copied_bytes)
(-------------------------------) - 1
( par_copied_bytes )
-------------------------------------
(n_capabilities - 1)
This was less accurate than the current method, and invalid whenever a garbage
collection had occurred with num_gc_threads /= n_capabilities; which can happen
when setNumCapabilities is called, when -qn is passed as an RTS option, or when
the number of gc threads is limited to the number of cores.
See #13830
*/
/* -----------------------------------------------------------------------------
stat_describe_gens
......
......@@ -30,7 +30,7 @@ void stat_startGCSync(struct gc_thread_ *_gct);
void stat_startGC(Capability *cap, struct gc_thread_ *_gct);
void stat_endGC (Capability *cap, struct gc_thread_ *_gct, W_ live,
W_ copied, W_ slop, uint32_t gen, uint32_t n_gc_threads,
W_ par_max_copied);
W_ par_max_copied, W_ par_balanced_copied);
#if defined(PROFILING)
void stat_startRP(void);
......
......@@ -347,7 +347,8 @@ void traceEventGcStats_ (Capability *cap,
W_ fragmentation,
uint32_t par_n_threads,
W_ par_max_copied,
W_ par_tot_copied)
W_ par_tot_copied,
W_ par_balanced_copied)
{
#if defined(DEBUG)
if (RtsFlags.TraceFlags.tracing == TRACE_STDERR) {
......@@ -357,7 +358,8 @@ void traceEventGcStats_ (Capability *cap,
{
postEventGcStats(cap, heap_capset, gen,
copied, slop, fragmentation,
par_n_threads, par_max_copied, par_tot_copied);
par_n_threads, par_max_copied,
par_tot_copied, par_balanced_copied);
}
}
......
......@@ -151,7 +151,8 @@ void traceEventGcStats_ (Capability *cap,
W_ fragmentation,
uint32_t par_n_threads,
W_ par_max_copied,
W_ par_tot_copied);
W_ par_tot_copied,
W_ par_balanced_copied);
/*
* Record a spark event
......@@ -302,7 +303,8 @@ void traceHeapProfSampleCostCentre(StgWord8 profile_id,
#define traceGcEventAtT(cap, ts, tag) /* nothing */
#define traceEventGcStats_(cap, heap_capset, gen, \
copied, slop, fragmentation, \
par_n_threads, par_max_copied, par_tot_copied) /* nothing */
par_n_threads, par_max_copied, \
par_tot_copied, par_balanced_copied) /* nothing */
#define traceHeapEvent(cap, tag, heap_capset, info1) /* nothing */
#define traceEventHeapInfo_(heap_capset, gens, \
maxHeapSize, allocAreaSize, \
......@@ -395,11 +397,13 @@ void dtraceUserMarkerWrapper(Capability *cap, char *msg);
copies, slop, fragmentation, \
par_n_threads, \
par_max_copied, \
par_tot_copied) \
par_tot_copied, \
par_balanced_copied)
HASKELLEVENT_GC_STATS(heap_capset, gens, \
copies, slop, fragmentation, \
par_n_threads, \
par_max_copied, \
par_balanced_copied, \
par_tot_copied)
#define dtraceHeapInfo(heap_capset, gens, \
maxHeapSize, allocAreaSize, \
......@@ -470,7 +474,8 @@ void dtraceUserMarkerWrapper(Capability *cap, char *msg);
copies, slop, fragmentation, \
par_n_threads, \
par_max_copied, \
par_tot_copied) /* nothing */
par_tot_copied, \
par_balanced_copied) /* nothing */
#define dtraceHeapInfo(heap_capset, gens, \
maxHeapSize, allocAreaSize, \
mblockSize, blockSize) /* nothing */
......@@ -663,16 +668,19 @@ INLINE_HEADER void traceEventGcStats(Capability *cap STG_UNUSED,
W_ fragmentation STG_UNUSED,
uint32_t par_n_threads STG_UNUSED,
W_ par_max_copied STG_UNUSED,
W_ par_tot_copied STG_UNUSED)
W_ par_tot_copied STG_UNUSED,
W_ par_balanced_copied STG_UNUSED)
{
if (RTS_UNLIKELY(TRACE_gc)) {
traceEventGcStats_(cap, heap_capset, gen,
copied, slop, fragmentation,
par_n_threads, par_max_copied, par_tot_copied);
par_n_threads, par_max_copied,
par_tot_copied, par_balanced_copied);
}
dtraceEventGcStats(heap_capset, gen,
copied, slop, fragmentation,
par_n_threads, par_max_copied, par_tot_copied);
par_n_threads, par_max_copied,
par_tot_copied, par_balanced_copied);
}
INLINE_HEADER void traceEventHeapInfo(CapsetID heap_capset STG_UNUSED,
......
......@@ -412,12 +412,14 @@ initEventLogging(const EventLogWriter *ev_writer)
case EVENT_GC_STATS_GHC: // (heap_capset, generation,
// copied_bytes, slop_bytes, frag_bytes,
// par_n_threads,
// par_max_copied, par_tot_copied)
// par_max_copied, par_tot_copied,
// par_balanced_copied
// )
eventTypes[t].size = sizeof(EventCapsetID)
+ sizeof(StgWord16)
+ sizeof(StgWord64) * 3
+ sizeof(StgWord32)
+ sizeof(StgWord64) * 2;
+ sizeof(StgWord64) * 3;
break;
case EVENT_TASK_CREATE: // (taskId, cap, tid)
......@@ -903,7 +905,8 @@ void postEventGcStats (Capability *cap,
W_ fragmentation,
uint32_t par_n_threads,
W_ par_max_copied,
W_ par_tot_copied)
W_ par_tot_copied,
W_ par_balanced_copied)
{
EventsBuf *eb = &capEventBuf[cap->no];
ensureRoomForEvent(eb, EVENT_GC_STATS_GHC);
......@@ -911,7 +914,8 @@ void postEventGcStats (Capability *cap,
postEventHeader(eb, EVENT_GC_STATS_GHC);
/* EVENT_GC_STATS_GHC (heap_capset, generation,
copied_bytes, slop_bytes, frag_bytes,
par_n_threads, par_max_copied, par_tot_copied) */
par_n_threads, par_max_copied,
par_tot_copied, par_balanced_copied) */
postCapsetID(eb, heap_capset);
postWord16(eb, gen);
postWord64(eb, copied);
......@@ -920,6 +924,7 @@ void postEventGcStats (Capability *cap,
postWord32(eb, par_n_threads);
postWord64(eb, par_max_copied);
postWord64(eb, par_tot_copied);
postWord64(eb, par_balanced_copied);
}
void postTaskCreateEvent (EventTaskId taskId,
......
......@@ -121,7 +121,8 @@ void postEventGcStats (Capability *cap,
W_ fragmentation,
uint32_t par_n_threads,
W_ par_max_copied,
W_ par_tot_copied);
W_ par_tot_copied,
W_ par_balanced_copied);
void postTaskCreateEvent (EventTaskId taskId,
EventCapNo cap,
......
......@@ -188,7 +188,7 @@ GarbageCollect (uint32_t collect_gen,
{
bdescr *bd;
generation *gen;
StgWord live_blocks, live_words, par_max_copied;
StgWord live_blocks, live_words, par_max_copied, par_balanced_copied;
#if defined(THREADED_RTS)
gc_thread *saved_gct;
#endif
......@@ -460,8 +460,14 @@ GarbageCollect (uint32_t collect_gen,
copied = 0;
par_max_copied = 0;
par_balanced_copied = 0;
{
uint32_t i;
uint64_t par_balanced_copied_acc = 0;
for (i=0; i < n_gc_threads; i++) {
copied += gc_threads[i]->copied;
}
for (i=0; i < n_gc_threads; i++) {
if (n_gc_threads > 1) {
debugTrace(DEBUG_gc,"thread %d:", i);
......@@ -471,11 +477,20 @@ GarbageCollect (uint32_t collect_gen,
debugTrace(DEBUG_gc," no_work %ld", gc_threads[i]->no_work);
debugTrace(DEBUG_gc," scav_find_work %ld", gc_threads[i]->scav_find_work);
}
copied += gc_threads[i]->copied;
par_max_copied = stg_max(gc_threads[i]->copied, par_max_copied);
par_balanced_copied_acc +=
stg_min(n_gc_threads * gc_threads[i]->copied, copied);
}
if (n_gc_threads == 1) {
par_max_copied = 0;
par_balanced_copied = 0;
}
else
{
// See Note [Work Balance] for an explanation of this computation
par_balanced_copied =
(par_balanced_copied_acc - copied + (n_gc_threads - 1) / 2) /
(n_gc_threads - 1);
}
}
......@@ -782,7 +797,7 @@ GarbageCollect (uint32_t collect_gen,
// ok, GC over: tell the stats department what happened.
stat_endGC(cap, gct, live_words, copied,
live_blocks * BLOCK_SIZE_W - live_words /* slop */,
N, n_gc_threads, par_max_copied);
N, n_gc_threads, par_max_copied, par_balanced_copied);
#if defined(RTS_USER_SIGNALS)
if (RtsFlags.MiscFlags.install_signal_handlers) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment