Commit 2f463873 authored by Simon Marlow's avatar Simon Marlow

Detect overly long GC sync

Summary:
GC sync is the time between a GC being intiated and all the mutator
threads finally stopping so that the GC can start. Problems that cause
the GC sync to be delayed are hard to find and can cause dramatic
slowdowns for heavily parallel programs.

The new flag --long-gc-sync=<time> helps by emitting a warning and
calling a user-overridable hook when the GC sync time exceeds the
specified threshold. A debugger can be used to set a breakpoint when
this happens and inspect the stacks of threads to find the culprit.

Test Plan:
```
$ ./inplace/bin/ghc-stage2 +RTS --long-gc-sync=0.0000001 -S
    Alloc    Copied     Live     GC     GC      TOT      TOT  Page Flts
    bytes     bytes     bytes   user   elap     user     elap
  1135856     51144    153736  0.000  0.000    0.002    0.002    0    0  (Gen:  0)
  1034760     94704    188752  0.000  0.000    0.002    0.002    0    0  (Gen:  0)
  1038888    134832    228888  0.009  0.009    0.011    0.011    0    0  (Gen:  1)
  1025288     90128    235184  0.000  0.000    0.012    0.012    0    0  (Gen:  0)
  1049088    130080    333984  0.000  0.000    0.013    0.013    0    0  (Gen:  0)
Warning: waited 0us for GC sync
  1034424     73360    331976  0.000  0.000    0.013    0.013    0    0  (Gen:  0)
```

Also tested on a real production problem.

Reviewers: niteria, bgamari, erikd

Subscribers: rwbarton, thomie

Differential Revision: https://phabricator.haskell.org/D4193
parent 07ac921f
......@@ -728,6 +728,44 @@ performance.
that indicates the NUMA nodes on which to run the program. For
example, ``--numa=3`` would run the program on NUMA nodes 0 and 1.
.. rts-flag:: --long-gc-sync
--long-gc-sync=<seconds>
.. index::
single: GC sync time, measuring
When a GC starts, all the running mutator threads have to stop and
synchronise. The period between when the GC is initiated and all
the mutator threads are stopped is called the GC synchronisation
phase. If this phase is taking a long time (longer than 1ms is
considered long), then it can have a severe impact on overall
throughput.
A long GC sync can be caused by a mutator thread that is inside an
``unsafe`` FFI call, or running in a loop that doesn't allocate
memory and so doesn't yield. To fix the former, make the call
``safe``, and to fix the latter, either avoid calling the code in
question or compile it with :ghc-flag:`-fomit-yields`.
By default, the flag will cause a warning to be emitted to stderr
when the sync time exceeds the specified time. This behaviour can
be overriden, however: the ``longGCSync()`` hook is called when
the sync time is exceeded during the sync period, and the
``longGCSyncEnd()`` hook at the end. Both of these hooks can be
overriden in the ``RtsConfig`` when the runtime is started with
``hs_init_ghc()``. The default implementations of these hooks
(``LongGcSync()`` and ``LongGCSyncEnd()`` respectively) print
warnings to stderr.
One way to use this flag is to set a breakpoint on
``LongGCSync()`` in the debugger, and find the thread that is
delaying the sync. You probably want to use :ghc-flag:`-g` to
provide more info to the debugger.
The GC sync time, along with other GC stats, are available by
calling the ``getRTSStats()`` function from C, or
``GHC.Stats.getRTSStats`` from Haskell.
.. _rts-options-statistics:
RTS options to produce runtime statistics
......
......@@ -102,6 +102,10 @@ typedef struct {
// Called for every GC
void (* gcDoneHook) (const struct GCDetails_ *stats);
// Called when GC sync takes too long (+RTS --long-gc-sync=<time>)
void (* longGCSync) (uint32_t this_cap, Time time_ns);
void (* longGCSyncEnd) (Time time_ns);
} RtsConfig;
// Clients should start with defaultRtsConfig and then customise it.
......
......@@ -65,6 +65,8 @@ typedef struct _GC_FLAGS {
Time idleGCDelayTime; /* units: TIME_RESOLUTION */
bool doIdleGC;
Time longGCSync; /* units: TIME_RESOLUTION */
StgWord heapBase; /* address to ask the OS for memory */
StgWord allocLimitGrace; /* units: *blocks*
......
......@@ -69,7 +69,9 @@ const RtsConfig defaultRtsConfig = {
.stackOverflowHook = StackOverflowHook,
.outOfHeapHook = OutOfHeapHook,
.mallocFailHook = MallocFailHook,
.gcDoneHook = NULL
.gcDoneHook = NULL,
.longGCSync = LongGCSync,
.longGCSyncEnd = LongGCSyncEnd
};
/*
......@@ -165,6 +167,7 @@ void initRtsFlagsDefaults(void)
RtsFlags.GcFlags.numa = false;
RtsFlags.GcFlags.numaMask = 1;
RtsFlags.GcFlags.ringBell = false;
RtsFlags.GcFlags.longGCSync = 0; /* detection turned off */
RtsFlags.DebugFlags.scheduler = false;
RtsFlags.DebugFlags.interpreter = false;
......@@ -937,6 +940,16 @@ error = true;
}
}
#endif
else if (!strncmp("long-gc-sync=", &rts_argv[arg][2], 13)) {
OPTION_SAFE;
if (rts_argv[arg][2] == '\0') {
/* use default */
} else {
RtsFlags.GcFlags.longGCSync =
fsecondsToTime(atof(rts_argv[arg]+16));
}
break;
}
else {
OPTION_SAFE;
errorBelch("unknown RTS option: %s",rts_argv[arg]);
......
......@@ -22,5 +22,7 @@ extern void StackOverflowHook (W_ stack_size);
extern void OutOfHeapHook (W_ request_size, W_ heap_size);
extern void MallocFailHook (W_ request_size /* in bytes */, const char *msg);
extern void FlagDefaultsHook (void);
extern void LongGCSync (uint32_t capno, Time t);
extern void LongGCSyncEnd (Time t);
#include "EndPrivate.h"
/* -----------------------------------------------------------------------------
*
* User-overridable RTS hooks.
*
* ---------------------------------------------------------------------------*/
#include "PosixSource.h"
#include "Rts.h"
#include "sm/GC.h"
#include "sm/GCThread.h"
#include "Hooks.h"
/*
* Called when --long-gc-sync=<time> has expired during a GC sync. The idea is
* that you can set a breakpoint on this function in gdb and try to determine
* which thread was holding up the GC sync.
*/
void LongGCSync (uint32_t me USED_IF_THREADS, Time t STG_UNUSED)
{
#if defined(THREADED_RTS)
{
uint32_t i;
for (i=0; i < n_capabilities; i++) {
if (i != me && gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) {
debugBelch("Warning: slow GC sync: still waiting for cap %d\n",
i);
}
}
}
#endif
}
/*
* Called at the end of a GC sync which was longer than --long-gc-sync=<time>.
* The idea is that you can use this function to log stats about the length of
* GC syncs.
*/
void LongGCSyncEnd (Time t)
{
debugBelch("Warning: waited %" FMT_Word64 "us for GC sync\n", TimeToUS(t));
}
......@@ -48,6 +48,7 @@
#include "Stable.h"
#include "CheckUnload.h"
#include "CNF.h"
#include "RtsFlags.h"
#include <string.h> // for memset()
#include <unistd.h>
......@@ -825,11 +826,6 @@ static void heapOverflow(void)
Initialise the gc_thread structures.
-------------------------------------------------------------------------- */
#define GC_THREAD_INACTIVE 0
#define GC_THREAD_STANDING_BY 1
#define GC_THREAD_RUNNING 2
#define GC_THREAD_WAITING_TO_CONTINUE 3
static void
new_gc_thread (uint32_t n, gc_thread *t)
{
......@@ -1132,6 +1128,9 @@ waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
const uint32_t me = cap->no;
uint32_t i, j;
bool retry = true;
Time t0, t1, t2;
t0 = t1 = t2 = getProcessElapsedTime();
while(retry) {
for (i=0; i < n_threads; i++) {
......@@ -1153,6 +1152,19 @@ waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
if (!retry) break;
yieldThread();
}
t2 = getProcessElapsedTime();
if (RtsFlags.GcFlags.longGCSync != 0 &&
t2 - t1 > RtsFlags.GcFlags.longGCSync) {
/* call this every longGCSync of delay */
rtsConfig.longGCSync(cap->no, t2 - t0);
t1 = t2;
}
}
if (RtsFlags.GcFlags.longGCSync != 0 &&
t2 - t0 > RtsFlags.GcFlags.longGCSync) {
rtsConfig.longGCSyncEnd(t2 - t0);
}
}
......
......@@ -116,6 +116,12 @@ typedef struct gen_workspace_ {
of the GC threads
------------------------------------------------------------------------- */
/* values for the wakeup field */
#define GC_THREAD_INACTIVE 0
#define GC_THREAD_STANDING_BY 1
#define GC_THREAD_RUNNING 2
#define GC_THREAD_WAITING_TO_CONTINUE 3
typedef struct gc_thread_ {
Capability *cap;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment