......@@ -940,19 +940,23 @@
return (dst);
//
// Nonmoving write barrier helpers
//
// See Note [Update remembered set] in NonMovingMark.c.
#if defined(THREADED_RTS)
#define IF_WRITE_BARRIER_ENABLED \
#define IF_NONMOVING_WRITE_BARRIER_ENABLED \
if (W_[nonmoving_write_barrier_enabled] != 0) (likely: False)
#else
// A similar measure is also taken in rts/NonMoving.h, but that isn't visible from C--
#define IF_WRITE_BARRIER_ENABLED \
#define IF_NONMOVING_WRITE_BARRIER_ENABLED \
if (0)
#define nonmoving_write_barrier_enabled 0
#endif
// A useful helper for pushing a pointer to the update remembered set.
// See Note [Update remembered set] in NonMovingMark.c.
#define updateRemembSetPushPtr(p) \
IF_WRITE_BARRIER_ENABLED { \
IF_NONMOVING_WRITE_BARRIER_ENABLED { \
ccall updateRemembSetPushClosure_(BaseReg "ptr", p "ptr"); \
}
......@@ -74,6 +74,10 @@ extern "C" {
#define RTS_UNREACHABLE abort()
#endif
/* Prefetch primitives */
#define prefetchForRead(ptr) __builtin_prefetch(ptr, 0)
#define prefetchForWrite(ptr) __builtin_prefetch(ptr, 1)
/* Fix for mingw stat problem (done here so it's early enough) */
#if defined(mingw32_HOST_OS)
#define __MSVCRT__ 1
......
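As an aside, here is a minimal usage sketch for the prefetch helpers added above (hypothetical list-walking code, not part of this patch); the hint is purely advisory and expands to __builtin_prefetch as defined above.
/* Hypothetical sketch: prefetch the next list cell one step ahead of the
 * cell currently being summed.  Assumes the prefetchForRead macro above is
 * in scope (e.g. via Rts.h). */
struct IntCell { struct IntCell *link; int value; };
static int sumCells(const struct IntCell *c)
{
    int acc = 0;
    while (c) {
        if (c->link) prefetchForRead(c->link);  /* read-only prefetch hint */
        acc += c->value;
        c = c->link;
    }
    return acc;
}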
......@@ -151,6 +151,23 @@ typedef struct GCDetails_ {
Time cpu_ns;
// The time elapsed during GC itself
Time elapsed_ns;
//
// Concurrent garbage collector
//
// The CPU time used during the post-mark pause phase of the concurrent
// nonmoving GC.
Time nonmoving_gc_sync_cpu_ns;
// The time elapsed during the post-mark pause phase of the concurrent
// nonmoving GC.
Time nonmoving_gc_sync_elapsed_ns;
// The CPU time used by the concurrent nonmoving GC.
Time nonmoving_gc_cpu_ns;
// The time elapsed during the concurrent nonmoving GC.
Time nonmoving_gc_elapsed_ns;
} GCDetails;
//
......@@ -241,6 +258,28 @@ typedef struct _RTSStats {
// The number of times a GC thread has iterated its outer loop across all
// parallel GCs
uint64_t scav_find_work;
// ----------------------------------
// Concurrent garbage collector
// The CPU time used during the post-mark pause phase of the concurrent
// nonmoving GC.
Time nonmoving_gc_sync_cpu_ns;
// The time elapsed during the post-mark pause phase of the concurrent
// nonmoving GC.
Time nonmoving_gc_sync_elapsed_ns;
// The maximum time elapsed during the post-mark pause phase of the
// concurrent nonmoving GC.
Time nonmoving_gc_sync_max_elapsed_ns;
// The CPU time used by the concurrent nonmoving GC.
Time nonmoving_gc_cpu_ns;
// The time elapsed during the concurrent nonmoving GC.
Time nonmoving_gc_elapsed_ns;
// The maximum time elapsed during any single cycle of the concurrent
// nonmoving GC.
Time nonmoving_gc_max_elapsed_ns;
} RTSStats;
void getRTSStats (RTSStats *s);
......
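A minimal sketch of how a program embedding the RTS might read the new counters through getRTSStats (field names as declared above; times are in nanoseconds; the reporting helper itself is hypothetical and must only be called after the RTS has been initialised).
#include "Rts.h"   /* brings in RtsAPI.h, where RTSStats and getRTSStats live */
#include <stdio.h>
/* Hypothetical helper: report the nonmoving collector's pause and
 * concurrent-phase times accumulated so far. */
static void reportNonmovingGcTimes(void)
{
    RTSStats s;
    getRTSStats(&s);
    printf("nonmoving sync pauses: %.3fs elapsed, %.3fs max\n",
           (double) s.nonmoving_gc_sync_elapsed_ns / 1e9,
           (double) s.nonmoving_gc_sync_max_elapsed_ns / 1e9);
    printf("nonmoving concurrent:  %.3fs CPU, %.3fs elapsed\n",
           (double) s.nonmoving_gc_cpu_ns / 1e9,
           (double) s.nonmoving_gc_elapsed_ns / 1e9);
}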
......@@ -182,12 +182,21 @@
#define EVENT_USER_BINARY_MSG 181
#define EVENT_CONC_MARK_BEGIN 200
#define EVENT_CONC_MARK_END 201
#define EVENT_CONC_SYNC_BEGIN 202
#define EVENT_CONC_SYNC_END 203
#define EVENT_CONC_SWEEP_BEGIN 204
#define EVENT_CONC_SWEEP_END 205
#define EVENT_CONC_UPD_REM_SET_FLUSH 206
#define EVENT_NONMOVING_HEAP_CENSUS 207
/*
* The highest event code +1 that ghc itself emits. Note that some event
* ranges higher than this are reserved but not currently emitted by ghc.
* This must match the size of the EventDesc[] array in EventLog.c
*/
#define NUM_GHC_EVENT_TAGS 182
#define NUM_GHC_EVENT_TAGS 208
#if 0 /* DEPRECATED EVENTS: */
/* we don't actually need to record the thread, it's implicit */
......
......@@ -169,6 +169,7 @@ typedef struct _TRACE_FLAGS {
bool timestamp; /* show timestamp in stderr output */
bool scheduler; /* trace scheduler events */
bool gc; /* trace GC events */
bool nonmoving_gc; /* trace nonmoving GC events */
bool sparks_sampled; /* trace spark events by a sampled method */
bool sparks_full; /* trace spark events 100% accurately */
bool user; /* trace user events (emitted from Haskell code) */
......
......@@ -21,4 +21,7 @@ void updateRemembSetPushClosure(Capability *cap, StgClosure *p);
void updateRemembSetPushThunk_(StgRegTable *reg, StgThunk *p);
// Note that RTS code should not condition on this directly, but should
// instead use the IF_NONMOVING_WRITE_BARRIER_ENABLED macro to ensure that
// the barrier is eliminated in the non-threaded RTS.
extern StgWord DLL_IMPORT_DATA_VAR(nonmoving_write_barrier_enabled);
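The intended pattern, mirroring the call sites rewritten later in this commit (for example in resumeThread below), is to push the value that is about to be overwritten before mutating and let the macro compile away in the non-threaded RTS. Sketch only; obj, field and new_value are placeholders.
/* Sketch of the usage pattern described above; obj/field/new_value are
 * placeholders, not code from this patch. */
IF_NONMOVING_WRITE_BARRIER_ENABLED {
    // Record the old value so the concurrent mark still sees it even though
    // the mutator is about to drop this reference to it.
    updateRemembSetPushClosure(cap, (StgClosure *) obj->field);
}
obj->field = new_value;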
......@@ -88,17 +88,23 @@ typedef struct bdescr_ {
StgPtr start; // [READ ONLY] start addr of memory
StgPtr free; // First free byte of memory.
// allocGroup() sets this to the value of start.
// NB. during use this value should lie
// between start and start + blocks *
// BLOCK_SIZE. Values outside this
// range are reserved for use by the
// block allocator. In particular, the
// value (StgPtr)(-1) is used to
// indicate that a block is unallocated.
//
// Unused by the non-moving allocator.
union {
StgPtr free; // First free byte of memory.
// allocGroup() sets this to the value of start.
// NB. during use this value should lie
// between start and start + blocks *
// BLOCK_SIZE. Values outside this
// range are reserved for use by the
// block allocator. In particular, the
// value (StgPtr)(-1) is used to
// indicate that a block is unallocated.
//
// Unused by the non-moving allocator.
struct NonmovingSegmentInfo {
StgWord8 log_block_size;
StgWord16 next_free_snap;
} nonmoving_segment;
};
struct bdescr_ *link; // used for chaining blocks together
......
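For blocks owned by the nonmoving heap the free pointer is meaningless, and the anonymous union above lets the same space carry per-segment metadata instead. A hypothetical accessor illustrating the overlap (assuming, as the field name suggests, that log_block_size is the base-2 logarithm of the segment's allocation block size):
/* Hypothetical sketch: which union member is valid depends on which
 * allocator owns the block. */
static StgWord segmentBlockSize(const bdescr *bd)
{
    // Only meaningful for blocks holding nonmoving segments; for blocks in
    // the moving heap this storage is the `free' pointer instead.
    return (StgWord) 1 << bd->nonmoving_segment.log_block_size;
}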
......@@ -231,6 +231,9 @@ typedef struct StgTSO_ {
* setting the stack's mark bit (either the BF_MARKED bit for large objects
* or otherwise its bit in its segment's mark bitmap).
*
* To ensure that mutation does not proceed until the stack is fully marked,
* the mark phase must not set the mark bit until it has finished tracing.
*
*/
#define STACK_DIRTY 1
......
......@@ -292,6 +292,8 @@ data TraceFlags = TraceFlags
, timestamp :: Bool -- ^ show timestamp in stderr output
, traceScheduler :: Bool -- ^ trace scheduler events
, traceGc :: Bool -- ^ trace GC events
, traceNonmovingGc
:: Bool -- ^ trace nonmoving GC heap census samples
, sparksSampled :: Bool -- ^ trace spark events by a sampled method
, sparksFull :: Bool -- ^ trace spark events 100% accurately
, user :: Bool -- ^ trace user events (emitted from Haskell code)
......@@ -525,6 +527,8 @@ getTraceFlags = do
(#{peek TRACE_FLAGS, scheduler} ptr :: IO CBool))
<*> (toBool <$>
(#{peek TRACE_FLAGS, gc} ptr :: IO CBool))
<*> (toBool <$>
(#{peek TRACE_FLAGS, nonmoving_gc} ptr :: IO CBool))
<*> (toBool <$>
(#{peek TRACE_FLAGS, sparks_sampled} ptr :: IO CBool))
<*> (toBool <$>
......
......@@ -103,6 +103,25 @@ data RTSStats = RTSStats {
-- | Total elapsed time (at the previous GC)
, elapsed_ns :: RtsTime
-- | The CPU time used during the post-mark pause phase of the concurrent
-- nonmoving GC.
, nonmoving_gc_sync_cpu_ns :: RtsTime
-- | The time elapsed during the post-mark pause phase of the concurrent
-- nonmoving GC.
, nonmoving_gc_sync_elapsed_ns :: RtsTime
-- | The maximum time elapsed during the post-mark pause phase of the
-- concurrent nonmoving GC.
, nonmoving_gc_sync_max_elapsed_ns :: RtsTime
-- | The CPU time used by the concurrent nonmoving GC.
, nonmoving_gc_cpu_ns :: RtsTime
-- | The time elapsed during the concurrent nonmoving GC.
, nonmoving_gc_elapsed_ns :: RtsTime
-- | The maximum time elapsed during any single cycle of the concurrent
-- nonmoving GC.
, nonmoving_gc_max_elapsed_ns :: RtsTime
-- | Details about the most recent GC
, gc :: GCDetails
} deriving ( Read -- ^ @since 4.10.0.0
......@@ -146,6 +165,13 @@ data GCDetails = GCDetails {
, gcdetails_cpu_ns :: RtsTime
-- | The time elapsed during GC itself
, gcdetails_elapsed_ns :: RtsTime
-- | The CPU time used during the post-mark pause phase of the concurrent
-- nonmoving GC.
, gcdetails_nonmoving_gc_sync_cpu_ns :: RtsTime
-- | The time elapsed during the post-mark pause phase of the concurrent
-- nonmoving GC.
, gcdetails_nonmoving_gc_sync_elapsed_ns :: RtsTime
} deriving ( Read -- ^ @since 4.10.0.0
, Show -- ^ @since 4.10.0.0
)
......@@ -192,6 +218,12 @@ getRTSStats = do
gc_elapsed_ns <- (# peek RTSStats, gc_elapsed_ns) p
cpu_ns <- (# peek RTSStats, cpu_ns) p
elapsed_ns <- (# peek RTSStats, elapsed_ns) p
nonmoving_gc_sync_cpu_ns <- (# peek RTSStats, nonmoving_gc_sync_cpu_ns) p
nonmoving_gc_sync_elapsed_ns <- (# peek RTSStats, nonmoving_gc_sync_elapsed_ns) p
nonmoving_gc_sync_max_elapsed_ns <- (# peek RTSStats, nonmoving_gc_sync_max_elapsed_ns) p
nonmoving_gc_cpu_ns <- (# peek RTSStats, nonmoving_gc_cpu_ns) p
nonmoving_gc_elapsed_ns <- (# peek RTSStats, nonmoving_gc_elapsed_ns) p
nonmoving_gc_max_elapsed_ns <- (# peek RTSStats, nonmoving_gc_max_elapsed_ns) p
let pgc = (# ptr RTSStats, gc) p
gc <- do
gcdetails_gen <- (# peek GCDetails, gen) pgc
......@@ -211,5 +243,7 @@ getRTSStats = do
gcdetails_sync_elapsed_ns <- (# peek GCDetails, sync_elapsed_ns) pgc
gcdetails_cpu_ns <- (# peek GCDetails, cpu_ns) pgc
gcdetails_elapsed_ns <- (# peek GCDetails, elapsed_ns) pgc
gcdetails_nonmoving_gc_sync_cpu_ns <- (# peek GCDetails, nonmoving_gc_sync_cpu_ns) pgc
gcdetails_nonmoving_gc_sync_elapsed_ns <- (# peek GCDetails, nonmoving_gc_sync_elapsed_ns) pgc
return GCDetails{..}
return RTSStats{..}
......@@ -13,6 +13,7 @@
void initializeTimer (void);
Time getProcessCPUTime (void);
Time getMyThreadCPUTime (void);
void getProcessTimes (Time *user, Time *elapsed);
/* Get the current date and time.
......
......@@ -256,7 +256,7 @@ loop:
// point to the BLOCKING_QUEUE from the BLACKHOLE
write_barrier(); // make the BQ visible
if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
IF_NONMOVING_WRITE_BARRIER_ENABLED {
updateRemembSetPushClosure(cap, (StgClosure*)p);
}
((StgInd*)bh)->indirectee = (StgClosure *)bq;
......@@ -287,7 +287,7 @@ loop:
}
#endif
if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
IF_NONMOVING_WRITE_BARRIER_ENABLED {
// We are about to overwrite bq->queue; make sure its current value
// makes it into the update remembered set
updateRemembSetPushClosure(cap, (StgClosure*)bq->queue);
......
......@@ -474,7 +474,7 @@ stg_copyArray_barrier ( W_ hdr_size, gcptr dst, W_ dst_off, W_ n)
end = p + WDS(n);
again:
IF_WRITE_BARRIER_ENABLED {
IF_NONMOVING_WRITE_BARRIER_ENABLED {
ccall updateRemembSetPushClosure_(BaseReg "ptr", W_[p] "ptr");
}
p = p + WDS(1);
......@@ -490,7 +490,7 @@ stg_copySmallArrayzh ( gcptr src, W_ src_off, gcptr dst, W_ dst_off, W_ n)
W_ dst_p, src_p, bytes;
if (n > 0) {
IF_WRITE_BARRIER_ENABLED {
IF_NONMOVING_WRITE_BARRIER_ENABLED {
call stg_copyArray_barrier(SIZEOF_StgSmallMutArrPtrs,
dst, dst_off, n);
}
......@@ -511,7 +511,7 @@ stg_copySmallMutableArrayzh ( gcptr src, W_ src_off, gcptr dst, W_ dst_off, W_ n
W_ dst_p, src_p, bytes;
if (n > 0) {
IF_WRITE_BARRIER_ENABLED {
IF_NONMOVING_WRITE_BARRIER_ENABLED {
call stg_copyArray_barrier(SIZEOF_StgSmallMutArrPtrs,
dst, dst_off, n);
}
......
......@@ -218,6 +218,7 @@ void initRtsFlagsDefaults(void)
RtsFlags.TraceFlags.timestamp = false;
RtsFlags.TraceFlags.scheduler = false;
RtsFlags.TraceFlags.gc = false;
RtsFlags.TraceFlags.nonmoving_gc = false;
RtsFlags.TraceFlags.sparks_sampled= false;
RtsFlags.TraceFlags.sparks_full = false;
RtsFlags.TraceFlags.user = false;
......@@ -2103,6 +2104,10 @@ static void read_trace_flags(const char *arg)
RtsFlags.TraceFlags.gc = enabled;
enabled = true;
break;
case 'n':
RtsFlags.TraceFlags.nonmoving_gc = enabled;
enabled = true;
break;
case 'u':
RtsFlags.TraceFlags.user = enabled;
enabled = true;
......
......@@ -297,8 +297,10 @@ static StgClosure *lock_tvar(Capability *cap,
} while (cas((void *)&(s -> current_value),
(StgWord)result, (StgWord)trec) != (StgWord)result);
if (RTS_UNLIKELY(nonmoving_write_barrier_enabled && result)) {
updateRemembSetPushClosure(cap, result);
IF_NONMOVING_WRITE_BARRIER_ENABLED {
if (result)
updateRemembSetPushClosure(cap, result);
}
return result;
}
......@@ -323,8 +325,9 @@ static StgBool cond_lock_tvar(Capability *cap,
TRACE("%p : cond_lock_tvar(%p, %p)", trec, s, expected);
w = cas((void *)&(s -> current_value), (StgWord)expected, (StgWord)trec);
result = (StgClosure *)w;
if (RTS_UNLIKELY(nonmoving_write_barrier_enabled && result)) {
updateRemembSetPushClosure(cap, expected);
IF_NONMOVING_WRITE_BARRIER_ENABLED {
if (result)
updateRemembSetPushClosure(cap, expected);
}
TRACE("%p : %s", trec, result ? "success" : "failure");
return (result == expected);
......
......@@ -164,7 +164,8 @@ static void scheduleHandleThreadBlocked( StgTSO *t );
static bool scheduleHandleThreadFinished( Capability *cap, Task *task,
StgTSO *t );
static bool scheduleNeedHeapProfile(bool ready_to_gc);
static void scheduleDoGC(Capability **pcap, Task *task, bool force_major);
static void scheduleDoGC( Capability **pcap, Task *task,
bool force_major, bool deadlock_detect );
static void deleteThread (StgTSO *tso);
static void deleteAllThreads (void);
......@@ -264,7 +265,7 @@ schedule (Capability *initialCapability, Task *task)
case SCHED_INTERRUPTING:
debugTrace(DEBUG_sched, "SCHED_INTERRUPTING");
/* scheduleDoGC() deletes all the threads */
scheduleDoGC(&cap,task,true);
scheduleDoGC(&cap,task,true,false);
// after scheduleDoGC(), we must be shutting down. Either some
// other Capability did the final GC, or we did it above,
......@@ -561,7 +562,7 @@ run_thread:
}
if (ready_to_gc || scheduleNeedHeapProfile(ready_to_gc)) {
scheduleDoGC(&cap,task,false);
scheduleDoGC(&cap,task,false,false);
}
} /* end of while() */
}
......@@ -935,7 +936,7 @@ scheduleDetectDeadlock (Capability **pcap, Task *task)
// they are unreachable and will therefore be sent an
// exception. Any threads thus released will be immediately
// runnable.
scheduleDoGC (pcap, task, true/*force major GC*/);
scheduleDoGC (pcap, task, true/*force major GC*/, true/*deadlock detection*/);
cap = *pcap;
// when force_major == true. scheduleDoGC sets
// recent_activity to ACTIVITY_DONE_GC and turns off the timer
......@@ -1005,7 +1006,7 @@ scheduleProcessInbox (Capability **pcap USED_IF_THREADS)
while (!emptyInbox(cap)) {
// Executing messages might use heap, so we should check for GC.
if (doYouWantToGC(cap)) {
scheduleDoGC(pcap, cap->running_task, false);
scheduleDoGC(pcap, cap->running_task, false, false);
cap = *pcap;
}
......@@ -1552,9 +1553,11 @@ void releaseAllCapabilities(uint32_t n, Capability *keep_cap, Task *task)
* Perform a garbage collection if necessary
* -------------------------------------------------------------------------- */
// N.B. See Note [Deadlock detection under nonmoving collector] for rationale
// behind deadlock_detect argument.
static void
scheduleDoGC (Capability **pcap, Task *task USED_IF_THREADS,
bool force_major)
bool force_major, bool deadlock_detect)
{
Capability *cap = *pcap;
bool heap_census;
......@@ -1847,9 +1850,9 @@ delete_threads_and_gc:
// emerge they don't immediately re-enter the GC.
pending_sync = 0;
signalCondition(&sync_finished_cond);
GarbageCollect(collect_gen, heap_census, gc_type, cap, idle_cap);
GarbageCollect(collect_gen, heap_census, deadlock_detect, gc_type, cap, idle_cap);
#else
GarbageCollect(collect_gen, heap_census, 0, cap, NULL);
GarbageCollect(collect_gen, heap_census, deadlock_detect, 0, cap, NULL);
#endif
// If we're shutting down, don't leave any idle GC work to do.
......@@ -2500,7 +2503,7 @@ resumeThread (void *task_)
incall->suspended_tso = NULL;
incall->suspended_cap = NULL;
// we will modify tso->_link
if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
IF_NONMOVING_WRITE_BARRIER_ENABLED {
updateRemembSetPushClosure(cap, (StgClosure *)tso->_link);
}
tso->_link = END_TSO_QUEUE;
......@@ -2717,7 +2720,7 @@ exitScheduler (bool wait_foreign USED_IF_THREADS)
nonmovingExit();
Capability *cap = task->cap;
waitForCapability(&cap,task);
scheduleDoGC(&cap,task,true);
scheduleDoGC(&cap,task,true,false);
ASSERT(task->incall->tso == NULL);
releaseCapability(cap);
}
......@@ -2785,7 +2788,7 @@ performGC_(bool force_major)
// TODO: do we need to traceTask*() here?
waitForCapability(&cap,task);
scheduleDoGC(&cap,task,force_major);
scheduleDoGC(&cap,task,force_major,false);
releaseCapability(cap);
boundTaskExiting(task);
}
......
......@@ -33,7 +33,9 @@ static Time
end_init_cpu, end_init_elapsed,
start_exit_cpu, start_exit_elapsed,
start_exit_gc_elapsed, start_exit_gc_cpu,
end_exit_cpu, end_exit_elapsed;
end_exit_cpu, end_exit_elapsed,
start_nonmoving_gc_cpu, start_nonmoving_gc_elapsed,
start_nonmoving_gc_sync_elapsed;
#if defined(PROFILING)
static Time RP_start_time = 0, RP_tot_time = 0; // retainer prof user time
......@@ -84,7 +86,7 @@ Time stat_getElapsedTime(void)
double
mut_user_time_until( Time t )
{
return TimeToSecondsDbl(t - stats.gc_cpu_ns);
return TimeToSecondsDbl(t - stats.gc_cpu_ns - stats.nonmoving_gc_cpu_ns);
// heapCensus() time is included in GC_tot_cpu, so we don't need
// to subtract it here.
......@@ -125,6 +127,10 @@ initStats0(void)
end_init_cpu = 0;
end_init_elapsed = 0;
start_nonmoving_gc_cpu = 0;
start_nonmoving_gc_elapsed = 0;
start_nonmoving_gc_sync_elapsed = 0;
start_exit_cpu = 0;
start_exit_elapsed = 0;
start_exit_gc_cpu = 0;
......@@ -175,6 +181,11 @@ initStats0(void)
.gc_elapsed_ns = 0,
.cpu_ns = 0,
.elapsed_ns = 0,
.nonmoving_gc_cpu_ns = 0,
.nonmoving_gc_elapsed_ns = 0,
.nonmoving_gc_max_elapsed_ns = 0,
.nonmoving_gc_sync_elapsed_ns = 0,
.nonmoving_gc_sync_max_elapsed_ns = 0,
.gc = {
.gen = 0,
.threads = 0,
......@@ -189,7 +200,10 @@ initStats0(void)
.par_balanced_copied_bytes = 0,
.sync_elapsed_ns = 0,
.cpu_ns = 0,
.elapsed_ns = 0
.elapsed_ns = 0,
.nonmoving_gc_cpu_ns = 0,
.nonmoving_gc_elapsed_ns = 0,
.nonmoving_gc_sync_elapsed_ns = 0,
}
};
}
......@@ -274,6 +288,11 @@ stat_startExit(void)
start_exit_gc_cpu = stats.gc_cpu_ns;
}
/* -----------------------------------------------------------------------------
Nonmoving (concurrent) collector statistics
These measure the time taken by the concurrent mark & sweep collector, both its concurrent phase and its post-mark synchronisation pauses.
-------------------------------------------------------------------------- */
void
stat_endExit(void)
{
......@@ -286,10 +305,87 @@ stat_startGCSync (gc_thread *gct)
gct->gc_sync_start_elapsed = getProcessElapsedTime();
}
void
stat_startNonmovingGc ()
{
start_nonmoving_gc_cpu = getMyThreadCPUTime();
start_nonmoving_gc_elapsed = getProcessElapsedTime();
}
void
stat_endNonmovingGc ()
{
Time cpu = getMyThreadCPUTime();
Time elapsed = getProcessElapsedTime();
stats.gc.nonmoving_gc_elapsed_ns = elapsed - start_nonmoving_gc_elapsed;
stats.nonmoving_gc_elapsed_ns += stats.gc.nonmoving_gc_elapsed_ns;
stats.gc.nonmoving_gc_cpu_ns = cpu - start_nonmoving_gc_cpu;
stats.nonmoving_gc_cpu_ns += stats.gc.nonmoving_gc_cpu_ns;
stats.nonmoving_gc_max_elapsed_ns =
stg_max(stats.gc.nonmoving_gc_elapsed_ns,
stats.nonmoving_gc_max_elapsed_ns);
}
void
stat_startNonmovingGcSync ()
{
start_nonmoving_gc_sync_elapsed = getProcessElapsedTime();
traceConcSyncBegin();
}
void
stat_endNonmovingGcSync ()
{
Time end_elapsed = getProcessElapsedTime();
stats.gc.nonmoving_gc_sync_elapsed_ns = end_elapsed - start_nonmoving_gc_sync_elapsed;
stats.nonmoving_gc_sync_elapsed_ns += stats.gc.nonmoving_gc_sync_elapsed_ns;
stats.nonmoving_gc_sync_max_elapsed_ns =
stg_max(stats.gc.nonmoving_gc_sync_elapsed_ns,
stats.nonmoving_gc_sync_max_elapsed_ns);
traceConcSyncEnd();
}
/* -----------------------------------------------------------------------------
Called at the beginning of each GC
-------------------------------------------------------------------------- */
/*
* GC CPU time is collected on a per-gc_thread basis: The CPU time of each GC
* thread worker is recorded in its gc_thread at the beginning and end of
* scavenging. These are then summed over at the end of the GC.
*
* By contrast, the elapsed time is recorded only by the thread driving the GC.
*
* Mutator time is derived from the process's CPU time, subtracting out
* contributions from stop-the-world and concurrent GCs.
*/
void
stat_startGCWorker (Capability *cap STG_UNUSED, gc_thread *gct)
{
bool stats_enabled =
RtsFlags.GcFlags.giveStats != NO_GC_STATS ||
rtsConfig.gcDoneHook != NULL;
if (stats_enabled || RtsFlags.ProfFlags.doHeapProfile) {
gct->gc_start_cpu = getMyThreadCPUTime();
}
}
void
stat_endGCWorker (Capability *cap STG_UNUSED, gc_thread *gct)
{
bool stats_enabled =
RtsFlags.GcFlags.giveStats != NO_GC_STATS ||
rtsConfig.gcDoneHook != NULL;
if (stats_enabled || RtsFlags.ProfFlags.doHeapProfile) {
gct->gc_end_cpu = getMyThreadCPUTime();
}
}
void
stat_startGC (Capability *cap, gc_thread *gct)
{
......@@ -297,7 +393,15 @@ stat_startGC (Capability *cap, gc_thread *gct)
debugBelch("\007");
}
getProcessTimes(&gct->gc_start_cpu, &gct->gc_start_elapsed);
bool stats_enabled =
RtsFlags.GcFlags.giveStats != NO_GC_STATS ||
rtsConfig.gcDoneHook != NULL;
if (stats_enabled || RtsFlags.ProfFlags.doHeapProfile) {
gct->gc_start_cpu = getMyThreadCPUTime();
}
gct->gc_start_elapsed = getProcessElapsedTime();
// Post EVENT_GC_START with the same timestamp as used for stats
// (though converted from Time=StgInt64 to EventTimestamp=StgWord64).
......@@ -320,9 +424,9 @@ stat_startGC (Capability *cap, gc_thread *gct)
-------------------------------------------------------------------------- */
void
stat_endGC (Capability *cap, gc_thread *gct, W_ live, W_ copied, W_ slop,
uint32_t gen, uint32_t par_n_threads, W_ par_max_copied,
W_ par_balanced_copied, W_ gc_spin_spin, W_ gc_spin_yield,
stat_endGC (Capability *cap, gc_thread *initiating_gct, W_ live, W_ copied, W_ slop,
uint32_t gen, uint32_t par_n_threads, gc_thread **gc_threads,
W_ par_max_copied, W_ par_balanced_copied, W_ gc_spin_spin, W_ gc_spin_yield,
W_ mut_spin_spin, W_ mut_spin_yield, W_ any_work, W_ no_work,
W_ scav_find_work)
{
......@@ -364,9 +468,13 @@ stat_endGC (Capability *cap, gc_thread *gct, W_ live, W_ copied, W_ slop,
stats.elapsed_ns = current_elapsed - start_init_elapsed;
stats.gc.sync_elapsed_ns =
gct->gc_start_elapsed - gct->gc_sync_start_elapsed;
stats.gc.elapsed_ns = current_elapsed - gct->gc_start_elapsed;
stats.gc.cpu_ns = current_cpu - gct->gc_start_cpu;
initiating_gct->gc_start_elapsed - initiating_gct->gc_sync_start_elapsed;
stats.gc.elapsed_ns = current_elapsed - initiating_gct->gc_start_elapsed;
stats.gc.cpu_ns = 0;
for (unsigned int i=0; i < par_n_threads; i++) {
gc_thread *gct = gc_threads[i];
stats.gc.cpu_ns += gct->gc_end_cpu - gct->gc_start_cpu;
}
}
// -------------------------------------------------
// Update the cumulative stats
......@@ -473,8 +581,8 @@ stat_endGC (Capability *cap, gc_thread *gct, W_ live, W_ copied, W_ slop,
TimeToSecondsDbl(stats.gc.elapsed_ns),
TimeToSecondsDbl(stats.cpu_ns),
TimeToSecondsDbl(stats.elapsed_ns),
faults - gct->gc_start_faults,
gct->gc_start_faults - GC_end_faults,
faults - initiating_gct->gc_start_faults,
initiating_gct->gc_start_faults - GC_end_faults,
gen);
GC_end_faults = faults;
......@@ -706,6 +814,21 @@ static void report_summary(const RTSSummaryStats* sum)
TimeToSecondsDbl(gen_stats->avg_pause_ns),
TimeToSecondsDbl(gen_stats->max_pause_ns));
}
if (RtsFlags.GcFlags.useNonmoving) {
const int n_major_colls = sum->gc_summary_stats[RtsFlags.GcFlags.generations-1].collections;
statsPrintf(" Gen 1 %5d syncs"
", %6.3fs %3.4fs %3.4fs\n",
n_major_colls,
TimeToSecondsDbl(stats.nonmoving_gc_sync_elapsed_ns),
TimeToSecondsDbl(stats.nonmoving_gc_sync_elapsed_ns) / n_major_colls,
TimeToSecondsDbl(stats.nonmoving_gc_sync_max_elapsed_ns));
statsPrintf(" Gen 1 concurrent"
", %6.3fs %6.3fs %3.4fs %3.4fs\n",
TimeToSecondsDbl(stats.nonmoving_gc_cpu_ns),
TimeToSecondsDbl(stats.nonmoving_gc_elapsed_ns),
TimeToSecondsDbl(stats.nonmoving_gc_elapsed_ns) / n_major_colls,
TimeToSecondsDbl(stats.nonmoving_gc_max_elapsed_ns));
}
statsPrintf("\n");
......@@ -742,6 +865,12 @@ static void report_summary(const RTSSummaryStats* sum)
statsPrintf(" GC time %7.3fs (%7.3fs elapsed)\n",
TimeToSecondsDbl(stats.gc_cpu_ns),
TimeToSecondsDbl(stats.gc_elapsed_ns));
if (RtsFlags.GcFlags.useNonmoving) {
statsPrintf(
" CONC GC time %7.3fs (%7.3fs elapsed)\n",
TimeToSecondsDbl(stats.nonmoving_gc_cpu_ns),
TimeToSecondsDbl(stats.nonmoving_gc_elapsed_ns));
}
#if defined(PROFILING)
statsPrintf(" RP time %7.3fs (%7.3fs elapsed)\n",
......@@ -1100,7 +1229,8 @@ stat_exit (void)
stats.mutator_cpu_ns = start_exit_cpu
- end_init_cpu
- (stats.gc_cpu_ns - exit_gc_cpu);
- (stats.gc_cpu_ns - exit_gc_cpu)
- stats.nonmoving_gc_cpu_ns;
stats.mutator_elapsed_ns = start_exit_elapsed
- end_init_elapsed
- (stats.gc_elapsed_ns - exit_gc_elapsed);
......@@ -1510,7 +1640,8 @@ void getRTSStats( RTSStats *s )
s->cpu_ns = current_cpu - end_init_cpu;
s->elapsed_ns = current_elapsed - end_init_elapsed;
s->mutator_cpu_ns = current_cpu - end_init_cpu - stats.gc_cpu_ns;
s->mutator_cpu_ns = current_cpu - end_init_cpu - stats.gc_cpu_ns -
stats.nonmoving_gc_cpu_ns;
s->mutator_elapsed_ns = current_elapsed - end_init_elapsed -
stats.gc_elapsed_ns;
}
......
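Taken together, the Stats.c changes above treat concurrent-collector CPU time as non-mutator time while leaving mutator elapsed time alone, since the concurrent phase overlaps the mutator rather than pausing it. A condensed restatement of the accounting used in stat_exit and getRTSStats, as a self-contained sketch over the values involved (all times in nanoseconds):
/* Sketch only: the mutator CPU-time accounting this patch uses, written as a
 * pure helper over the quantities that appear above. */
static Time mutatorCpuNs(Time current_cpu, Time end_init_cpu,
                         Time gc_cpu_ns, Time nonmoving_gc_cpu_ns)
{
    /* Both stop-the-world GC CPU time and concurrent-collector CPU time
     * count as non-mutator time.  Elapsed time, by contrast, is not reduced
     * by nonmoving_gc_elapsed_ns. */
    return current_cpu - end_init_cpu - gc_cpu_ns - nonmoving_gc_cpu_ns;
}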
......@@ -30,13 +30,21 @@ void stat_endInit(void);
void stat_startGCSync(struct gc_thread_ *_gct);
void stat_startGC(Capability *cap, struct gc_thread_ *_gct);
void stat_endGC (Capability *cap, struct gc_thread_ *_gct, W_ live,
W_ copied, W_ slop, uint32_t gen, uint32_t n_gc_threads,
void stat_startGCWorker (Capability *cap, struct gc_thread_ *_gct);
void stat_endGCWorker (Capability *cap, struct gc_thread_ *_gct);
void stat_endGC (Capability *cap, struct gc_thread_ *initiating_gct, W_ live,
W_ copied, W_ slop, uint32_t gen,
uint32_t n_gc_threads, struct gc_thread_ **gc_threads,
W_ par_max_copied, W_ par_balanced_copied,
W_ gc_spin_spin, W_ gc_spin_yield, W_ mut_spin_spin,
W_ mut_spin_yield, W_ any_work, W_ no_work,
W_ scav_find_work);
void stat_startNonmovingGcSync(void);
void stat_endNonmovingGcSync(void);
void stat_startNonmovingGc (void);
void stat_endNonmovingGc (void);
#if defined(PROFILING)
void stat_startRP(void);
void stat_endRP(uint32_t,
......
......@@ -330,15 +330,16 @@ threadPaused(Capability *cap, StgTSO *tso)
}
#endif
if (RTS_UNLIKELY(nonmoving_write_barrier_enabled
&& ip_THUNK(INFO_PTR_TO_STRUCT(bh_info)))) {
// We are about to replace a thunk with a blackhole.
// Add the free variables of the closure we are about to
// overwrite to the update remembered set.
// N.B. We caught the WHITEHOLE case above.
updateRemembSetPushThunkEager(cap,
THUNK_INFO_PTR_TO_STRUCT(bh_info),
(StgThunk *) bh);
IF_NONMOVING_WRITE_BARRIER_ENABLED {
if (ip_THUNK(INFO_PTR_TO_STRUCT(bh_info))) {
// We are about to replace a thunk with a blackhole.
// Add the free variables of the closure we are about to
// overwrite to the update remembered set.
// N.B. We caught the WHITEHOLE case above.
updateRemembSetPushThunkEager(cap,
THUNK_INFO_PTR_TO_STRUCT(bh_info),
(StgThunk *) bh);
}
}
// The payload of the BLACKHOLE points to the TSO
......
......@@ -711,7 +711,7 @@ threadStackUnderflow (Capability *cap, StgTSO *tso)
barf("threadStackUnderflow: not enough space for return values");
}
if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
IF_NONMOVING_WRITE_BARRIER_ENABLED {
// ensure that values that we copy into the new stack are marked
// for the nonmoving collector. Note that these values won't
// necessarily form a full closure so we need to handle them
......