Commit e22bc0de authored by Simon Marlow's avatar Simon Marlow

Make clearNursery free

Summary:
clearNursery resets all the bd->free pointers of nursery blocks to
make the blocks empty.  In profiles we've seen clearNursery taking
significant amounts of time particularly with large -N and -A values.

This patch moves the work of clearNursery to the point at which we
actually need the new block, thereby introducing an invariant that
blocks to the right of the CurrentNursery pointer still need their
bd->free pointer reset.  This should make things faster overall,
because we don't need to clear blocks that we don't use.

Test Plan: validate

Reviewers: AndreasVoellmy, ezyang, austin

Subscribers: thomie, carter, ezyang, simonmar

Differential Revision: https://phabricator.haskell.org/D318
parent e159e08a
......@@ -66,6 +66,27 @@ typedef struct nursery_ {
memcount n_blocks;
} nursery;
// Nursery invariants:
//
// - cap->r.rNursery points to the nursery for this capability
//
// - cap->r.rCurrentNursery points to the block in the nursery that we are
// currently allocating into. While in Haskell the current heap pointer is
// in Hp, outside Haskell it is stored in cap->r.rCurrentNursery->free.
//
// - the blocks *after* cap->rCurrentNursery in the chain are empty
// (although their bd->free pointers have not been updated to
// reflect that)
//
// - the blocks *before* cap->rCurrentNursery have been used. Except
// for rCurrentAlloc.
//
// - cap->r.rCurrentAlloc is either NULL, or it points to a block in
// the nursery *before* cap->r.rCurrentNursery.
//
// See also Note [allocation accounting] to understand how total
// memory allocation is tracked.
typedef struct generation_ {
nat no; // generation number
......
......@@ -98,6 +98,10 @@ struct Capability_ {
// reset after we have executed the context switch.
int interrupt;
// Total words allocated by this cap since rts start
// See [Note allocation accounting] in Storage.c
W_ total_allocated;
#if defined(THREADED_RTS)
// Worker Tasks waiting in the wings. Singly-linked.
Task *spare_workers;
......@@ -131,8 +135,6 @@ struct Capability_ {
int io_manager_control_wr_fd;
#endif
#endif
// Total words allocated by this cap since rts start
W_ total_allocated;
// Per-capability STM-related data
StgTVarWatchQueue *free_tvar_watch_queues;
......
......@@ -97,7 +97,12 @@ stg_gc_noregs
&& bdescr_link(CurrentNursery) != NULL) {
HpAlloc = 0;
CLOSE_NURSERY();
Capability_total_allocated(MyCapability()) =
Capability_total_allocated(MyCapability()) +
BYTES_TO_WDS(bdescr_free(CurrentNursery) -
bdescr_start(CurrentNursery));
CurrentNursery = bdescr_link(CurrentNursery);
bdescr_free(CurrentNursery) = bdescr_start(CurrentNursery);
OPEN_NURSERY();
if (Capability_context_switch(MyCapability()) != 0 :: CInt ||
Capability_interrupt(MyCapability()) != 0 :: CInt ||
......
......@@ -1125,21 +1125,16 @@ scheduleHandleHeapOverflow( Capability *cap, StgTSO *t )
// don't do this if the nursery is (nearly) full, we'll GC first.
if (cap->r.rCurrentNursery->link != NULL ||
cap->r.rNursery->n_blocks == 1) { // paranoia to prevent infinite loop
// if the nursery has only one block.
cap->r.rNursery->n_blocks == 1) { // paranoia to prevent
// infinite loop if the
// nursery has only one
// block.
bd = allocGroup_lock(blocks);
cap->r.rNursery->n_blocks += blocks;
// link the new group into the list
bd->link = cap->r.rCurrentNursery;
bd->u.back = cap->r.rCurrentNursery->u.back;
if (cap->r.rCurrentNursery->u.back != NULL) {
cap->r.rCurrentNursery->u.back->link = bd;
} else {
cap->r.rNursery->blocks = bd;
}
cap->r.rCurrentNursery->u.back = bd;
// link the new group after CurrentNursery
dbl_link_insert_after(bd, cap->r.rCurrentNursery);
// initialise it as a nursery block. We initialise the
// step, gen_no, and flags field of *every* sub-block in
......@@ -1162,6 +1157,7 @@ scheduleHandleHeapOverflow( Capability *cap, StgTSO *t )
IF_DEBUG(sanity, checkNurserySanity(cap->r.rNursery));
// now update the nursery to point to the new block
finishedNurseryBlock(cap, cap->r.rCurrentNursery);
cap->r.rCurrentNursery = bd;
// we might be unlucky and have another thread get on the
......
......@@ -291,27 +291,8 @@ stat_startGC (Capability *cap, gc_thread *gct)
{
gct->gc_start_faults = getPageFaults();
}
}
/* -----------------------------------------------------------------------------
* Calculate the total allocated memory since the start of the
* program. Also emits events reporting the per-cap allocation
* totals.
* -------------------------------------------------------------------------- */
static StgWord
calcTotalAllocated(void)
{
W_ tot_alloc = 0;
W_ n;
for (n = 0; n < n_capabilities; n++) {
tot_alloc += capabilities[n]->total_allocated;
traceEventHeapAllocated(capabilities[n],
CAPSET_HEAP_DEFAULT,
capabilities[n]->total_allocated * sizeof(W_));
}
return tot_alloc;
updateNurseriesStats();
}
/* -----------------------------------------------------------------------------
......
......@@ -532,6 +532,7 @@ assignNurseriesToCapabilities (nat from, nat to)
for (i = from; i < to; i++) {
capabilities[i]->r.rCurrentNursery = nurseries[i].blocks;
newNurseryBlock(nurseries[i].blocks);
capabilities[i]->r.rCurrentAlloc = NULL;
}
}
......@@ -551,17 +552,16 @@ allocNurseries (nat from, nat to)
}
void
clearNursery (Capability *cap)
clearNursery (Capability *cap USED_IF_DEBUG)
{
#ifdef DEBUG
bdescr *bd;
for (bd = nurseries[cap->no].blocks; bd; bd = bd->link) {
cap->total_allocated += (W_)(bd->free - bd->start);
bd->free = bd->start;
ASSERT(bd->gen_no == 0);
ASSERT(bd->gen == g0);
IF_DEBUG(sanity,memset(bd->start, 0xaa, BLOCK_SIZE));
IF_DEBUG(sanity, memset(bd->start, 0xaa, BLOCK_SIZE));
}
#endif
}
void
......@@ -734,14 +734,16 @@ StgPtr allocate (Capability *cap, W_ n)
bd = cap->r.rCurrentAlloc;
if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) {
if (bd) finishedNurseryBlock(cap,bd);
// The CurrentAlloc block is full, we need to find another
// one. First, we try taking the next block from the
// nursery:
bd = cap->r.rCurrentNursery->link;
if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) {
// The nursery is empty, or the next block is already
// full: allocate a fresh block (we can't fail here).
if (bd == NULL) {
// The nursery is empty: allocate a fresh block (we can't
// fail here).
ACQUIRE_SM_LOCK;
bd = allocBlock();
cap->r.rNursery->n_blocks++;
......@@ -752,6 +754,7 @@ StgPtr allocate (Capability *cap, W_ n)
// pretty quickly now, because MAYBE_GC() will
// notice that CurrentNursery->link is NULL.
} else {
newNurseryBlock(bd);
// we have a block in the nursery: take it and put
// it at the *front* of the nursery list, and use it
// to allocate() from.
......@@ -846,9 +849,9 @@ allocatePinned (Capability *cap, W_ n)
// next GC cycle these objects will be moved to
// g0->large_objects.
if (bd != NULL) {
dbl_link_onto(bd, &cap->pinned_object_blocks);
// add it to the allocation stats when the block is full
cap->total_allocated += bd->free - bd->start;
finishedNurseryBlock(cap, bd);
dbl_link_onto(bd, &cap->pinned_object_blocks);
}
// We need to find another block. We could just allocate one,
......@@ -861,7 +864,7 @@ allocatePinned (Capability *cap, W_ n)
// an *empty* block, because we're about to mark it as
// BF_PINNED | BF_LARGE.
bd = cap->r.rCurrentNursery->link;
if (bd == NULL || bd->free != bd->start) { // must be empty!
if (bd == NULL) { // must be empty!
// The nursery is empty, or the next block is non-empty:
// allocate a fresh block (we can't fail here).
......@@ -878,6 +881,7 @@ allocatePinned (Capability *cap, W_ n)
RELEASE_SM_LOCK;
initBdescr(bd, g0, g0);
} else {
newNurseryBlock(bd);
// we have a block in the nursery: steal it
cap->r.rCurrentNursery->link = bd->link;
if (bd->link != NULL) {
......@@ -1001,21 +1005,57 @@ dirty_MVAR(StgRegTable *reg, StgClosure *p)
* -------------------------------------------------------------------------- */
/* -----------------------------------------------------------------------------
* updateNurseriesStats()
* [Note allocation accounting]
*
* Update the per-cap total_allocated numbers with an approximation of
* the amount of memory used in each cap's nursery.
* - When cap->r.rCurrentNusery moves to a new block in the nursery,
* we add the size of the used portion of the previous block to
* cap->total_allocated. (see finishedNurseryBlock())
*
* - When we start a GC, the allocated portion of CurrentNursery and
* CurrentAlloc are added to cap->total_allocated. (see
* updateNurseriesStats())
*
* Since this update is also performed by clearNurseries() then we only
* need this function for the final stats when the RTS is shutting down.
* -------------------------------------------------------------------------- */
void updateNurseriesStats (void)
//
// Calculate the total allocated memory since the start of the
// program. Also emits events reporting the per-cap allocation
// totals.
//
StgWord
calcTotalAllocated (void)
{
W_ tot_alloc = 0;
W_ n;
for (n = 0; n < n_capabilities; n++) {
tot_alloc += capabilities[n]->total_allocated;
traceEventHeapAllocated(capabilities[n],
CAPSET_HEAP_DEFAULT,
capabilities[n]->total_allocated * sizeof(W_));
}
return tot_alloc;
}
//
// Update the per-cap total_allocated numbers with an approximation of
// the amount of memory used in each cap's nursery.
//
void
updateNurseriesStats (void)
{
nat i;
bdescr *bd;
for (i = 0; i < n_capabilities; i++) {
capabilities[i]->total_allocated += countOccupied(nurseries[i].blocks);
// The current nursery block and the current allocate block have not
// yet been accounted for in cap->total_allocated, so we add them here.
bd = capabilities[i]->r.rCurrentNursery;
if (bd) finishedNurseryBlock(capabilities[i], bd);
bd = capabilities[i]->r.rCurrentAlloc;
if (bd) finishedNurseryBlock(capabilities[i], bd);
}
}
......
......@@ -87,11 +87,31 @@ void resizeNurseries ( W_ blocks );
void resizeNurseriesFixed ( W_ blocks );
W_ countNurseryBlocks ( void );
/* -----------------------------------------------------------------------------
Allocation accounting
See [Note allocation accounting] in Storage.c
-------------------------------------------------------------------------- */
//
// Called when we are finished allocating into a block; account for the amount
// allocated in cap->total_allocated.
//
INLINE_HEADER void finishedNurseryBlock (Capability *cap, bdescr *bd) {
cap->total_allocated += bd->free - bd->start;
}
INLINE_HEADER void newNurseryBlock (bdescr *bd) {
bd->free = bd->start;
}
void updateNurseriesStats (void);
StgWord calcTotalAllocated (void);
/* -----------------------------------------------------------------------------
Stats 'n' DEBUG stuff
-------------------------------------------------------------------------- */
void updateNurseriesStats (void);
W_ countLargeAllocated (void);
W_ countOccupied (bdescr *bd);
W_ calcNeeded (rtsBool force_major, W_ *blocks_needed);
......
......@@ -123,3 +123,4 @@ test('T9001', normal, compile_and_run, [''])
test('T9013', omit_ways(['ghci']), # ghci doesn't support unboxed tuples
compile_and_run, [''])
test('T9340', normal, compile_and_run, [''])
test('cgrun074', normal, compile_and_run, [''])
-- This test exercises the "large block" allocation code in the
-- scheduler, where the mutator requests more than a block's worth of
-- memory.
longlistof x = [
x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,
x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x
]
main = print (sum (concat (map longlistof [1..100])))
......@@ -349,6 +349,7 @@ wanteds = concat
,structField C "Capability" "context_switch"
,structField C "Capability" "interrupt"
,structField C "Capability" "sparks"
,structField C "Capability" "total_allocated"
,structField C "Capability" "weak_ptr_list_hd"
,structField C "Capability" "weak_ptr_list_tl"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment