diff --git a/compiler/GHC/StgToCmm/Bind.hs b/compiler/GHC/StgToCmm/Bind.hs index 44316cacb0919a4d3b897c85577013859b282aa2..9e192a0ac8a86631fd25b6dcb540fcb9e07127e4 100644 --- a/compiler/GHC/StgToCmm/Bind.hs +++ b/compiler/GHC/StgToCmm/Bind.hs @@ -631,6 +631,7 @@ emitBlackHoleCode node = do -- work with profiling. when eager_blackholing $ do + whenUpdRemSetEnabled dflags $ emitUpdRemSetPushThunk node emitStore (cmmOffsetW dflags node (fixedHdrSizeW dflags)) currentTSOExpr -- See Note [Heap memory barriers] in SMP.h. emitPrimCall [] MO_WriteBarrier [] diff --git a/compiler/GHC/StgToCmm/Prim.hs b/compiler/GHC/StgToCmm/Prim.hs index cdbc8d9fd9a4052d7415061f6961d4ef2ee4de32..155cdcbf80a988ff6c98785ddb21557d80b8bb91 100644 --- a/compiler/GHC/StgToCmm/Prim.hs +++ b/compiler/GHC/StgToCmm/Prim.hs @@ -42,6 +42,7 @@ import BlockId import MkGraph import StgSyn import Cmm +import Module ( rtsUnitId ) import Type ( Type, tyConAppTyCon ) import TyCon import CLabel @@ -339,14 +340,20 @@ dispatchPrimop dflags = \case emitAssign (CmmLocal res) (cmmLoadIndexW dflags mutv (fixedHdrSizeW dflags) (gcWord dflags)) WriteMutVarOp -> \[mutv, var] -> OpDest_AllDone $ \res@[] -> do + old_val <- CmmLocal <$> newTemp (cmmExprType dflags var) + emitAssign old_val (cmmLoadIndexW dflags mutv (fixedHdrSizeW dflags) (gcWord dflags)) + -- Without this write barrier, other CPUs may see this pointer before -- the writes for the closure it points to have occurred. + -- Note that this also must come after we read the old value to ensure + -- that the read of old_val comes before another core's write to the + -- MutVar's value. emitPrimCall res MO_WriteBarrier [] emitStore (cmmOffsetW dflags mutv (fixedHdrSizeW dflags)) var emitCCall [{-no results-}] (CmmLit (CmmLabel mkDirty_MUT_VAR_Label)) - [(baseExpr, AddrHint), (mutv,AddrHint)] + [(baseExpr, AddrHint), (mutv, AddrHint), (CmmReg old_val, AddrHint)] -- #define sizzeofByteArrayzh(r,a) \ -- r = ((StgArrBytes *)(a))->bytes @@ -1983,17 +1990,21 @@ doWritePtrArrayOp :: CmmExpr doWritePtrArrayOp addr idx val = do dflags <- getDynFlags let ty = cmmExprType dflags val + hdr_size = arrPtrsHdrSize dflags + -- Update remembered set for non-moving collector + whenUpdRemSetEnabled dflags + $ emitUpdRemSetPush (cmmLoadIndexOffExpr dflags hdr_size ty addr ty idx) -- This write barrier is to ensure that the heap writes to the object -- referred to by val have happened before we write val into the array. -- See #12469 for details. emitPrimCall [] MO_WriteBarrier [] - mkBasicIndexedWrite (arrPtrsHdrSize dflags) Nothing addr ty idx val + mkBasicIndexedWrite hdr_size Nothing addr ty idx val emit (setInfo addr (CmmLit (CmmLabel mkMAP_DIRTY_infoLabel))) - -- the write barrier. We must write a byte into the mark table: - -- bits8[a + header_size + StgMutArrPtrs_size(a) + x >> N] + -- the write barrier. We must write a byte into the mark table: + -- bits8[a + header_size + StgMutArrPtrs_size(a) + x >> N] emit $ mkStore ( cmmOffsetExpr dflags - (cmmOffsetExprW dflags (cmmOffsetB dflags addr (arrPtrsHdrSize dflags)) + (cmmOffsetExprW dflags (cmmOffsetB dflags addr hdr_size) (loadArrPtrsSize dflags addr)) (CmmMachOp (mo_wordUShr dflags) [idx, mkIntExpr dflags (mUT_ARR_PTRS_CARD_BITS dflags)]) @@ -2584,6 +2595,9 @@ emitCopyArray copy src0 src_off dst0 dst_off0 n = dst <- assignTempE dst0 dst_off <- assignTempE dst_off0 + -- Nonmoving collector write barrier + emitCopyUpdRemSetPush dflags (arrPtrsHdrSizeW dflags) dst dst_off n + -- Set the dirty bit in the header. 
emit (setInfo dst (CmmLit (CmmLabel mkMAP_DIRTY_infoLabel))) @@ -2646,6 +2660,9 @@ emitCopySmallArray copy src0 src_off dst0 dst_off n = src <- assignTempE src0 dst <- assignTempE dst0 + -- Nonmoving collector write barrier + emitCopyUpdRemSetPush dflags (smallArrPtrsHdrSizeW dflags) dst dst_off n + -- Set the dirty bit in the header. emit (setInfo dst (CmmLit (CmmLabel mkSMAP_DIRTY_infoLabel))) @@ -2774,6 +2791,12 @@ doWriteSmallPtrArrayOp :: CmmExpr doWriteSmallPtrArrayOp addr idx val = do dflags <- getDynFlags let ty = cmmExprType dflags val + + -- Update remembered set for non-moving collector + tmp <- newTemp ty + mkBasicIndexedRead (smallArrPtrsHdrSize dflags) Nothing ty tmp addr ty idx + whenUpdRemSetEnabled dflags $ emitUpdRemSetPush (CmmReg (CmmLocal tmp)) + emitPrimCall [] MO_WriteBarrier [] -- #12469 mkBasicIndexedWrite (smallArrPtrsHdrSize dflags) Nothing addr ty idx val emit (setInfo addr (CmmLit (CmmLabel mkSMAP_DIRTY_infoLabel))) @@ -2953,3 +2976,31 @@ emitCtzCall res x width = do [ res ] (MO_Ctz width) [ x ] + +--------------------------------------------------------------------------- +-- Pushing to the update remembered set +--------------------------------------------------------------------------- + +-- | Push a range of pointer-array elements that are about to be copied over to +-- the update remembered set. +emitCopyUpdRemSetPush :: DynFlags + -> WordOff -- ^ array header size + -> CmmExpr -- ^ destination array + -> CmmExpr -- ^ offset in destination array (in words) + -> Int -- ^ number of elements to copy + -> FCode () +emitCopyUpdRemSetPush _dflags _hdr_size _dst _dst_off 0 = return () +emitCopyUpdRemSetPush dflags hdr_size dst dst_off n = + whenUpdRemSetEnabled dflags $ do + updfr_off <- getUpdFrameOff + graph <- mkCall lbl (NativeNodeCall,NativeReturn) [] args updfr_off [] + emit graph + where + lbl = mkLblExpr $ mkPrimCallLabel + $ PrimCall (fsLit "stg_copyArray_barrier") rtsUnitId + args = + [ mkIntExpr dflags hdr_size + , dst + , dst_off + , mkIntExpr dflags n + ] diff --git a/compiler/GHC/StgToCmm/Utils.hs b/compiler/GHC/StgToCmm/Utils.hs index 30e37bb930b4f1a1603db74bf25c8d04c9a1f2b9..0b3a8d8b080f3e4cfd8523aa893ea8bc774ec6df 100644 --- a/compiler/GHC/StgToCmm/Utils.hs +++ b/compiler/GHC/StgToCmm/Utils.hs @@ -39,6 +39,11 @@ module GHC.StgToCmm.Utils ( mkWordCLit, newStringCLit, newByteStringCLit, blankWord, + + -- * Update remembered set operations + whenUpdRemSetEnabled, + emitUpdRemSetPush, + emitUpdRemSetPushThunk, ) where #include "HsVersions.h" @@ -576,3 +581,40 @@ assignTemp' e let reg = CmmLocal lreg emitAssign reg e return (CmmReg reg) + + +--------------------------------------------------------------------------- +-- Pushing to the update remembered set +--------------------------------------------------------------------------- + +whenUpdRemSetEnabled :: DynFlags -> FCode a -> FCode () +whenUpdRemSetEnabled dflags code = do + do_it <- getCode code + the_if <- mkCmmIfThenElse' is_enabled do_it mkNop (Just False) + emit the_if + where + enabled = CmmLoad (CmmLit $ CmmLabel mkNonmovingWriteBarrierEnabledLabel) (bWord dflags) + zero = zeroExpr dflags + is_enabled = cmmNeWord dflags enabled zero + +-- | Emit code to add an entry to a now-overwritten pointer to the update +-- remembered set. 
+emitUpdRemSetPush :: CmmExpr -- ^ value of pointer which was overwritten + -> FCode () +emitUpdRemSetPush ptr = do + emitRtsCall + rtsUnitId + (fsLit "updateRemembSetPushClosure_") + [(CmmReg (CmmGlobal BaseReg), AddrHint), + (ptr, AddrHint)] + False + +emitUpdRemSetPushThunk :: CmmExpr -- ^ the thunk + -> FCode () +emitUpdRemSetPushThunk ptr = do + emitRtsCall + rtsUnitId + (fsLit "updateRemembSetPushThunk_") + [(CmmReg (CmmGlobal BaseReg), AddrHint), + (ptr, AddrHint)] + False diff --git a/compiler/cmm/CLabel.hs b/compiler/cmm/CLabel.hs index 0c3dae8001bdf2c93c02d25bfada3831761f81de..66e39f0d69d75c1d4d04d6ccf6ee670acc4ccc6e 100644 --- a/compiler/cmm/CLabel.hs +++ b/compiler/cmm/CLabel.hs @@ -40,6 +40,7 @@ module CLabel ( mkAsmTempDieLabel, mkDirty_MUT_VAR_Label, + mkNonmovingWriteBarrierEnabledLabel, mkUpdInfoLabel, mkBHUpdInfoLabel, mkIndStaticInfoLabel, @@ -484,7 +485,9 @@ mkBlockInfoTableLabel name c = IdLabel name c BlockInfoTable -- See Note [Proc-point local block entry-point]. -- Constructing Cmm Labels -mkDirty_MUT_VAR_Label, mkUpdInfoLabel, +mkDirty_MUT_VAR_Label, + mkNonmovingWriteBarrierEnabledLabel, + mkUpdInfoLabel, mkBHUpdInfoLabel, mkIndStaticInfoLabel, mkMainCapabilityLabel, mkMAP_FROZEN_CLEAN_infoLabel, mkMAP_FROZEN_DIRTY_infoLabel, mkMAP_DIRTY_infoLabel, @@ -494,6 +497,8 @@ mkDirty_MUT_VAR_Label, mkUpdInfoLabel, mkSMAP_FROZEN_CLEAN_infoLabel, mkSMAP_FROZEN_DIRTY_infoLabel, mkSMAP_DIRTY_infoLabel, mkBadAlignmentLabel :: CLabel mkDirty_MUT_VAR_Label = mkForeignLabel (fsLit "dirty_MUT_VAR") Nothing ForeignLabelInExternalPackage IsFunction +mkNonmovingWriteBarrierEnabledLabel + = CmmLabel rtsUnitId (fsLit "nonmoving_write_barrier_enabled") CmmData mkUpdInfoLabel = CmmLabel rtsUnitId (fsLit "stg_upd_frame") CmmInfo mkBHUpdInfoLabel = CmmLabel rtsUnitId (fsLit "stg_bh_upd_frame" ) CmmInfo mkIndStaticInfoLabel = CmmLabel rtsUnitId (fsLit "stg_IND_STATIC") CmmInfo diff --git a/docs/users_guide/runtime_control.rst b/docs/users_guide/runtime_control.rst index 665c8c08e062bf096fa9235ed47e456832346782..add0b6c53724f0492147cf162383d4cfe868e576 100644 --- a/docs/users_guide/runtime_control.rst +++ b/docs/users_guide/runtime_control.rst @@ -313,6 +313,24 @@ collection. Hopefully, you won't need any of these in normal operation, but there are several things that can be tweaked for maximum performance. +.. rts-flag:: -xn + + :default: off + :since: 8.8.1 + + .. index:: + single: concurrent mark and sweep + + Enable the concurrent mark-and-sweep garbage collector for old generation + collectors. Typically GHC uses a stop-the-world copying garbage collector + for all generations. This can cause long pauses in execution during major + garbage collections. :rts-flag:`-xn` enables the use of a concurrent + mark-and-sweep garbage collector for oldest generation collections. + Under this collection strategy oldest-generation garbage collection + can proceed concurrently with mutation. + + Note that :rts-flag:`-xn` cannot be used with ``-G1`` nor :rts-flag:`-c`. + .. 
rts-flag:: -A ⟨size⟩ :default: 1MB diff --git a/includes/Cmm.h b/includes/Cmm.h index 21d5da310c341855744ead6f1358418d473ed03e..546e81e8f6b7088afcf0a87c401b55c138282c48 100644 --- a/includes/Cmm.h +++ b/includes/Cmm.h @@ -842,6 +842,10 @@ __gen = TO_W_(bdescr_gen_no(__bd)); \ if (__gen > 0) { recordMutableCap(__p, __gen); } +/* ----------------------------------------------------------------------------- + Update remembered set write barrier + -------------------------------------------------------------------------- */ + /* ----------------------------------------------------------------------------- Arrays -------------------------------------------------------------------------- */ @@ -944,3 +948,25 @@ prim %memcpy(dst_p, src_p, n * SIZEOF_W, SIZEOF_W); \ \ return (dst); + + +// +// Nonmoving write barrier helpers +// +// See Note [Update remembered set] in NonMovingMark.c. + +#if defined(THREADED_RTS) +#define IF_NONMOVING_WRITE_BARRIER_ENABLED \ + if (W_[nonmoving_write_barrier_enabled] != 0) (likely: False) +#else +// A similar measure is also taken in rts/NonMoving.h, but that isn't visible from C-- +#define IF_NONMOVING_WRITE_BARRIER_ENABLED \ + if (0) +#define nonmoving_write_barrier_enabled 0 +#endif + +// A useful helper for pushing a pointer to the update remembered set. +#define updateRemembSetPushPtr(p) \ + IF_NONMOVING_WRITE_BARRIER_ENABLED { \ + ccall updateRemembSetPushClosure_(BaseReg "ptr", p "ptr"); \ + } diff --git a/includes/Rts.h b/includes/Rts.h index 256a3e586cf48cb9711a6078fe6a8b2bb1526a62..d0f53710076e4c9321ffe54978fb6007b631a4a0 100644 --- a/includes/Rts.h +++ b/includes/Rts.h @@ -80,6 +80,10 @@ extern "C" { #define RTS_UNREACHABLE abort() #endif +/* Prefetch primitives */ +#define prefetchForRead(ptr) __builtin_prefetch(ptr, 0) +#define prefetchForWrite(ptr) __builtin_prefetch(ptr, 1) + /* Fix for mingw stat problem (done here so it's early enough) */ #if defined(mingw32_HOST_OS) #define __MSVCRT__ 1 @@ -203,6 +207,7 @@ void _assertFail(const char *filename, unsigned int linenum) #include "rts/storage/ClosureMacros.h" #include "rts/storage/MBlock.h" #include "rts/storage/GC.h" +#include "rts/NonMoving.h" /* Other RTS external APIs */ #include "rts/Parallel.h" @@ -287,26 +292,27 @@ TICK_VAR(2) #define IF_RTSFLAGS(c,s) if (RtsFlags.c) { s; } doNothing() #if defined(DEBUG) +/* See Note [RtsFlags is a pointer in STG code] */ #if IN_STG_CODE #define IF_DEBUG(c,s) if (RtsFlags[0].DebugFlags.c) { s; } doNothing() #else #define IF_DEBUG(c,s) if (RtsFlags.DebugFlags.c) { s; } doNothing() -#endif +#endif /* IN_STG_CODE */ #else #define IF_DEBUG(c,s) doNothing() -#endif +#endif /* DEBUG */ #if defined(DEBUG) #define DEBUG_ONLY(s) s #else #define DEBUG_ONLY(s) doNothing() -#endif +#endif /* DEBUG */ #if defined(DEBUG) #define DEBUG_IS_ON 1 #else #define DEBUG_IS_ON 0 -#endif +#endif /* DEBUG */ /* ----------------------------------------------------------------------------- Useful macros and inline functions diff --git a/includes/Stg.h b/includes/Stg.h index 73de97055f29a68cea88144c6333b153154c4081..46f71c02417888313ada428bc4e90e68ffaf1a93 100644 --- a/includes/Stg.h +++ b/includes/Stg.h @@ -597,3 +597,4 @@ typedef union { c; \ }) #endif + diff --git a/includes/rts/EventLogFormat.h b/includes/rts/EventLogFormat.h index 7b989b014b7d338841e079bcd7e03736b0757696..d5ed01a8649c8f3e0b911d02b98e799c5b262fb8 100644 --- a/includes/rts/EventLogFormat.h +++ b/includes/rts/EventLogFormat.h @@ -185,12 +185,21 @@ #define EVENT_USER_BINARY_MSG 181 +#define EVENT_CONC_MARK_BEGIN 200 
+#define EVENT_CONC_MARK_END 201 +#define EVENT_CONC_SYNC_BEGIN 202 +#define EVENT_CONC_SYNC_END 203 +#define EVENT_CONC_SWEEP_BEGIN 204 +#define EVENT_CONC_SWEEP_END 205 +#define EVENT_CONC_UPD_REM_SET_FLUSH 206 +#define EVENT_NONMOVING_HEAP_CENSUS 207 + /* * The highest event code +1 that ghc itself emits. Note that some event * ranges higher than this are reserved but not currently emitted by ghc. * This must match the size of the EventDesc[] array in EventLog.c */ -#define NUM_GHC_EVENT_TAGS 182 +#define NUM_GHC_EVENT_TAGS 208 #if 0 /* DEPRECATED EVENTS: */ /* we don't actually need to record the thread, it's implicit */ diff --git a/includes/rts/Flags.h b/includes/rts/Flags.h index b3caf13c1f44ba1416108de6fa1d6639cdabf1e4..f27ce23b0b88f385205b7cfca81fa15fcfe3f71e 100644 --- a/includes/rts/Flags.h +++ b/includes/rts/Flags.h @@ -52,6 +52,9 @@ typedef struct _GC_FLAGS { double oldGenFactor; double pcFreeHeap; + bool useNonmoving; // default = false + bool nonmovingSelectorOpt; // Do selector optimization in the + // non-moving heap, default = false uint32_t generations; bool squeezeUpdFrames; @@ -95,6 +98,7 @@ typedef struct _DEBUG_FLAGS { bool weak; /* 'w' */ bool gccafs; /* 'G' */ bool gc; /* 'g' */ + bool nonmoving_gc; /* 'n' */ bool block_alloc; /* 'b' */ bool sanity; /* 'S' warning: might be expensive! */ bool zero_on_gc; /* 'Z' */ @@ -168,6 +172,7 @@ typedef struct _TRACE_FLAGS { bool timestamp; /* show timestamp in stderr output */ bool scheduler; /* trace scheduler events */ bool gc; /* trace GC events */ + bool nonmoving_gc; /* trace nonmoving GC events */ bool sparks_sampled; /* trace spark events by a sampled method */ bool sparks_full; /* trace spark events 100% accurately */ bool user; /* trace user events (emitted from Haskell code) */ @@ -268,7 +273,11 @@ typedef struct _RTS_FLAGS { #if defined(COMPILING_RTS_MAIN) extern DLLIMPORT RTS_FLAGS RtsFlags; #elif IN_STG_CODE -/* Hack because the C code generator can't generate '&label'. */ +/* Note [RtsFlags is a pointer in STG code] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * When compiling with IN_STG_CODE the RtsFlags symbol is defined as a pointer. + * This is necessary because the C code generator can't generate '&label'. + */ extern RTS_FLAGS RtsFlags[]; #else extern RTS_FLAGS RtsFlags; diff --git a/includes/rts/NonMoving.h b/includes/rts/NonMoving.h new file mode 100644 index 0000000000000000000000000000000000000000..314c582a1e158f2703cc505f5d6ea407dec693a1 --- /dev/null +++ b/includes/rts/NonMoving.h @@ -0,0 +1,43 @@ +/* ----------------------------------------------------------------------------- + * + * (c) The GHC Team, 2018-2019 + * + * Non-moving garbage collector + * + * Do not #include this file directly: #include "Rts.h" instead. + * + * To understand the structure of the RTS headers, see the wiki: + * http://ghc.haskell.org/trac/ghc/wiki/Commentary/SourceTree/Includes + * + * -------------------------------------------------------------------------- */ + +#pragma once + +// Forward declaration for Stg.h +struct StgClosure_; +struct StgThunk_; +struct Capability_; + +/* This is called by the code generator */ +extern DLL_IMPORT_RTS +void updateRemembSetPushClosure_(StgRegTable *reg, struct StgClosure_ *p); + +extern DLL_IMPORT_RTS +void updateRemembSetPushThunk_(StgRegTable *reg, struct StgThunk_ *p); + +// Forward declaration for unregisterised backend. 
+EF_(stg_copyArray_barrier);
+
+// Note that RTS code should not condition on this directly but rather
+// use the IF_NONMOVING_WRITE_BARRIER_ENABLED macro to ensure that
+// the barrier is eliminated in the non-threaded RTS.
+extern StgWord DLL_IMPORT_DATA_VAR(nonmoving_write_barrier_enabled);
+
+// A similar macro is defined in includes/Cmm.h for C-- code.
+#if defined(THREADED_RTS)
+#define IF_NONMOVING_WRITE_BARRIER_ENABLED \
+    if (RTS_UNLIKELY(nonmoving_write_barrier_enabled))
+#else
+#define IF_NONMOVING_WRITE_BARRIER_ENABLED \
+    if (0)
+#endif
diff --git a/includes/rts/storage/Block.h b/includes/rts/storage/Block.h
index ecd6bf5dd8f62cdcde4f17e2f2d05b63f5709b6b..4afc3689cb5591ba22eb9b31b8a46dcf6268578b 100644
--- a/includes/rts/storage/Block.h
+++ b/includes/rts/storage/Block.h
@@ -88,15 +88,23 @@ typedef struct bdescr_ {
 
     StgPtr start;              // [READ ONLY] start addr of memory
 
-    StgPtr free;               // First free byte of memory.
-                               // allocGroup() sets this to the value of start.
-                               // NB. during use this value should lie
-                               // between start and start + blocks *
-                               // BLOCK_SIZE.  Values outside this
-                               // range are reserved for use by the
-                               // block allocator.  In particular, the
-                               // value (StgPtr)(-1) is used to
-                               // indicate that a block is unallocated.
+    union {
+        StgPtr free;               // First free byte of memory.
+                                   // allocGroup() sets this to the value of start.
+                                   // NB. during use this value should lie
+                                   // between start and start + blocks *
+                                   // BLOCK_SIZE.  Values outside this
+                                   // range are reserved for use by the
+                                   // block allocator.  In particular, the
+                                   // value (StgPtr)(-1) is used to
+                                   // indicate that a block is unallocated.
+                                   //
+                                   // Unused by the non-moving allocator.
+        struct NonmovingSegmentInfo {
+            StgWord8 log_block_size;
+            StgWord16 next_free_snap;
+        } nonmoving_segment;
+    };
 
     struct bdescr_ *link;      // used for chaining blocks together
 
@@ -141,7 +149,8 @@ typedef struct bdescr_ {
 #define BF_LARGE     2
 /* Block is pinned */
 #define BF_PINNED    4
-/* Block is to be marked, not copied */
+/* Block is to be marked, not copied. Also used for marked large objects in
+ * non-moving heap. */
 #define BF_MARKED    8
 /* Block is executable */
 #define BF_EXEC      32
@@ -153,6 +162,12 @@ typedef struct bdescr_ {
 #define BF_SWEPT     256
 /* Block is part of a Compact */
 #define BF_COMPACT   512
+/* A non-moving allocator segment (see NonMoving.c) */
+#define BF_NONMOVING 1024
+/* A large object which has been moved off of oldest_gen->large_objects and
+ * onto nonmoving_large_objects. The mark phase ignores objects which aren't
+ * so-flagged */
+#define BF_NONMOVING_SWEEPING 2048
 
 /* Maximum flag value (do not define anything higher than this!) */
 #define BF_FLAG_MAX  (1 << 15)
@@ -290,6 +305,13 @@ EXTERN_INLINE bdescr* allocBlock(void)
 
 bdescr *allocGroupOnNode(uint32_t node, W_ n);
 
+// Allocate n blocks, aligned at n-block boundary. The returned bdescr will
+// have this invariant
+//
+//     bdescr->start % BLOCK_SIZE*n == 0
+//
+bdescr *allocAlignedGroupOnNode(uint32_t node, W_ n);
+
 EXTERN_INLINE bdescr* allocBlockOnNode(uint32_t node);
 EXTERN_INLINE bdescr* allocBlockOnNode(uint32_t node)
 {
diff --git a/includes/rts/storage/ClosureMacros.h b/includes/rts/storage/ClosureMacros.h
index a3873cc49dadcf0b4e3347fcac9ae000ef084084..2af50863d0d7f4393f53baf8572936b1b883f171 100644
--- a/includes/rts/storage/ClosureMacros.h
+++ b/includes/rts/storage/ClosureMacros.h
@@ -107,6 +107,20 @@ INLINE_HEADER const StgConInfoTable *get_con_itbl(const StgClosure *c)
     return CON_INFO_PTR_TO_STRUCT((c)->header.info);
 }
 
+/* Used when we expect another thread to be mutating the info table pointer of
+ * a closure (e.g. when busy-waiting on a WHITEHOLE).
+ */
+INLINE_HEADER const StgInfoTable *get_volatile_itbl(StgClosure *c) {
+    // The volatile here is important to ensure that the compiler does not
+    // optimise away multiple loads, e.g. in a busy-wait loop. Note that
+    // we can't use VOLATILE_LOAD here as the casts result in strict aliasing
+    // rule violations and this header may be compiled outside of the RTS
+    // (where we use -fno-strict-aliasing).
+    StgInfoTable * *volatile p = (StgInfoTable * *volatile) &c->header.info;
+    return INFO_PTR_TO_STRUCT(*p);
+}
+
+
 INLINE_HEADER StgHalfWord GET_TAG(const StgClosure *con)
 {
     return get_itbl(con)->srt;
diff --git a/includes/rts/storage/Closures.h b/includes/rts/storage/Closures.h
index 6088fc8a1012b5c42f8ced778f2b30949d2081ad..b2b5eda407df07a5278fbbcc031f52b2589e1e31 100644
--- a/includes/rts/storage/Closures.h
+++ b/includes/rts/storage/Closures.h
@@ -94,7 +94,7 @@ typedef struct StgClosure_ {
     struct StgClosure_ *payload[];
 } *StgClosurePtr; // StgClosure defined in rts/Types.h
 
-typedef struct {
+typedef struct StgThunk_ {
     StgThunkHeader  header;
     struct StgClosure_ *payload[];
 } StgThunk;
diff --git a/includes/rts/storage/GC.h b/includes/rts/storage/GC.h
index 1571975852c0220fe274b8c9f4cd52a4e252ca95..7931433019869f2fe13e884441dd030a348c421f 100644
--- a/includes/rts/storage/GC.h
+++ b/includes/rts/storage/GC.h
@@ -234,15 +234,23 @@ void setKeepCAFs (void);
    and is put on the mutable list.
    -------------------------------------------------------------------------- */
 
-void dirty_MUT_VAR(StgRegTable *reg, StgClosure *p);
+void dirty_MUT_VAR(StgRegTable *reg, StgMutVar *mv, StgClosure *old);
 
 /* set to disable CAF garbage collection in GHCi. */
 /* (needed when dynamic libraries are used).
*/ extern bool keepCAFs; +#include "rts/Flags.h" + INLINE_HEADER void initBdescr(bdescr *bd, generation *gen, generation *dest) { bd->gen = gen; bd->gen_no = gen->no; bd->dest_no = dest->no; + +#if !IN_STG_CODE + /* See Note [RtsFlags is a pointer in STG code] */ + ASSERT(gen->no < RtsFlags.GcFlags.generations); + ASSERT(dest->no < RtsFlags.GcFlags.generations); +#endif } diff --git a/includes/rts/storage/InfoTables.h b/includes/rts/storage/InfoTables.h index 4de5207b4df2ecbcd2aee395fe184cbf621a1f1b..b97e12982b36d19ae32c3eed5a93c557a5d4fc8f 100644 --- a/includes/rts/storage/InfoTables.h +++ b/includes/rts/storage/InfoTables.h @@ -355,7 +355,7 @@ typedef struct StgConInfoTable_ { */ #if defined(TABLES_NEXT_TO_CODE) #define GET_CON_DESC(info) \ - ((const char *)((StgWord)((info)+1) + (info->con_desc))) + ((const char *)((StgWord)((info)+1) + ((info)->con_desc))) #else #define GET_CON_DESC(info) ((const char *)(info)->con_desc) #endif diff --git a/includes/rts/storage/TSO.h b/includes/rts/storage/TSO.h index 93018581fdca46d143b9e6d74e44867631254226..d706282796b321b2a684d58f0cb02cb299471133 100644 --- a/includes/rts/storage/TSO.h +++ b/includes/rts/storage/TSO.h @@ -185,10 +185,66 @@ typedef struct StgTSO_ { } *StgTSOPtr; // StgTSO defined in rts/Types.h +/* Note [StgStack dirtiness flags and concurrent marking] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * Without concurrent collection by the nonmoving collector the stack dirtiness story + * is quite simple: The stack is either STACK_DIRTY (meaning it has been added to mut_list) + * or not. + * + * However, things are considerably more complicated with concurrent collection + * (namely, when nonmoving_write_barrier_enabled is set): In addition to adding + * the stack to mut_list and flagging it as STACK_DIRTY, we also must ensure + * that stacks are marked in accordance with the nonmoving collector's snapshot + * invariant. This is: every stack alive at the time the snapshot is taken must + * be marked at some point after the moment the snapshot is taken and before it + * is mutated or the commencement of the sweep phase. + * + * This marking may be done by the concurrent mark phase (in the case of a + * thread that never runs during the concurrent mark) or by the mutator when + * dirtying the stack. However, it is unsafe for the concurrent collector to + * traverse the stack while it is under mutation. Consequently, the following + * handshake is obeyed by the mutator's write barrier and the concurrent mark to + * ensure this doesn't happen: + * + * 1. The entity seeking to mark first checks that the stack lives in the nonmoving + * generation; if not then the stack was not alive at the time the snapshot + * was taken and therefore we need not mark it. + * + * 2. The entity seeking to mark checks the stack's mark bit. If it is set then + * no mark is necessary. + * + * 3. The entity seeking to mark tries to lock the stack for marking by + * atomically setting its `marking` field to the current non-moving mark + * epoch: + * + * a. If the mutator finds the concurrent collector has already locked the + * stack then it waits until it is finished (indicated by the mark bit + * being set) before proceeding with execution. + * + * b. If the concurrent collector finds that the mutator has locked the stack + * then it moves on, leaving the mutator to mark it. There is no need to wait; + * the mark is guaranteed to finish before sweep due to the post-mark + * synchronization with mutators. + * + * c. 
Whoever succeeds in locking the stack is responsible for marking it and + * setting the stack's mark bit (either the BF_MARKED bit for large objects + * or otherwise its bit in its segment's mark bitmap). + * + * To ensure that mutation does not proceed until the stack is fully marked the + * mark phase must not set the mark bit until it has finished tracing. + * + */ + +#define STACK_DIRTY 1 +// used by sanity checker to verify that all dirty stacks are on the mutable list +#define STACK_SANE 64 + typedef struct StgStack_ { StgHeader header; StgWord32 stack_size; // stack size in *words* - StgWord32 dirty; // non-zero => dirty + StgWord8 dirty; // non-zero => dirty + StgWord8 marking; // non-zero => someone is currently marking the stack StgPtr sp; // current stack pointer StgWord stack[]; } StgStack; diff --git a/includes/stg/MiscClosures.h b/includes/stg/MiscClosures.h index 217b1bc89d85ce78f3bc947260af791af0b1d430..7a2ac2ef5101189805cdcca11b32b284e0c7d144 100644 --- a/includes/stg/MiscClosures.h +++ b/includes/stg/MiscClosures.h @@ -543,4 +543,11 @@ void * pushCostCentre (void *ccs, void *cc); // Capability.c extern unsigned int n_capabilities; +/* ----------------------------------------------------------------------------- + Nonmoving GC write barrier + -------------------------------------------------------------------------- */ + +#include <rts/NonMoving.h> + + #endif diff --git a/includes/stg/SMP.h b/includes/stg/SMP.h index 2d6a220a9e283138b9ba01b46e1fa05b33be9db7..60f084be9a964a942d4020fed267de601b2606ee 100644 --- a/includes/stg/SMP.h +++ b/includes/stg/SMP.h @@ -49,6 +49,7 @@ EXTERN_INLINE StgWord xchg(StgPtr p, StgWord w); * } */ EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n); +EXTERN_INLINE StgWord8 cas_word8(StgWord8 *volatile p, StgWord8 o, StgWord8 n); /* * Atomic addition by the provided quantity @@ -283,6 +284,12 @@ cas(StgVolatilePtr p, StgWord o, StgWord n) return __sync_val_compare_and_swap(p, o, n); } +EXTERN_INLINE StgWord8 +cas_word8(StgWord8 *volatile p, StgWord8 o, StgWord8 n) +{ + return __sync_val_compare_and_swap(p, o, n); +} + // RRN: Generalized to arbitrary increments to enable fetch-and-add in // Haskell code (fetchAddIntArray#). // PT: add-and-fetch, returns new value @@ -434,6 +441,18 @@ cas(StgVolatilePtr p, StgWord o, StgWord n) return result; } +EXTERN_INLINE StgWord8 cas_word8(StgWord8 *volatile p, StgWord8 o, StgWord8 n); +EXTERN_INLINE StgWord8 +cas_word8(StgWord8 *volatile p, StgWord8 o, StgWord8 n) +{ + StgWord8 result; + result = *p; + if (result == o) { + *p = n; + } + return result; +} + EXTERN_INLINE StgWord atomic_inc(StgVolatilePtr p, StgWord incr); EXTERN_INLINE StgWord atomic_inc(StgVolatilePtr p, StgWord incr) diff --git a/includes/stg/Types.h b/includes/stg/Types.h index 08ba58c799ee4e95261a24062cda1956b32c4340..8ce9e3c156d0ebb98439dc751b54deecb4baa178 100644 --- a/includes/stg/Types.h +++ b/includes/stg/Types.h @@ -192,3 +192,10 @@ typedef StgWord8* StgByteArray; typedef void *(*(*StgFunPtr)(void))(void); typedef StgFunPtr StgFun(void); + +// Forward declarations for the unregisterised backend, which +// only depends upon Stg.h and not the entirety of Rts.h, which +// is where these are defined. 
+struct StgClosure_; +struct StgThunk_; +struct Capability_; diff --git a/libraries/base/GHC/RTS/Flags.hsc b/libraries/base/GHC/RTS/Flags.hsc index 249bcd5a98e2df72fb46fdce6cfb78148888bb52..913344c1660db8a1b645df937bf9725784fbddd0 100644 --- a/libraries/base/GHC/RTS/Flags.hsc +++ b/libraries/base/GHC/RTS/Flags.hsc @@ -150,21 +150,22 @@ data MiscFlags = MiscFlags -- -- @since 4.8.0.0 data DebugFlags = DebugFlags - { scheduler :: Bool -- ^ @s@ - , interpreter :: Bool -- ^ @i@ - , weak :: Bool -- ^ @w@ - , gccafs :: Bool -- ^ @G@ - , gc :: Bool -- ^ @g@ - , block_alloc :: Bool -- ^ @b@ - , sanity :: Bool -- ^ @S@ - , stable :: Bool -- ^ @t@ - , prof :: Bool -- ^ @p@ - , linker :: Bool -- ^ @l@ the object linker - , apply :: Bool -- ^ @a@ - , stm :: Bool -- ^ @m@ - , squeeze :: Bool -- ^ @z@ stack squeezing & lazy blackholing - , hpc :: Bool -- ^ @c@ coverage - , sparks :: Bool -- ^ @r@ + { scheduler :: Bool -- ^ @s@ + , interpreter :: Bool -- ^ @i@ + , weak :: Bool -- ^ @w@ + , gccafs :: Bool -- ^ @G@ + , gc :: Bool -- ^ @g@ + , nonmoving_gc :: Bool -- ^ @n@ + , block_alloc :: Bool -- ^ @b@ + , sanity :: Bool -- ^ @S@ + , stable :: Bool -- ^ @t@ + , prof :: Bool -- ^ @p@ + , linker :: Bool -- ^ @l@ the object linker + , apply :: Bool -- ^ @a@ + , stm :: Bool -- ^ @m@ + , squeeze :: Bool -- ^ @z@ stack squeezing & lazy blackholing + , hpc :: Bool -- ^ @c@ coverage + , sparks :: Bool -- ^ @r@ } deriving ( Show -- ^ @since 4.8.0.0 ) @@ -291,6 +292,8 @@ data TraceFlags = TraceFlags , timestamp :: Bool -- ^ show timestamp in stderr output , traceScheduler :: Bool -- ^ trace scheduler events , traceGc :: Bool -- ^ trace GC events + , traceNonmovingGc + :: Bool -- ^ trace nonmoving GC heap census samples , sparksSampled :: Bool -- ^ trace spark events by a sampled method , sparksFull :: Bool -- ^ trace spark events 100% accurately , user :: Bool -- ^ trace user events (emitted from Haskell code) @@ -462,6 +465,8 @@ getDebugFlags = do (#{peek DEBUG_FLAGS, gccafs} ptr :: IO CBool)) <*> (toBool <$> (#{peek DEBUG_FLAGS, gc} ptr :: IO CBool)) + <*> (toBool <$> + (#{peek DEBUG_FLAGS, nonmoving_gc} ptr :: IO CBool)) <*> (toBool <$> (#{peek DEBUG_FLAGS, block_alloc} ptr :: IO CBool)) <*> (toBool <$> @@ -522,6 +527,8 @@ getTraceFlags = do (#{peek TRACE_FLAGS, scheduler} ptr :: IO CBool)) <*> (toBool <$> (#{peek TRACE_FLAGS, gc} ptr :: IO CBool)) + <*> (toBool <$> + (#{peek TRACE_FLAGS, nonmoving_gc} ptr :: IO CBool)) <*> (toBool <$> (#{peek TRACE_FLAGS, sparks_sampled} ptr :: IO CBool)) <*> (toBool <$> diff --git a/libraries/base/tests/all.T b/libraries/base/tests/all.T index 61367e44918d87b64cbdf357cc9ac54e5eeef4c5..2d4119e54366fb022b498eda2f6e5e31d496a52a 100644 --- a/libraries/base/tests/all.T +++ b/libraries/base/tests/all.T @@ -74,7 +74,7 @@ test('length001', # excessive amounts of stack space. So we specifically set a low # stack limit and mark it as failing under a few conditions. 
[extra_run_opts('+RTS -K8m -RTS'), - expect_fail_for(['normal', 'threaded1', 'llvm'])], + expect_fail_for(['normal', 'threaded1', 'llvm', 'nonmoving', 'nonmoving_thr', 'nonmoving_thr_ghc'])], compile_and_run, ['']) test('ratio001', normal, compile_and_run, ['']) diff --git a/libraries/ghc-heap/tests/all.T b/libraries/ghc-heap/tests/all.T index afa224fde7dc218a211052fd1ba14d0e6bc35d6f..89e6f47ecb028f5164b1fbb4a7bdeb88c14625c9 100644 --- a/libraries/ghc-heap/tests/all.T +++ b/libraries/ghc-heap/tests/all.T @@ -2,7 +2,11 @@ test('heap_all', [when(have_profiling(), extra_ways(['prof'])), # These ways produce slightly different heap representations. # Currently we don't test them. - omit_ways(['ghci', 'hpc']) + omit_ways(['ghci', 'hpc', + 'nonmoving', 'nonmoving_thr', 'nonmoving_thr_ghc']), + # The debug RTS initializes some fields with 0xaa and so + # this test spuriously fails. + when(compiler_debugged(), skip) ], compile_and_run, ['']) diff --git a/libraries/stm b/libraries/stm index a925aaa505d9259f26e2f3fb2ffa2e9b66b48749..f9979c926ca539362b5a2412359750e8b498e53a 160000 --- a/libraries/stm +++ b/libraries/stm @@ -1 +1 @@ -Subproject commit a925aaa505d9259f26e2f3fb2ffa2e9b66b48749 +Subproject commit f9979c926ca539362b5a2412359750e8b498e53a diff --git a/rts/Apply.cmm b/rts/Apply.cmm index 8d7fc3c01224a42fccb321a79ed700c076a1e0f8..eeb760c5ed073539e2fa7c843275474d12a315a9 100644 --- a/rts/Apply.cmm +++ b/rts/Apply.cmm @@ -654,6 +654,8 @@ INFO_TABLE(stg_AP_STACK,/*special layout*/0,0,AP_STACK,"AP_STACK","AP_STACK") /* someone else beat us to it */ jump ENTRY_LBL(stg_WHITEHOLE) (ap); } + // Can't add StgInd_indirectee(ap) to UpdRemSet here because the old value is + // not reachable. StgInd_indirectee(ap) = CurrentTSO; prim_write_barrier; SET_INFO(ap, __stg_EAGER_BLACKHOLE_info); diff --git a/rts/Capability.c b/rts/Capability.c index 33a94398cd17705fa7ce9fa2ab8794f2a61f6814..0baa4ef205c8f64361e442f18af6b77d83bfd0a5 100644 --- a/rts/Capability.c +++ b/rts/Capability.c @@ -27,6 +27,7 @@ #include "STM.h" #include "RtsUtils.h" #include "sm/OSMem.h" +#include "sm/BlockAlloc.h" // for countBlocks() #if !defined(mingw32_HOST_OS) #include "rts/IOManager.h" // for setIOManagerControlFd() @@ -291,6 +292,11 @@ initCapability (Capability *cap, uint32_t i) RtsFlags.GcFlags.generations, "initCapability"); + + // At this point storage manager is not initialized yet, so this will be + // initialized in initStorage(). + cap->upd_rem_set.queue.blocks = NULL; + for (g = 0; g < RtsFlags.GcFlags.generations; g++) { cap->mut_lists[g] = NULL; } @@ -748,6 +754,8 @@ static Capability * waitForReturnCapability (Task *task) * result of the external call back to the Haskell thread that * made it. * + * pCap is strictly an output. + * * ------------------------------------------------------------------------- */ void waitForCapability (Capability **pCap, Task *task) @@ -840,6 +848,9 @@ void waitForCapability (Capability **pCap, Task *task) * SYNC_GC_PAR), either to do a sequential GC, forkProcess, or * setNumCapabilities. We should give up the Capability temporarily. * + * When yieldCapability returns *pCap will have been updated to the new + * capability held by the caller. + * * ------------------------------------------------------------------------- */ #if defined(THREADED_RTS) @@ -855,16 +866,27 @@ yieldCapability (Capability** pCap, Task *task, bool gcAllowed) { PendingSync *sync = pending_sync; - if (sync && sync->type == SYNC_GC_PAR) { - if (! 
sync->idle[cap->no]) { - traceEventGcStart(cap); - gcWorkerThread(cap); - traceEventGcEnd(cap); - traceSparkCounters(cap); - // See Note [migrated bound threads 2] - if (task->cap == cap) { - return true; + if (sync) { + switch (sync->type) { + case SYNC_GC_PAR: + if (! sync->idle[cap->no]) { + traceEventGcStart(cap); + gcWorkerThread(cap); + traceEventGcEnd(cap); + traceSparkCounters(cap); + // See Note [migrated bound threads 2] + if (task->cap == cap) { + return true; + } } + break; + + case SYNC_FLUSH_UPD_REM_SET: + debugTrace(DEBUG_nonmoving_gc, "Flushing update remembered set blocks..."); + break; + + default: + break; } } } diff --git a/rts/Capability.h b/rts/Capability.h index 0833006b0c848a03a8d3e1a20ac5efed4cd98cfc..3078680aa6d2bb7e1e4072b1ce596613cfa4f2b2 100644 --- a/rts/Capability.h +++ b/rts/Capability.h @@ -23,6 +23,7 @@ #include "sm/GC.h" // for evac_fn #include "Task.h" #include "Sparks.h" +#include "sm/NonMovingMark.h" // for MarkQueue #include "BeginPrivate.h" @@ -84,6 +85,9 @@ struct Capability_ { bdescr **mut_lists; bdescr **saved_mut_lists; // tmp use during GC + // The update remembered set for the non-moving collector + UpdRemSet upd_rem_set; + // block for allocating pinned objects into bdescr *pinned_object_block; // full pinned object blocks allocated since the last GC @@ -258,7 +262,8 @@ extern Capability **capabilities; typedef enum { SYNC_OTHER, SYNC_GC_SEQ, - SYNC_GC_PAR + SYNC_GC_PAR, + SYNC_FLUSH_UPD_REM_SET } SyncType; // diff --git a/rts/Exception.cmm b/rts/Exception.cmm index 8ea94b19f279d66ca34126486846b249fcb1f8bd..334d0ef82360dc4a9815880202fb5ec2270e6f28 100644 --- a/rts/Exception.cmm +++ b/rts/Exception.cmm @@ -318,6 +318,7 @@ stg_killThreadzh (P_ target, P_ exception) return (); } else { StgTSO_why_blocked(CurrentTSO) = BlockedOnMsgThrowTo; + updateRemembSetPushPtr(StgTSO_block_info(CurrentTSO)); StgTSO_block_info(CurrentTSO) = msg; // we must block, and unlock the message before returning jump stg_block_throwto (target, exception); @@ -489,6 +490,8 @@ retry_pop_stack: ccall stmAbortTransaction(MyCapability() "ptr", trec "ptr"); ccall stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr"); + // No need to push `trec` to update remembered set; it will be no longer + // reachable after we overwrite StgTSO.trec. StgTSO_trec(CurrentTSO) = NO_TREC; if (r != 0) { // Transaction was valid: continue searching for a catch frame @@ -607,6 +610,8 @@ retry_pop_stack: outer = StgTRecHeader_enclosing_trec(trec); ccall stmAbortTransaction(MyCapability() "ptr", trec "ptr"); ccall stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr"); + // No need to push `trec` to update remembered set since we just freed + // it; it is no longer reachable. StgTSO_trec(CurrentTSO) = outer; Sp = Sp + SIZEOF_StgCatchSTMFrame; } diff --git a/rts/Messages.c b/rts/Messages.c index d878db5eda2ea9a4ad9b9f9daa29273e4772f3e9..374f3d673e7a31dd854898566f5f53afcfae75ca 100644 --- a/rts/Messages.c +++ b/rts/Messages.c @@ -244,8 +244,8 @@ loop: // a barrier is necessary to ensure that all writes are visible. // See Note [Heap memory barriers] in SMP.h. write_barrier(); + dirty_TSO(cap, owner); // we will modify owner->bq owner->bq = bq; - dirty_TSO(cap, owner); // we modified owner->bq // If the owner of the blackhole is currently runnable, then // bump it to the front of the run queue. This gives the @@ -262,6 +262,9 @@ loop: // point to the BLOCKING_QUEUE from the BLACKHOLE write_barrier(); // make the BQ visible, see Note [Heap memory barriers]. 
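+    // Nonmoving collector write barrier: p still holds the BLACKHOLE's old
+    // indirectee, which the store below overwrites with the BLOCKING_QUEUE.
+    // Push it to the update remembered set so the concurrent mark phase can
+    // still trace it (snapshot invariant).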
+ IF_NONMOVING_WRITE_BARRIER_ENABLED { + updateRemembSetPushClosure(cap, (StgClosure*)p); + } ((StgInd*)bh)->indirectee = (StgClosure *)bq; recordClosureMutated(cap,bh); // bh was mutated @@ -290,6 +293,11 @@ loop: } #endif + IF_NONMOVING_WRITE_BARRIER_ENABLED { + // We are about to overwrite bq->queue; make sure its current value + // makes it into the update remembered set + updateRemembSetPushClosure(cap, (StgClosure*)bq->queue); + } msg->link = bq->queue; bq->queue = msg; // No barrier is necessary here: we are only exposing the diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm index ec35ee42b479482cea8ffb6d2ecf4f404a88e60d..b66c561dcb8a6d72e9a59c0e25109ead34d079bf 100644 --- a/rts/PrimOps.cmm +++ b/rts/PrimOps.cmm @@ -349,8 +349,13 @@ stg_casArrayzh ( gcptr arr, W_ ind, gcptr old, gcptr new ) // Compare and Swap Succeeded: SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, CCCS); len = StgMutArrPtrs_ptrs(arr); + // The write barrier. We must write a byte into the mark table: I8[arr + SIZEOF_StgMutArrPtrs + WDS(len) + (ind >> MUT_ARR_PTRS_CARD_BITS )] = 1; + + // Concurrent GC write barrier + updateRemembSetPushPtr(old); + return (0,new); } } @@ -462,16 +467,45 @@ stg_thawSmallArrayzh ( gcptr src, W_ offset, W_ n ) cloneSmallArray(stg_SMALL_MUT_ARR_PTRS_DIRTY_info, src, offset, n) } +// Concurrent GC write barrier for pointer array copies +// +// hdr_size in bytes. dst_off in words, n in words. +stg_copyArray_barrier ( W_ hdr_size, gcptr dst, W_ dst_off, W_ n) +{ + W_ end, p; + ASSERT(n > 0); // Assumes n==0 is handled by caller + p = dst + hdr_size + WDS(dst_off); + end = p + WDS(n); + +again: + IF_NONMOVING_WRITE_BARRIER_ENABLED { + ccall updateRemembSetPushClosure_(BaseReg "ptr", W_[p] "ptr"); + } + p = p + WDS(1); + if (p < end) { + goto again; + } + + return (); +} + stg_copySmallArrayzh ( gcptr src, W_ src_off, gcptr dst, W_ dst_off, W_ n) { W_ dst_p, src_p, bytes; - SET_INFO(dst, stg_SMALL_MUT_ARR_PTRS_DIRTY_info); + if (n > 0) { + IF_NONMOVING_WRITE_BARRIER_ENABLED { + call stg_copyArray_barrier(SIZEOF_StgSmallMutArrPtrs, + dst, dst_off, n); + } - dst_p = dst + SIZEOF_StgSmallMutArrPtrs + WDS(dst_off); - src_p = src + SIZEOF_StgSmallMutArrPtrs + WDS(src_off); - bytes = WDS(n); - prim %memcpy(dst_p, src_p, bytes, SIZEOF_W); + SET_INFO(dst, stg_SMALL_MUT_ARR_PTRS_DIRTY_info); + + dst_p = dst + SIZEOF_StgSmallMutArrPtrs + WDS(dst_off); + src_p = src + SIZEOF_StgSmallMutArrPtrs + WDS(src_off); + bytes = WDS(n); + prim %memcpy(dst_p, src_p, bytes, SIZEOF_W); + } return (); } @@ -480,15 +514,22 @@ stg_copySmallMutableArrayzh ( gcptr src, W_ src_off, gcptr dst, W_ dst_off, W_ n { W_ dst_p, src_p, bytes; - SET_INFO(dst, stg_SMALL_MUT_ARR_PTRS_DIRTY_info); + if (n > 0) { + IF_NONMOVING_WRITE_BARRIER_ENABLED { + call stg_copyArray_barrier(SIZEOF_StgSmallMutArrPtrs, + dst, dst_off, n); + } - dst_p = dst + SIZEOF_StgSmallMutArrPtrs + WDS(dst_off); - src_p = src + SIZEOF_StgSmallMutArrPtrs + WDS(src_off); - bytes = WDS(n); - if (src == dst) { - prim %memmove(dst_p, src_p, bytes, SIZEOF_W); - } else { - prim %memcpy(dst_p, src_p, bytes, SIZEOF_W); + SET_INFO(dst, stg_SMALL_MUT_ARR_PTRS_DIRTY_info); + + dst_p = dst + SIZEOF_StgSmallMutArrPtrs + WDS(dst_off); + src_p = src + SIZEOF_StgSmallMutArrPtrs + WDS(src_off); + bytes = WDS(n); + if (src == dst) { + prim %memmove(dst_p, src_p, bytes, SIZEOF_W); + } else { + prim %memcpy(dst_p, src_p, bytes, SIZEOF_W); + } } return (); @@ -510,6 +551,10 @@ stg_casSmallArrayzh ( gcptr arr, W_ ind, gcptr old, gcptr new ) } else { // Compare and Swap Succeeded: 
SET_HDR(arr, stg_SMALL_MUT_ARR_PTRS_DIRTY_info, CCCS); + + // Concurrent GC write barrier + updateRemembSetPushPtr(old); + return (0,new); } } @@ -549,7 +594,7 @@ stg_casMutVarzh ( gcptr mv, gcptr old, gcptr new ) return (1,h); } else { if (GET_INFO(mv) == stg_MUT_VAR_CLEAN_info) { - ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr"); + ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr", old); } return (0,new); } @@ -562,7 +607,7 @@ stg_casMutVarzh ( gcptr mv, gcptr old, gcptr new ) } else { StgMutVar_var(mv) = new; if (GET_INFO(mv) == stg_MUT_VAR_CLEAN_info) { - ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr"); + ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr", old); } return (0,new); } @@ -629,11 +674,12 @@ stg_atomicModifyMutVar2zh ( gcptr mv, gcptr f ) (h) = prim %cmpxchgW(mv + SIZEOF_StgHeader + OFFSET_StgMutVar_var, x, y); if (h != x) { goto retry; } #else + h = StgMutVar_var(mv); StgMutVar_var(mv) = y; #endif if (GET_INFO(mv) == stg_MUT_VAR_CLEAN_info) { - ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr"); + ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr", h); } return (x,z); @@ -755,6 +801,9 @@ stg_addCFinalizzerToWeakzh ( W_ fptr, // finalizer return (0); } + // Write barrier for concurrent non-moving collector + updateRemembSetPushPtr(StgWeak_cfinalizers(w)) + StgCFinalizerList_link(c) = StgWeak_cfinalizers(w); StgWeak_cfinalizers(w) = c; @@ -835,6 +884,8 @@ stg_deRefWeakzh ( gcptr w ) if (info == stg_WEAK_info) { code = 1; val = StgWeak_value(w); + // See Note [Concurrent read barrier on deRefWeak#] in NonMovingMark.c + updateRemembSetPushPtr(val); } else { code = 0; val = w; @@ -1515,7 +1566,7 @@ stg_takeMVarzh ( P_ mvar /* :: MVar a */ ) */ if (StgMVar_value(mvar) == stg_END_TSO_QUEUE_closure) { if (info == stg_MVAR_CLEAN_info) { - ccall dirty_MVAR(BaseReg "ptr", mvar "ptr"); + ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", StgMVar_value(mvar) "ptr"); } // We want to put the heap check down here in the slow path, @@ -1561,6 +1612,9 @@ loop: // If the MVar is not already dirty, then we don't need to make // it dirty, as it is empty with nothing blocking on it. unlockClosure(mvar, info); + // However, we do need to ensure that the nonmoving collector + // knows about the reference to the value that we just removed... + updateRemembSetPushPtr(val); return (val); } qinfo = StgHeader_info(q); @@ -1574,7 +1628,7 @@ loop: // There are putMVar(s) waiting... wake up the first thread on the queue if (info == stg_MVAR_CLEAN_info) { - ccall dirty_MVAR(BaseReg "ptr", mvar "ptr"); + ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", val "ptr"); } tso = StgMVarTSOQueue_tso(q); @@ -1643,7 +1697,7 @@ loop: // There are putMVar(s) waiting... wake up the first thread on the queue if (info == stg_MVAR_CLEAN_info) { - ccall dirty_MVAR(BaseReg "ptr", mvar "ptr"); + ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", val "ptr"); } tso = StgMVarTSOQueue_tso(q); @@ -1681,7 +1735,7 @@ stg_putMVarzh ( P_ mvar, /* :: MVar a */ if (StgMVar_value(mvar) != stg_END_TSO_QUEUE_closure) { if (info == stg_MVAR_CLEAN_info) { - ccall dirty_MVAR(BaseReg "ptr", mvar "ptr"); + ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", StgMVar_value(mvar) "ptr"); } // We want to put the heap check down here in the slow path, @@ -1715,14 +1769,20 @@ stg_putMVarzh ( P_ mvar, /* :: MVar a */ jump stg_block_putmvar(mvar,val); } + // We are going to mutate the closure, make sure its current pointers + // are marked. 
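+    // (update_MVAR is presumably the helper that records the MVar's current
+    // value in the update remembered set when the nonmoving write barrier is
+    // enabled; it is not shown in this hunk.)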
+ if (info == stg_MVAR_CLEAN_info) { + ccall update_MVAR(BaseReg "ptr", mvar "ptr", StgMVar_value(mvar) "ptr"); + } + q = StgMVar_head(mvar); loop: if (q == stg_END_TSO_QUEUE_closure) { /* No further takes, the MVar is now full. */ + StgMVar_value(mvar) = val; if (info == stg_MVAR_CLEAN_info) { - ccall dirty_MVAR(BaseReg "ptr", mvar "ptr"); + ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", StgMVar_value(mvar) "ptr"); } - StgMVar_value(mvar) = val; unlockClosure(mvar, stg_MVAR_DIRTY_info); return (); } @@ -1758,7 +1818,7 @@ loop: // indicate that the MVar operation has now completed. StgTSO__link(tso) = stg_END_TSO_QUEUE_closure; - if (TO_W_(StgStack_dirty(stack)) == 0) { + if ((TO_W_(StgStack_dirty(stack)) & STACK_DIRTY) == 0) { ccall dirty_STACK(MyCapability() "ptr", stack "ptr"); } @@ -1804,7 +1864,7 @@ loop: if (q == stg_END_TSO_QUEUE_closure) { /* No further takes, the MVar is now full. */ if (info == stg_MVAR_CLEAN_info) { - ccall dirty_MVAR(BaseReg "ptr", mvar "ptr"); + ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", StgMVar_value(mvar) "ptr"); } StgMVar_value(mvar) = val; @@ -1843,7 +1903,7 @@ loop: // indicate that the MVar operation has now completed. StgTSO__link(tso) = stg_END_TSO_QUEUE_closure; - if (TO_W_(StgStack_dirty(stack)) == 0) { + if ((TO_W_(StgStack_dirty(stack)) & STACK_DIRTY) == 0) { ccall dirty_STACK(MyCapability() "ptr", stack "ptr"); } @@ -1875,7 +1935,7 @@ stg_readMVarzh ( P_ mvar, /* :: MVar a */ ) if (StgMVar_value(mvar) == stg_END_TSO_QUEUE_closure) { if (info == stg_MVAR_CLEAN_info) { - ccall dirty_MVAR(BaseReg "ptr", mvar "ptr"); + ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", StgMVar_value(mvar)); } ALLOC_PRIM_WITH_CUSTOM_FAILURE diff --git a/rts/RaiseAsync.c b/rts/RaiseAsync.c index 807c3e3d30a2783c3234ab758312980674947e7f..50cddff0513458179d238e1141d6774f200f7f5b 100644 --- a/rts/RaiseAsync.c +++ b/rts/RaiseAsync.c @@ -515,9 +515,9 @@ blockedThrowTo (Capability *cap, StgTSO *target, MessageThrowTo *msg) ASSERT(target->cap == cap); + dirty_TSO(cap,target); // we will modify the blocked_exceptions queue msg->link = target->blocked_exceptions; target->blocked_exceptions = msg; - dirty_TSO(cap,target); // we modified the blocked_exceptions queue } /* ----------------------------------------------------------------------------- diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c index d36e9ffc6626d71a7517bb8fbbd2d54556c9504c..0e28b980ac6ee4174809ebfc7dbfe0f1a9ccaad6 100644 --- a/rts/RtsFlags.c +++ b/rts/RtsFlags.c @@ -156,6 +156,8 @@ void initRtsFlagsDefaults(void) RtsFlags.GcFlags.heapSizeSuggestionAuto = false; RtsFlags.GcFlags.pcFreeHeap = 3; /* 3% */ RtsFlags.GcFlags.oldGenFactor = 2; + RtsFlags.GcFlags.useNonmoving = false; + RtsFlags.GcFlags.nonmovingSelectorOpt = false; RtsFlags.GcFlags.generations = 2; RtsFlags.GcFlags.squeezeUpdFrames = true; RtsFlags.GcFlags.compact = false; @@ -179,6 +181,7 @@ void initRtsFlagsDefaults(void) RtsFlags.DebugFlags.weak = false; RtsFlags.DebugFlags.gccafs = false; RtsFlags.DebugFlags.gc = false; + RtsFlags.DebugFlags.nonmoving_gc = false; RtsFlags.DebugFlags.block_alloc = false; RtsFlags.DebugFlags.sanity = false; RtsFlags.DebugFlags.zero_on_gc = false; @@ -220,6 +223,7 @@ void initRtsFlagsDefaults(void) RtsFlags.TraceFlags.timestamp = false; RtsFlags.TraceFlags.scheduler = false; RtsFlags.TraceFlags.gc = false; + RtsFlags.TraceFlags.nonmoving_gc = false; RtsFlags.TraceFlags.sparks_sampled= false; RtsFlags.TraceFlags.sparks_full = false; RtsFlags.TraceFlags.user = false; @@ -299,6 +303,7 @@ usage_text[] = { " -xb<addr> Sets the address 
from which a suitable start for the heap memory", " will be searched from. This is useful if the default address", " clashes with some third-party library.", +" -xn Use the non-moving collector for the old generation.", " -m<n> Minimum % of heap which must be available (default 3%)", " -G<n> Number of generations (default: 2)", " -c<n> Use in-place compaction instead of copying in the oldest generation", @@ -404,6 +409,7 @@ usage_text[] = { " -Dw DEBUG: weak", " -DG DEBUG: gccafs", " -Dg DEBUG: gc", +" -Dn DEBUG: non-moving gc", " -Db DEBUG: block", " -DS DEBUG: sanity", " -DZ DEBUG: zero freed memory during GC", @@ -1533,6 +1539,16 @@ error = true; break; #endif + case 'n': + OPTION_SAFE; + RtsFlags.GcFlags.useNonmoving = true; + unchecked_arg_start++; + if (rts_argv[arg][3] == 's') { + RtsFlags.GcFlags.nonmovingSelectorOpt = true; + unchecked_arg_start++; + } + break; + case 'c': /* Debugging tool: show current cost centre on an exception */ OPTION_SAFE; @@ -1706,6 +1722,16 @@ static void normaliseRtsOpts (void) if (RtsFlags.MiscFlags.generate_dump_file) { RtsFlags.MiscFlags.install_seh_handlers = true; } + + if (RtsFlags.GcFlags.useNonmoving && RtsFlags.GcFlags.generations == 1) { + barf("The non-moving collector doesn't support -G1"); + } + + if (RtsFlags.GcFlags.compact && RtsFlags.GcFlags.useNonmoving) { + errorBelch("The non-moving collector cannot be used in conjunction with\n" + "the compacting collector."); + errorUsage(); + } } static void errorUsage (void) @@ -1871,6 +1897,9 @@ static void read_debug_flags(const char* arg) case 'g': RtsFlags.DebugFlags.gc = true; break; + case 'n': + RtsFlags.DebugFlags.nonmoving_gc = true; + break; case 'b': RtsFlags.DebugFlags.block_alloc = true; break; @@ -2108,6 +2137,10 @@ static void read_trace_flags(const char *arg) RtsFlags.TraceFlags.gc = enabled; enabled = true; break; + case 'n': + RtsFlags.TraceFlags.nonmoving_gc = enabled; + enabled = true; + break; case 'u': RtsFlags.TraceFlags.user = enabled; enabled = true; diff --git a/rts/RtsStartup.c b/rts/RtsStartup.c index a202d5396068f5356b299551960d34cc5a2ec6e3..d0d08a249594394e3fdc505f93f02d84be82ce82 100644 --- a/rts/RtsStartup.c +++ b/rts/RtsStartup.c @@ -392,7 +392,8 @@ hs_exit_(bool wait_foreign) ioManagerDie(); #endif - /* stop all running tasks */ + /* stop all running tasks. 
This is also where we stop concurrent non-moving + * collection if it's running */ exitScheduler(wait_foreign); /* run C finalizers for all active weak pointers */ diff --git a/rts/RtsSymbols.c b/rts/RtsSymbols.c index 4da4258e9525cd4b747d8e14df17258c50984446..0611de11cca41c3e0cfa78dee5f12fb1f706e267 100644 --- a/rts/RtsSymbols.c +++ b/rts/RtsSymbols.c @@ -14,6 +14,7 @@ #include "HsFFI.h" #include "sm/Storage.h" +#include "sm/NonMovingMark.h" #include <stdbool.h> #if !defined(mingw32_HOST_OS) @@ -716,6 +717,9 @@ SymI_HasProto(stg_shrinkMutableByteArrayzh) \ SymI_HasProto(stg_resizzeMutableByteArrayzh) \ SymI_HasProto(newSpark) \ + SymI_HasProto(updateRemembSetPushThunk) \ + SymI_HasProto(updateRemembSetPushThunk_) \ + SymI_HasProto(updateRemembSetPushClosure_) \ SymI_HasProto(performGC) \ SymI_HasProto(performMajorGC) \ SymI_HasProto(prog_argc) \ @@ -1071,6 +1075,7 @@ RtsSymbolVal rtsSyms[] = { RTS_OPENBSD_ONLY_SYMBOLS RTS_LIBGCC_SYMBOLS RTS_LIBFFI_SYMBOLS + SymI_HasDataProto(nonmoving_write_barrier_enabled) #if defined(darwin_HOST_OS) && defined(i386_HOST_ARCH) // dyld stub code contains references to this, // but it should never be called because we treat diff --git a/rts/STM.c b/rts/STM.c index dc0b0ebb7876672e1de27abf07f66288ba9b72af..1dde70b485f903c75d8c6307b5c9da4c4ae72b72 100644 --- a/rts/STM.c +++ b/rts/STM.c @@ -182,7 +182,8 @@ static void unlock_stm(StgTRecHeader *trec STG_UNUSED) { TRACE("%p : unlock_stm()", trec); } -static StgClosure *lock_tvar(StgTRecHeader *trec STG_UNUSED, +static StgClosure *lock_tvar(Capability *cap STG_UNUSED, + StgTRecHeader *trec STG_UNUSED, StgTVar *s STG_UNUSED) { StgClosure *result; TRACE("%p : lock_tvar(%p)", trec, s); @@ -197,12 +198,14 @@ static void unlock_tvar(Capability *cap, StgBool force_update) { TRACE("%p : unlock_tvar(%p)", trec, s); if (force_update) { + StgClosure *old_value = s -> current_value; s -> current_value = c; - dirty_TVAR(cap,s); + dirty_TVAR(cap, s, old_value); } } -static StgBool cond_lock_tvar(StgTRecHeader *trec STG_UNUSED, +static StgBool cond_lock_tvar(Capability *cap STG_UNUSED, + StgTRecHeader *trec STG_UNUSED, StgTVar *s STG_UNUSED, StgClosure *expected) { StgClosure *result; @@ -231,7 +234,8 @@ static void unlock_stm(StgTRecHeader *trec STG_UNUSED) { smp_locked = 0; } -static StgClosure *lock_tvar(StgTRecHeader *trec STG_UNUSED, +static StgClosure *lock_tvar(Capability *cap STG_UNUSED, + StgTRecHeader *trec STG_UNUSED, StgTVar *s STG_UNUSED) { StgClosure *result; TRACE("%p : lock_tvar(%p)", trec, s); @@ -248,12 +252,14 @@ static void *unlock_tvar(Capability *cap, TRACE("%p : unlock_tvar(%p, %p)", trec, s, c); ASSERT(smp_locked == trec); if (force_update) { + StgClosure *old_value = s -> current_value; s -> current_value = c; - dirty_TVAR(cap,s); + dirty_TVAR(cap, s, old_value); } } -static StgBool cond_lock_tvar(StgTRecHeader *trec STG_UNUSED, +static StgBool cond_lock_tvar(Capability *cap STG_UNUSED, + StgTRecHeader *trec STG_UNUSED, StgTVar *s STG_UNUSED, StgClosure *expected) { StgClosure *result; @@ -279,7 +285,8 @@ static void unlock_stm(StgTRecHeader *trec STG_UNUSED) { TRACE("%p : unlock_stm()", trec); } -static StgClosure *lock_tvar(StgTRecHeader *trec, +static StgClosure *lock_tvar(Capability *cap, + StgTRecHeader *trec, StgTVar *s STG_UNUSED) { StgClosure *result; TRACE("%p : lock_tvar(%p)", trec, s); @@ -289,6 +296,12 @@ static StgClosure *lock_tvar(StgTRecHeader *trec, } while (GET_INFO(UNTAG_CLOSURE(result)) == &stg_TREC_HEADER_info); } while (cas((void *)&(s -> current_value), (StgWord)result, 
(StgWord)trec) != (StgWord)result); + + + IF_NONMOVING_WRITE_BARRIER_ENABLED { + if (result) + updateRemembSetPushClosure(cap, result); + } return result; } @@ -300,10 +313,11 @@ static void unlock_tvar(Capability *cap, TRACE("%p : unlock_tvar(%p, %p)", trec, s, c); ASSERT(s -> current_value == (StgClosure *)trec); s -> current_value = c; - dirty_TVAR(cap,s); + dirty_TVAR(cap, s, (StgClosure *) trec); } -static StgBool cond_lock_tvar(StgTRecHeader *trec, +static StgBool cond_lock_tvar(Capability *cap, + StgTRecHeader *trec, StgTVar *s, StgClosure *expected) { StgClosure *result; @@ -311,6 +325,10 @@ static StgBool cond_lock_tvar(StgTRecHeader *trec, TRACE("%p : cond_lock_tvar(%p, %p)", trec, s, expected); w = cas((void *)&(s -> current_value), (StgWord)expected, (StgWord)trec); result = (StgClosure *)w; + IF_NONMOVING_WRITE_BARRIER_ENABLED { + if (result) + updateRemembSetPushClosure(cap, expected); + } TRACE("%p : %s", trec, result ? "success" : "failure"); return (result == expected); } @@ -525,7 +543,7 @@ static void build_watch_queue_entries_for_trec(Capability *cap, } s -> first_watch_queue_entry = q; e -> new_value = (StgClosure *) q; - dirty_TVAR(cap,s); // we modified first_watch_queue_entry + dirty_TVAR(cap, s, (StgClosure *) fq); // we modified first_watch_queue_entry }); } @@ -545,7 +563,7 @@ static void remove_watch_queue_entries_for_trec(Capability *cap, StgTVarWatchQueue *q; StgClosure *saw; s = e -> tvar; - saw = lock_tvar(trec, s); + saw = lock_tvar(cap, trec, s); q = (StgTVarWatchQueue *) (e -> new_value); TRACE("%p : removing tso=%p from watch queue for tvar=%p", trec, @@ -562,7 +580,7 @@ static void remove_watch_queue_entries_for_trec(Capability *cap, } else { ASSERT(s -> first_watch_queue_entry == q); s -> first_watch_queue_entry = nq; - dirty_TVAR(cap,s); // we modified first_watch_queue_entry + dirty_TVAR(cap, s, (StgClosure *) q); // we modified first_watch_queue_entry } free_stg_tvar_watch_queue(cap, q); unlock_tvar(cap, trec, s, saw, false); @@ -773,7 +791,7 @@ static StgBool validate_and_acquire_ownership (Capability *cap, s = e -> tvar; if (acquire_all || entry_is_update(e)) { TRACE("%p : trying to acquire %p", trec, s); - if (!cond_lock_tvar(trec, s, e -> expected_value)) { + if (!cond_lock_tvar(cap, trec, s, e -> expected_value)) { TRACE("%p : failed to acquire %p", trec, s); result = false; BREAK_FOR_EACH; diff --git a/rts/Schedule.c b/rts/Schedule.c index eced4d4fb6484a247cf866de2d226eecbfeebeba..9323915dfefdc2bf939d28d4c7a045fa74eb16d9 100644 --- a/rts/Schedule.c +++ b/rts/Schedule.c @@ -44,6 +44,8 @@ #include "StablePtr.h" #include "StableName.h" #include "TopHandler.h" +#include "sm/NonMoving.h" +#include "sm/NonMovingMark.h" #if defined(HAVE_SYS_TYPES_H) #include <sys/types.h> @@ -110,6 +112,19 @@ Mutex sched_mutex; #define FORKPROCESS_PRIMOP_SUPPORTED #endif +/* + * sync_finished_cond allows threads which do not own any capability (e.g. the + * concurrent mark thread) to participate in the sync protocol. In particular, + * if such a thread requests a sync while sync is already in progress it will + * block on sync_finished_cond, which will be signalled when the sync is + * finished (by releaseAllCapabilities). 
+ */ +#if defined(THREADED_RTS) +static Condition sync_finished_cond; +static Mutex sync_finished_mutex; +#endif + + /* ----------------------------------------------------------------------------- * static function prototypes * -------------------------------------------------------------------------- */ @@ -130,7 +145,6 @@ static void scheduleYield (Capability **pcap, Task *task); static bool requestSync (Capability **pcap, Task *task, PendingSync *sync_type, SyncType *prev_sync_type); static void acquireAllCapabilities(Capability *cap, Task *task); -static void releaseAllCapabilities(uint32_t n, Capability *cap, Task *task); static void startWorkerTasks (uint32_t from USED_IF_THREADS, uint32_t to USED_IF_THREADS); #endif @@ -150,7 +164,8 @@ static void scheduleHandleThreadBlocked( StgTSO *t ); static bool scheduleHandleThreadFinished( Capability *cap, Task *task, StgTSO *t ); static bool scheduleNeedHeapProfile(bool ready_to_gc); -static void scheduleDoGC(Capability **pcap, Task *task, bool force_major); +static void scheduleDoGC( Capability **pcap, Task *task, + bool force_major, bool deadlock_detect ); static void deleteThread (StgTSO *tso); static void deleteAllThreads (void); @@ -250,7 +265,7 @@ schedule (Capability *initialCapability, Task *task) case SCHED_INTERRUPTING: debugTrace(DEBUG_sched, "SCHED_INTERRUPTING"); /* scheduleDoGC() deletes all the threads */ - scheduleDoGC(&cap,task,true); + scheduleDoGC(&cap,task,true,false); // after scheduleDoGC(), we must be shutting down. Either some // other Capability did the final GC, or we did it above, @@ -547,7 +562,7 @@ run_thread: } if (ready_to_gc || scheduleNeedHeapProfile(ready_to_gc)) { - scheduleDoGC(&cap,task,false); + scheduleDoGC(&cap,task,false,false); } } /* end of while() */ } @@ -921,7 +936,7 @@ scheduleDetectDeadlock (Capability **pcap, Task *task) // they are unreachable and will therefore be sent an // exception. Any threads thus released will be immediately // runnable. - scheduleDoGC (pcap, task, true/*force major GC*/); + scheduleDoGC (pcap, task, true/*force major GC*/, true/*deadlock detection*/); cap = *pcap; // when force_major == true. scheduleDoGC sets // recent_activity to ACTIVITY_DONE_GC and turns off the timer @@ -991,7 +1006,7 @@ scheduleProcessInbox (Capability **pcap USED_IF_THREADS) while (!emptyInbox(cap)) { // Executing messages might use heap, so we should check for GC. if (doYouWantToGC(cap)) { - scheduleDoGC(pcap, cap->running_task, false); + scheduleDoGC(pcap, cap->running_task, false, false); cap = *pcap; } @@ -1368,17 +1383,24 @@ scheduleNeedHeapProfile( bool ready_to_gc ) * change to the system, such as altering the number of capabilities, or * forking. * + * pCap may be NULL in the event that the caller doesn't yet own a capability. + * * To resume after stopAllCapabilities(), use releaseAllCapabilities(). 
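The sync_finished_cond/sync_finished_mutex pair introduced above follows the usual condition-variable handshake. A minimal pthreads sketch, with illustrative names and a plain bool standing in for pending_sync:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t sync_finished_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  sync_finished_cond  = PTHREAD_COND_INITIALIZER;
static bool pending_sync = false;

/* Caller that owns no capability: block until the current sync completes. */
static void wait_for_sync_finished(void)
{
    pthread_mutex_lock(&sync_finished_mutex);
    while (pending_sync)                        /* re-check: wakeups may be spurious */
        pthread_cond_wait(&sync_finished_cond, &sync_finished_mutex);
    pthread_mutex_unlock(&sync_finished_mutex);
}

/* Thread that finishes the sync: clear the flag, then wake all waiters. */
static void finish_sync(void)
{
    pthread_mutex_lock(&sync_finished_mutex);
    pending_sync = false;
    pthread_cond_broadcast(&sync_finished_cond);
    pthread_mutex_unlock(&sync_finished_mutex);
}

The sketch clears the flag and broadcasts while holding the mutex, which is the conventional discipline; the waiter re-checks the predicate in a loop so spurious wakeups are harmless.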
* -------------------------------------------------------------------------- */ #if defined(THREADED_RTS) -static void stopAllCapabilities (Capability **pCap, Task *task) +void stopAllCapabilities (Capability **pCap, Task *task) +{ + stopAllCapabilitiesWith(pCap, task, SYNC_OTHER); +} + +void stopAllCapabilitiesWith (Capability **pCap, Task *task, SyncType sync_type) { bool was_syncing; SyncType prev_sync_type; PendingSync sync = { - .type = SYNC_OTHER, + .type = sync_type, .idle = NULL, .task = task }; @@ -1387,9 +1409,10 @@ static void stopAllCapabilities (Capability **pCap, Task *task) was_syncing = requestSync(pCap, task, &sync, &prev_sync_type); } while (was_syncing); - acquireAllCapabilities(*pCap,task); + acquireAllCapabilities(pCap ? *pCap : NULL, task); pending_sync = 0; + signalCondition(&sync_finished_cond); } #endif @@ -1400,6 +1423,16 @@ static void stopAllCapabilities (Capability **pCap, Task *task) * directly, instead use stopAllCapabilities(). This is used by the GC, which * has some special synchronisation requirements. * + * Note that this can be called in two ways: + * + * - where *pcap points to a capability owned by the caller: in this case + * *prev_sync_type will reflect the in-progress sync type on return, if one + * *was found + * + * - where pcap == NULL: in this case the caller doesn't hold a capability. + * we only return whether or not a pending sync was found and prev_sync_type + * is unchanged. + * * Returns: * false if we successfully got a sync * true if there was another sync request in progress, @@ -1424,13 +1457,25 @@ static bool requestSync ( // After the sync is completed, we cannot read that struct any // more because it has been freed. *prev_sync_type = sync->type; - do { - debugTrace(DEBUG_sched, "someone else is trying to sync (%d)...", - sync->type); - ASSERT(*pcap); - yieldCapability(pcap,task,true); - sync = pending_sync; - } while (sync != NULL); + if (pcap == NULL) { + // The caller does not hold a capability (e.g. may be a concurrent + // mark thread). Consequently we must wait until the pending sync is + // finished before proceeding to ensure we don't loop. + // TODO: Don't busy-wait + ACQUIRE_LOCK(&sync_finished_mutex); + while (pending_sync) { + waitCondition(&sync_finished_cond, &sync_finished_mutex); + } + RELEASE_LOCK(&sync_finished_mutex); + } else { + do { + debugTrace(DEBUG_sched, "someone else is trying to sync (%d)...", + sync->type); + ASSERT(*pcap); + yieldCapability(pcap,task,true); + sync = pending_sync; + } while (sync != NULL); + } // NOTE: task->cap might have changed now return true; @@ -1445,9 +1490,9 @@ static bool requestSync ( /* ----------------------------------------------------------------------------- * acquireAllCapabilities() * - * Grab all the capabilities except the one we already hold. Used - * when synchronising before a single-threaded GC (SYNC_SEQ_GC), and - * before a fork (SYNC_OTHER). + * Grab all the capabilities except the one we already hold (cap may be NULL is + * the caller does not currently hold a capability). Used when synchronising + * before a single-threaded GC (SYNC_SEQ_GC), and before a fork (SYNC_OTHER). * * Only call this after requestSync(), otherwise a deadlock might * ensue if another thread is trying to synchronise. @@ -1477,29 +1522,30 @@ static void acquireAllCapabilities(Capability *cap, Task *task) } } } - task->cap = cap; + task->cap = cap == NULL ? 
tmpcap : cap; } #endif /* ----------------------------------------------------------------------------- - * releaseAllcapabilities() + * releaseAllCapabilities() * - * Assuming this thread holds all the capabilities, release them all except for - * the one passed in as cap. + * Assuming this thread holds all the capabilities, release them all (except for + * the one passed in as keep_cap, if non-NULL). * -------------------------------------------------------------------------- */ #if defined(THREADED_RTS) -static void releaseAllCapabilities(uint32_t n, Capability *cap, Task *task) +void releaseAllCapabilities(uint32_t n, Capability *keep_cap, Task *task) { uint32_t i; for (i = 0; i < n; i++) { - if (cap->no != i) { - task->cap = capabilities[i]; - releaseCapability(capabilities[i]); + Capability *tmpcap = capabilities[i]; + if (keep_cap != tmpcap) { + task->cap = tmpcap; + releaseCapability(tmpcap); } } - task->cap = cap; + task->cap = keep_cap; } #endif @@ -1507,9 +1553,11 @@ static void releaseAllCapabilities(uint32_t n, Capability *cap, Task *task) * Perform a garbage collection if necessary * -------------------------------------------------------------------------- */ +// N.B. See Note [Deadlock detection under nonmoving collector] for rationale +// behind deadlock_detect argument. static void scheduleDoGC (Capability **pcap, Task *task USED_IF_THREADS, - bool force_major) + bool force_major, bool deadlock_detect) { Capability *cap = *pcap; bool heap_census; @@ -1801,9 +1849,10 @@ delete_threads_and_gc: // reset pending_sync *before* GC, so that when the GC threads // emerge they don't immediately re-enter the GC. pending_sync = 0; - GarbageCollect(collect_gen, heap_census, gc_type, cap, idle_cap); + signalCondition(&sync_finished_cond); + GarbageCollect(collect_gen, heap_census, deadlock_detect, gc_type, cap, idle_cap); #else - GarbageCollect(collect_gen, heap_census, 0, cap, NULL); + GarbageCollect(collect_gen, heap_census, deadlock_detect, 0, cap, NULL); #endif // If we're shutting down, don't leave any idle GC work to do. @@ -2453,7 +2502,11 @@ resumeThread (void *task_) tso = incall->suspended_tso; incall->suspended_tso = NULL; incall->suspended_cap = NULL; - tso->_link = END_TSO_QUEUE; // no write barrier reqd + // we will modify tso->_link + IF_NONMOVING_WRITE_BARRIER_ENABLED { + updateRemembSetPushClosure(cap, (StgClosure *)tso->_link); + } + tso->_link = END_TSO_QUEUE; traceEventRunThread(cap, tso); @@ -2627,6 +2680,8 @@ initScheduler(void) /* Initialise the mutex and condition variables used by * the scheduler. */ initMutex(&sched_mutex); + initMutex(&sync_finished_mutex); + initCondition(&sync_finished_cond); #endif ACQUIRE_LOCK(&sched_mutex); @@ -2662,9 +2717,10 @@ exitScheduler (bool wait_foreign USED_IF_THREADS) // If we haven't killed all the threads yet, do it now. if (sched_state < SCHED_SHUTTING_DOWN) { sched_state = SCHED_INTERRUPTING; + nonmovingStop(); Capability *cap = task->cap; waitForCapability(&cap,task); - scheduleDoGC(&cap,task,true); + scheduleDoGC(&cap,task,true,false); ASSERT(task->incall->tso == NULL); releaseCapability(cap); } @@ -2732,7 +2788,7 @@ performGC_(bool force_major) // TODO: do we need to traceTask*() here? 
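A toy sketch of the generalised releaseAllCapabilities shown above: the loop now compares against keep_cap rather than the held capability's index, so a NULL keep_cap (a caller that owns no capability) releases every capability. Types and the release helper are stand-ins.

typedef struct { int no; } Capability;
typedef struct { Capability *cap; } Task;

static void releaseCapability_sketch(Capability *c) { (void)c; }  /* stands in for releaseCapability */

static void release_all(Capability **caps, unsigned n, Capability *keep_cap, Task *task)
{
    for (unsigned i = 0; i < n; i++) {
        Capability *c = caps[i];
        if (c != keep_cap) {            /* works even when keep_cap is NULL */
            task->cap = c;
            releaseCapability_sketch(c);
        }
    }
    task->cap = keep_cap;               /* NULL when the caller owned no capability */
}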
waitForCapability(&cap,task); - scheduleDoGC(&cap,task,force_major); + scheduleDoGC(&cap,task,force_major,false); releaseCapability(cap); boundTaskExiting(task); } diff --git a/rts/Schedule.h b/rts/Schedule.h index 31979800411eb7f0c02d28d4b9b5ea1422e09218..6434515604e22be28be314850fc51d734fb90c5b 100644 --- a/rts/Schedule.h +++ b/rts/Schedule.h @@ -52,6 +52,12 @@ StgWord findAtomicallyFrameHelper (Capability *cap, StgTSO *tso); /* Entry point for a new worker */ void scheduleWorker (Capability *cap, Task *task); +#if defined(THREADED_RTS) +void stopAllCapabilitiesWith (Capability **pCap, Task *task, SyncType sync_type); +void stopAllCapabilities (Capability **pCap, Task *task); +void releaseAllCapabilities(uint32_t n, Capability *keep_cap, Task *task); +#endif + /* The state of the scheduler. This is used to control the sequence * of events during shutdown. See Note [shutdown] in Schedule.c. */ diff --git a/rts/StableName.c b/rts/StableName.c index 757eb59180e72cebc1030e24725fbf72a4eae5ab..4b26fee396d31a1e8ff9ecf05fc28721ec906d9f 100644 --- a/rts/StableName.c +++ b/rts/StableName.c @@ -21,7 +21,7 @@ snEntry *stable_name_table = NULL; static snEntry *stable_name_free = NULL; -static unsigned int SNT_size = 0; +unsigned int SNT_size = 0; #define INIT_SNT_SIZE 64 #if defined(THREADED_RTS) @@ -128,7 +128,7 @@ exitStableNameTable(void) #endif } -STATIC_INLINE void +void freeSnEntry(snEntry *sn) { ASSERT(sn->sn_obj == NULL); @@ -218,27 +218,6 @@ lookupStableName (StgPtr p) * Remember old stable name addresses * -------------------------------------------------------------------------- */ -#define FOR_EACH_STABLE_NAME(p, CODE) \ - do { \ - snEntry *p; \ - snEntry *__end_ptr = &stable_name_table[SNT_size]; \ - for (p = stable_name_table + 1; p < __end_ptr; p++) { \ - /* Internal pointers are free slots. */ \ - /* If p->addr == NULL, it's a */ \ - /* stable name where the object has been GC'd, but the */ \ - /* StableName object (sn_obj) is still alive. */ \ - if ((p->addr < (P_)stable_name_table || \ - p->addr >= (P_)__end_ptr)) \ - { \ - /* NOTE: There is an ambiguity here if p->addr == NULL */ \ - /* it is either the last item in the free list or it */ \ - /* is a stable name whose pointee died. sn_obj == NULL */ \ - /* disambiguates as last free list item. */ \ - do { CODE } while(0); \ - } \ - } \ - } while(0) - void rememberOldStableNameAddresses(void) { @@ -284,6 +263,9 @@ threadStableNameTable( evac_fn evac, void *user ) void gcStableNameTable( void ) { + // We must take the stable name lock lest we race with the nonmoving + // collector (namely nonmovingSweepStableNameTable). 
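The locking added to gcStableNameTable here can be pictured as a lock-bracketed traversal of a shared table: both the moving GC and the concurrent sweeper walk the same entries, so the walks must be serialised. A pthreads sketch with illustrative types (entry 0 is skipped, mirroring FOR_EACH_STABLE_NAME):

#include <pthread.h>

typedef struct { void *addr; void *sn_obj; } Entry;

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static Entry table[64];
static unsigned table_size = 64;

static void gc_table(void (*visit)(Entry *))
{
    pthread_mutex_lock(&table_lock);        /* cf. stableNameLock() */
    for (unsigned i = 1; i < table_size; i++)
        visit(&table[i]);
    pthread_mutex_unlock(&table_lock);      /* cf. stableNameUnlock() */
}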
+ stableNameLock(); FOR_EACH_STABLE_NAME( p, { // FOR_EACH_STABLE_NAME traverses free entries too, so @@ -307,6 +289,7 @@ gcStableNameTable( void ) } } }); + stableNameUnlock(); } /* ----------------------------------------------------------------------------- diff --git a/rts/StableName.h b/rts/StableName.h index 6b5e551add3cd20d6947b60504aa9f6d7dbb88c6..e5903bb3b5e87a803a6dc4d599b4221eb5fb7f59 100644 --- a/rts/StableName.h +++ b/rts/StableName.h @@ -11,7 +11,8 @@ #include "BeginPrivate.h" void initStableNameTable ( void ); -void exitStableNameTable ( void ); +void freeSnEntry ( snEntry *sn ); +void exitStableNameTable ( void ); StgWord lookupStableName ( StgPtr p ); void rememberOldStableNameAddresses ( void ); @@ -23,6 +24,29 @@ void updateStableNameTable ( bool full ); void stableNameLock ( void ); void stableNameUnlock ( void ); +extern unsigned int SNT_size; + +#define FOR_EACH_STABLE_NAME(p, CODE) \ + do { \ + snEntry *p; \ + snEntry *__end_ptr = &stable_name_table[SNT_size]; \ + for (p = stable_name_table + 1; p < __end_ptr; p++) { \ + /* Internal pointers are free slots. */ \ + /* If p->addr == NULL, it's a */ \ + /* stable name where the object has been GC'd, but the */ \ + /* StableName object (sn_obj) is still alive. */ \ + if ((p->addr < (P_)stable_name_table || \ + p->addr >= (P_)__end_ptr)) \ + { \ + /* NOTE: There is an ambiguity here if p->addr == NULL */ \ + /* it is either the last item in the free list or it */ \ + /* is a stable name whose pointee died. sn_obj == NULL */ \ + /* disambiguates as last free list item. */ \ + do { CODE } while(0); \ + } \ + } \ + } while(0) + #if defined(THREADED_RTS) // needed by Schedule.c:forkProcess() extern Mutex stable_name_mutex; diff --git a/rts/ThreadPaused.c b/rts/ThreadPaused.c index cccc7ad0b0a241be52eba7f8fca97163b80a5e41..83c621e386baf72090f5a196bbd9ee4a1756a77c 100644 --- a/rts/ThreadPaused.c +++ b/rts/ThreadPaused.c @@ -15,6 +15,7 @@ #include "RaiseAsync.h" #include "Trace.h" #include "Threads.h" +#include "sm/NonMovingMark.h" #include <string.h> // for memmove() @@ -243,6 +244,9 @@ threadPaused(Capability *cap, StgTSO *tso) bh = ((StgUpdateFrame *)frame)->updatee; bh_info = bh->header.info; + IF_NONMOVING_WRITE_BARRIER_ENABLED { + updateRemembSetPushClosure(cap, (StgClosure *) bh); + } #if defined(THREADED_RTS) retry: @@ -334,6 +338,18 @@ threadPaused(Capability *cap, StgTSO *tso) } #endif + IF_NONMOVING_WRITE_BARRIER_ENABLED { + if (ip_THUNK(INFO_PTR_TO_STRUCT(bh_info))) { + // We are about to replace a thunk with a blackhole. + // Add the free variables of the closure we are about to + // overwrite to the update remembered set. + // N.B. We caught the WHITEHOLE case above. 
+ updateRemembSetPushThunkEager(cap, + THUNK_INFO_PTR_TO_STRUCT(bh_info), + (StgThunk *) bh); + } + } + // The payload of the BLACKHOLE points to the TSO ((StgInd *)bh)->indirectee = (StgClosure *)tso; write_barrier(); diff --git a/rts/Threads.c b/rts/Threads.c index 2bdcea1c0056c0f96597b1d8d1abdce339321d1a..8334c5a5acc1e5d54366aa09aa8ce67f28524a35 100644 --- a/rts/Threads.c +++ b/rts/Threads.c @@ -85,7 +85,8 @@ createThread(Capability *cap, W_ size) SET_HDR(stack, &stg_STACK_info, cap->r.rCCCS); stack->stack_size = stack_size - sizeofW(StgStack); stack->sp = stack->stack + stack->stack_size; - stack->dirty = 1; + stack->dirty = STACK_DIRTY; + stack->marking = 0; tso = (StgTSO *)allocate(cap, sizeofW(StgTSO)); TICK_ALLOC_TSO(); @@ -611,6 +612,7 @@ threadStackOverflow (Capability *cap, StgTSO *tso) TICK_ALLOC_STACK(chunk_size); new_stack->dirty = 0; // begin clean, we'll mark it dirty below + new_stack->marking = 0; new_stack->stack_size = chunk_size - sizeofW(StgStack); new_stack->sp = new_stack->stack + new_stack->stack_size; @@ -721,9 +723,17 @@ threadStackUnderflow (Capability *cap, StgTSO *tso) barf("threadStackUnderflow: not enough space for return values"); } - new_stack->sp -= retvals; + IF_NONMOVING_WRITE_BARRIER_ENABLED { + // ensure that values that we copy into the new stack are marked + // for the nonmoving collector. Note that these values won't + // necessarily form a full closure so we need to handle them + // specially. + for (unsigned int i = 0; i < retvals; i++) { + updateRemembSetPushClosure(cap, (StgClosure *) old_stack->sp[i]); + } + } - memcpy(/* dest */ new_stack->sp, + memcpy(/* dest */ new_stack->sp - retvals, /* src */ old_stack->sp, /* size */ retvals * sizeof(W_)); } @@ -735,8 +745,12 @@ threadStackUnderflow (Capability *cap, StgTSO *tso) // restore the stack parameters, and update tot_stack_size tso->tot_stack_size -= old_stack->stack_size; - // we're about to run it, better mark it dirty + // we're about to run it, better mark it dirty. + // + // N.B. the nonmoving collector may mark the stack, meaning that sp must + // point at a valid stack frame. dirty_STACK(cap, new_stack); + new_stack->sp -= retvals; return retvals; } @@ -768,7 +782,7 @@ loop: if (q == (StgMVarTSOQueue*)&stg_END_TSO_QUEUE_closure) { /* No further takes, the MVar is now full. */ if (info == &stg_MVAR_CLEAN_info) { - dirty_MVAR(&cap->r, (StgClosure*)mvar); + dirty_MVAR(&cap->r, (StgClosure*)mvar, mvar->value); } mvar->value = value; @@ -804,7 +818,7 @@ loop: // indicate that the MVar operation has now completed. 
tso->_link = (StgTSO*)&stg_END_TSO_QUEUE_closure; - if (stack->dirty == 0) { + if ((stack->dirty & STACK_DIRTY) == 0) { dirty_STACK(cap, stack); } diff --git a/rts/Trace.c b/rts/Trace.c index c8a951a51056ded6e4102ce248b35b12a96dd3c7..8e44716eb0e603abdc5f5f1e1afc571815700eaf 100644 --- a/rts/Trace.c +++ b/rts/Trace.c @@ -30,6 +30,7 @@ // events int TRACE_sched; int TRACE_gc; +int TRACE_nonmoving_gc; int TRACE_spark_sampled; int TRACE_spark_full; int TRACE_user; @@ -72,6 +73,9 @@ void initTracing (void) RtsFlags.GcFlags.giveStats = COLLECT_GC_STATS; } + TRACE_nonmoving_gc = + RtsFlags.TraceFlags.nonmoving_gc; + TRACE_spark_sampled = RtsFlags.TraceFlags.sparks_sampled; @@ -818,6 +822,55 @@ void traceThreadLabel_(Capability *cap, } } +void traceConcMarkBegin() +{ + if (eventlog_enabled) + postEventNoCap(EVENT_CONC_MARK_BEGIN); +} + +void traceConcMarkEnd(StgWord32 marked_obj_count) +{ + if (eventlog_enabled) + postConcMarkEnd(marked_obj_count); +} + +void traceConcSyncBegin() +{ + if (eventlog_enabled) + postEventNoCap(EVENT_CONC_SYNC_BEGIN); +} + +void traceConcSyncEnd() +{ + if (eventlog_enabled) + postEventNoCap(EVENT_CONC_SYNC_END); +} + +void traceConcSweepBegin() +{ + if (eventlog_enabled) + postEventNoCap(EVENT_CONC_SWEEP_BEGIN); +} + +void traceConcSweepEnd() +{ + if (eventlog_enabled) + postEventNoCap(EVENT_CONC_SWEEP_END); +} + +void traceConcUpdRemSetFlush(Capability *cap) +{ + if (eventlog_enabled) + postConcUpdRemSetFlush(cap); +} + +void traceNonmovingHeapCensus(uint32_t log_blk_size, + const struct NonmovingAllocCensus *census) +{ + if (eventlog_enabled && TRACE_nonmoving_gc) + postNonmovingHeapCensus(log_blk_size, census); +} + void traceThreadStatus_ (StgTSO *tso USED_IF_DEBUG) { #if defined(DEBUG) diff --git a/rts/Trace.h b/rts/Trace.h index b7db0ca912f0696eeb158331a4dd502b3ff6d201..ec25a09d7b14eac8b39ce053f86e9b348ea5da75 100644 --- a/rts/Trace.h +++ b/rts/Trace.h @@ -9,6 +9,7 @@ #pragma once #include "rts/EventLogFormat.h" +#include "sm/NonMovingCensus.h" #include "Capability.h" #if defined(DTRACE) @@ -50,6 +51,7 @@ enum CapsetType { CapsetTypeCustom = CAPSET_TYPE_CUSTOM, #define DEBUG_weak RtsFlags.DebugFlags.weak #define DEBUG_gccafs RtsFlags.DebugFlags.gccafs #define DEBUG_gc RtsFlags.DebugFlags.gc +#define DEBUG_nonmoving_gc RtsFlags.DebugFlags.nonmoving_gc #define DEBUG_block_alloc RtsFlags.DebugFlags.alloc #define DEBUG_sanity RtsFlags.DebugFlags.sanity #define DEBUG_zero_on_gc RtsFlags.DebugFlags.zero_on_gc @@ -71,6 +73,7 @@ extern int TRACE_spark_sampled; extern int TRACE_spark_full; /* extern int TRACE_user; */ // only used in Trace.c extern int TRACE_cap; +extern int TRACE_nonmoving_gc; // ----------------------------------------------------------------------------- // Posting events @@ -307,6 +310,16 @@ void traceProfSampleCostCentre(Capability *cap, void traceProfBegin(void); #endif /* PROFILING */ +void traceConcMarkBegin(void); +void traceConcMarkEnd(StgWord32 marked_obj_count); +void traceConcSyncBegin(void); +void traceConcSyncEnd(void); +void traceConcSweepBegin(void); +void traceConcSweepEnd(void); +void traceConcUpdRemSetFlush(Capability *cap); +void traceNonmovingHeapCensus(uint32_t log_blk_size, + const struct NonmovingAllocCensus *census); + void flushTrace(void); #else /* !TRACING */ @@ -347,6 +360,15 @@ void flushTrace(void); #define traceHeapProfSampleCostCentre(profile_id, stack, residency) /* nothing */ #define traceHeapProfSampleString(profile_id, label, residency) /* nothing */ +#define traceConcMarkBegin() /* nothing */ +#define 
traceConcMarkEnd(marked_obj_count) /* nothing */ +#define traceConcSyncBegin() /* nothing */ +#define traceConcSyncEnd() /* nothing */ +#define traceConcSweepBegin() /* nothing */ +#define traceConcSweepEnd() /* nothing */ +#define traceConcUpdRemSetFlush(cap) /* nothing */ +#define traceNonmovingHeapCensus(blk_size, census) /* nothing */ + #define flushTrace() /* nothing */ #endif /* TRACING */ diff --git a/rts/Updates.h b/rts/Updates.h index 1bd3e065afbb11348f03d805b0857f71526f36ce..91d1b0b1cbd683075eeae9fcab23808ee60f7104 100644 --- a/rts/Updates.h +++ b/rts/Updates.h @@ -50,6 +50,9 @@ \ prim_write_barrier; \ OVERWRITING_CLOSURE(p1); \ + IF_NONMOVING_WRITE_BARRIER_ENABLED { \ + ccall updateRemembSetPushThunk_(BaseReg, p1 "ptr"); \ + } \ StgInd_indirectee(p1) = p2; \ prim_write_barrier; \ SET_INFO(p1, stg_BLACKHOLE_info); \ @@ -62,7 +65,7 @@ } else { \ TICK_UPD_NEW_IND(); \ and_then; \ - } + } #else /* !CMINUSMINUS */ @@ -78,6 +81,9 @@ INLINE_HEADER void updateWithIndirection (Capability *cap, /* See Note [Heap memory barriers] in SMP.h */ write_barrier(); OVERWRITING_CLOSURE(p1); + IF_NONMOVING_WRITE_BARRIER_ENABLED { + updateRemembSetPushThunk(cap, (StgThunk*)p1); + } ((StgInd *)p1)->indirectee = p2; write_barrier(); SET_INFO(p1, &stg_BLACKHOLE_info); diff --git a/rts/Weak.c b/rts/Weak.c index ec998c214f46e9194ea02fa11d7cf183392b61e3..fe4516794ab5fff11fcb69695fdb7c3bc994bd69 100644 --- a/rts/Weak.c +++ b/rts/Weak.c @@ -93,9 +93,19 @@ scheduleFinalizers(Capability *cap, StgWeak *list) StgWord size; uint32_t n, i; - ASSERT(n_finalizers == 0); - - finalizer_list = list; + // This assertion does not hold with non-moving collection because + // non-moving collector does not wait for the list to be consumed (by + // doIdleGcWork()) before appending the list with more finalizers. + ASSERT(RtsFlags.GcFlags.useNonmoving || n_finalizers == 0); + + // Append finalizer_list with the new list. TODO: Perhaps cache tail of the + // list for faster append. NOTE: We can't append `list` here! Otherwise we + // end up traversing already visited weaks in the loops below. + StgWeak **tl = &finalizer_list; + while (*tl) { + tl = &(*tl)->link; + } + *tl = list; // Traverse the list and // * count the number of Haskell finalizers @@ -130,7 +140,7 @@ scheduleFinalizers(Capability *cap, StgWeak *list) SET_HDR(w, &stg_DEAD_WEAK_info, w->header.prof.ccs); } - n_finalizers = i; + n_finalizers += i; // No Haskell finalizers to run? if (n == 0) return; diff --git a/rts/eventlog/EventLog.c b/rts/eventlog/EventLog.c index 6d7b4871521fe8e5a7f85817d89fd22a4fe33f71..5f22af5bfc33c81a70d5096161a6cfe9f880418a 100644 --- a/rts/eventlog/EventLog.c +++ b/rts/eventlog/EventLog.c @@ -109,7 +109,15 @@ char *EventDesc[] = { [EVENT_HEAP_PROF_SAMPLE_COST_CENTRE] = "Heap profile cost-centre sample", [EVENT_PROF_SAMPLE_COST_CENTRE] = "Time profile cost-centre stack", [EVENT_PROF_BEGIN] = "Start of a time profile", - [EVENT_USER_BINARY_MSG] = "User binary message" + [EVENT_USER_BINARY_MSG] = "User binary message", + [EVENT_CONC_MARK_BEGIN] = "Begin concurrent mark phase", + [EVENT_CONC_MARK_END] = "End concurrent mark phase", + [EVENT_CONC_SYNC_BEGIN] = "Begin concurrent GC synchronisation", + [EVENT_CONC_SYNC_END] = "End concurrent GC synchronisation", + [EVENT_CONC_SWEEP_BEGIN] = "Begin concurrent sweep", + [EVENT_CONC_SWEEP_END] = "End concurrent sweep", + [EVENT_CONC_UPD_REM_SET_FLUSH] = "Update remembered set flushed", + [EVENT_NONMOVING_HEAP_CENSUS] = "Nonmoving heap census" }; // Event type. 
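The scheduleFinalizers change above appends the incoming weak-pointer list using a classic pointer-to-pointer walk, which needs no special case for an empty list. A self-contained illustration with toy nodes:

#include <stdio.h>

typedef struct Node { int v; struct Node *link; } Node;

static void append_chain(Node **head, Node *chain)
{
    Node **tl = head;
    while (*tl)
        tl = &(*tl)->link;     /* tl ends up pointing at the terminating NULL link */
    *tl = chain;
}

int main(void)
{
    Node c = {3, NULL}, b = {2, NULL}, a = {1, &b};
    Node *head = NULL;
    append_chain(&head, &a);   /* empty list: head itself is updated */
    append_chain(&head, &c);   /* non-empty: spliced after the last node */
    for (Node *p = head; p; p = p->link)
        printf("%d ", p->v);   /* prints: 1 2 3 */
    printf("\n");
    return 0;
}

Caching the tail, as the TODO above suggests, would make repeated appends O(1) rather than O(length).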
@@ -456,6 +464,27 @@ init_event_types(void) eventTypes[t].size = EVENT_SIZE_DYNAMIC; break; + case EVENT_CONC_MARK_BEGIN: + case EVENT_CONC_SYNC_BEGIN: + case EVENT_CONC_SYNC_END: + case EVENT_CONC_SWEEP_BEGIN: + case EVENT_CONC_SWEEP_END: + eventTypes[t].size = 0; + break; + + case EVENT_CONC_MARK_END: + eventTypes[t].size = 4; + break; + + case EVENT_CONC_UPD_REM_SET_FLUSH: // (cap) + eventTypes[t].size = + sizeof(EventCapNo); + break; + + case EVENT_NONMOVING_HEAP_CENSUS: // (cap, blk_size, active_segs, filled_segs, live_blks) + eventTypes[t].size = 13; + break; + default: continue; /* ignore deprecated events */ } @@ -497,8 +526,10 @@ initEventLogging(const EventLogWriter *ev_writer) event_log_writer = ev_writer; initEventLogWriter(); - if (sizeof(EventDesc) / sizeof(char*) != NUM_GHC_EVENT_TAGS) { - barf("EventDesc array has the wrong number of elements"); + int num_descs = sizeof(EventDesc) / sizeof(char*); + if (num_descs != NUM_GHC_EVENT_TAGS) { + barf("EventDesc array has the wrong number of elements (%d, NUM_GHC_EVENT_TAGS=%d)", + num_descs, NUM_GHC_EVENT_TAGS); } /* @@ -1014,6 +1045,15 @@ void postTaskDeleteEvent (EventTaskId taskId) RELEASE_LOCK(&eventBufMutex); } +void +postEventNoCap (EventTypeNum tag) +{ + ACQUIRE_LOCK(&eventBufMutex); + ensureRoomForEvent(&eventBuf, tag); + postEventHeader(&eventBuf, tag); + RELEASE_LOCK(&eventBufMutex); +} + void postEvent (Capability *cap, EventTypeNum tag) { @@ -1140,6 +1180,35 @@ void postThreadLabel(Capability *cap, postBuf(eb, (StgWord8*) label, strsize); } +void postConcUpdRemSetFlush(Capability *cap) +{ + EventsBuf *eb = &capEventBuf[cap->no]; + ensureRoomForEvent(eb, EVENT_CONC_UPD_REM_SET_FLUSH); + postEventHeader(eb, EVENT_CONC_UPD_REM_SET_FLUSH); + postCapNo(eb, cap->no); +} + +void postConcMarkEnd(StgWord32 marked_obj_count) +{ + ACQUIRE_LOCK(&eventBufMutex); + ensureRoomForEvent(&eventBuf, EVENT_CONC_MARK_END); + postEventHeader(&eventBuf, EVENT_CONC_MARK_END); + postWord32(&eventBuf, marked_obj_count); + RELEASE_LOCK(&eventBufMutex); +} + +void postNonmovingHeapCensus(int log_blk_size, + const struct NonmovingAllocCensus *census) +{ + ACQUIRE_LOCK(&eventBufMutex); + postEventHeader(&eventBuf, EVENT_NONMOVING_HEAP_CENSUS); + postWord8(&eventBuf, log_blk_size); + postWord32(&eventBuf, census->n_active_segs); + postWord32(&eventBuf, census->n_filled_segs); + postWord32(&eventBuf, census->n_live_blocks); + RELEASE_LOCK(&eventBufMutex); +} + void closeBlockMarker (EventsBuf *ebuf) { if (ebuf->marker) diff --git a/rts/eventlog/EventLog.h b/rts/eventlog/EventLog.h index ec9a5f34e391553110eb86e36a2d8bbf806fa7c7..5bd3b5dadb40cd8245310de1589a9ce7042c0a14 100644 --- a/rts/eventlog/EventLog.h +++ b/rts/eventlog/EventLog.h @@ -11,6 +11,7 @@ #include "rts/EventLogFormat.h" #include "rts/EventLogWriter.h" #include "Capability.h" +#include "sm/NonMovingCensus.h" #include "BeginPrivate.h" @@ -39,6 +40,7 @@ void postSchedEvent(Capability *cap, EventTypeNum tag, * Post a nullary event. 
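For reference, the fixed size of 13 assigned to EVENT_NONMOVING_HEAP_CENSUS above is the sum of the payload fields written by postNonmovingHeapCensus: one 8-bit block-size exponent plus three 32-bit counters. A compile-time check of that arithmetic (a sketch, not part of the patch):

#include <stdint.h>

enum { NONMOVING_CENSUS_PAYLOAD =
           sizeof(uint8_t)        /* log2 of block size */
         + sizeof(uint32_t)       /* n_active_segs      */
         + sizeof(uint32_t)       /* n_filled_segs      */
         + sizeof(uint32_t) };    /* n_live_blocks      */

_Static_assert(NONMOVING_CENSUS_PAYLOAD == 13,
               "matches eventTypes[t].size set for EVENT_NONMOVING_HEAP_CENSUS");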
*/ void postEvent(Capability *cap, EventTypeNum tag); +void postEventNoCap(EventTypeNum tag); void postEventAtTimestamp (Capability *cap, EventTimestamp ts, EventTypeNum tag); @@ -164,6 +166,11 @@ void postProfSampleCostCentre(Capability *cap, void postProfBegin(void); #endif /* PROFILING */ +void postConcUpdRemSetFlush(Capability *cap); +void postConcMarkEnd(StgWord32 marked_obj_count); +void postNonmovingHeapCensus(int log_blk_size, + const struct NonmovingAllocCensus *census); + #else /* !TRACING */ INLINE_HEADER void postSchedEvent (Capability *cap STG_UNUSED, @@ -177,6 +184,9 @@ INLINE_HEADER void postEvent (Capability *cap STG_UNUSED, EventTypeNum tag STG_UNUSED) { /* nothing */ } +INLINE_HEADER void postEventNoCap (EventTypeNum tag STG_UNUSED) +{ /* nothing */ } + INLINE_HEADER void postMsg (char *msg STG_UNUSED, va_list ap STG_UNUSED) { /* nothing */ } diff --git a/rts/ghc.mk b/rts/ghc.mk index dca22fb7338796a5152f55b25fa74dae074525cc..59d5994147aece51cf89c0163a285758729620be 100644 --- a/rts/ghc.mk +++ b/rts/ghc.mk @@ -340,6 +340,8 @@ WARNING_OPTS += -Wredundant-decls ifeq "$(GccLT46)" "NO" WARNING_OPTS += -Wundef endif +# Some gccs annoyingly enable this archaic specimen by default +WARNING_OPTS += -Wno-aggregate-return # These ones are hard to avoid: #WARNING_OPTS += -Wconversion diff --git a/rts/rts.cabal.in b/rts/rts.cabal.in index 30c829ad42ab33f54d219b4323dce13754a71eb0..4b5d837c3a78072ffa227af4a0fd248bd65923f7 100644 --- a/rts/rts.cabal.in +++ b/rts/rts.cabal.in @@ -139,6 +139,7 @@ library rts/Linker.h rts/Main.h rts/Messages.h + rts/NonMoving.h rts/OSThreads.h rts/Parallel.h rts/PrimFloat.h @@ -465,6 +466,12 @@ library sm/GCUtils.c sm/MBlock.c sm/MarkWeak.c + sm/NonMoving.c + sm/NonMovingCensus.c + sm/NonMovingMark.c + sm/NonMovingScav.c + sm/NonMovingShortcut.c + sm/NonMovingSweep.c sm/Sanity.c sm/Scav.c sm/Scav_thr.c diff --git a/rts/sm/BlockAlloc.c b/rts/sm/BlockAlloc.c index f9e3d11407bd79f4607c28f198e0c92186c9e505..b3e1e2ce75121d043c4d283fbe86b3b48e51f470 100644 --- a/rts/sm/BlockAlloc.c +++ b/rts/sm/BlockAlloc.c @@ -310,7 +310,7 @@ setup_tail (bdescr *bd) // Take a free block group bd, and split off a group of size n from // it. Adjust the free list as necessary, and return the new group. static bdescr * -split_free_block (bdescr *bd, uint32_t node, W_ n, uint32_t ln) +split_free_block (bdescr *bd, uint32_t node, W_ n, uint32_t ln /* log_2_ceil(n) */) { bdescr *fg; // free group @@ -325,6 +325,46 @@ split_free_block (bdescr *bd, uint32_t node, W_ n, uint32_t ln) return fg; } +// Take N blocks off the end, free the rest. +static bdescr * +split_block_high (bdescr *bd, W_ n) +{ + ASSERT(bd->blocks > n); + + bdescr* ret = bd + bd->blocks - n; // take n blocks off the end + ret->blocks = n; + ret->start = ret->free = bd->start + (bd->blocks - n)*BLOCK_SIZE_W; + ret->link = NULL; + + bd->blocks -= n; + + setup_tail(ret); + setup_tail(bd); + freeGroup(bd); + + return ret; +} + +// Like `split_block_high`, but takes n blocks off the beginning rather +// than the end. +static bdescr * +split_block_low (bdescr *bd, W_ n) +{ + ASSERT(bd->blocks > n); + + bdescr* bd_ = bd + n; + bd_->blocks = bd->blocks - n; + bd_->start = bd_->free = bd->start + n*BLOCK_SIZE_W; + + bd->blocks = n; + + setup_tail(bd_); + setup_tail(bd); + freeGroup(bd_); + + return bd; +} + /* Only initializes the start pointers on the first megablock and the * blocks field of the first bdescr; callers are responsible for calling * initGroup afterwards. 
@@ -461,6 +501,108 @@ finish: return bd; } +// Allocate `n` blocks aligned to `n` blocks, e.g. when n = 8, the blocks will +// be aligned at `8 * BLOCK_SIZE`. For a group with `n` blocks this can be used +// for easily accessing the beginning of the group from a location p in the +// group with +// +// p % (BLOCK_SIZE*n) +// +// Used by the non-moving collector for allocating segments. +// +// Because the storage manager does not support aligned allocations, we have to +// allocate `2*n - 1` blocks here to make sure we'll be able to find an aligned +// region in the allocated blocks. After finding the aligned area we want to +// free slop on the low and high sides, and block allocator doesn't support +// freeing only some portion of a megablock (we can only free whole megablocks). +// So we disallow allocating megablocks here, and allow allocating at most +// `BLOCKS_PER_MBLOCK / 2` blocks. +bdescr * +allocAlignedGroupOnNode (uint32_t node, W_ n) +{ + // allocate enough blocks to have enough space aligned at n-block boundary + // free any slops on the low and high side of this space + + // number of blocks to allocate to make sure we have enough aligned space + W_ num_blocks = 2*n - 1; + + if (num_blocks >= BLOCKS_PER_MBLOCK) { + barf("allocAlignedGroupOnNode: allocating megablocks is not supported\n" + " requested blocks: %" FMT_Word "\n" + " required for alignment: %" FMT_Word "\n" + " megablock size (in blocks): %" FMT_Word, + n, num_blocks, (W_) BLOCKS_PER_MBLOCK); + } + + W_ group_size = n * BLOCK_SIZE; + + // To reduce splitting and fragmentation we use allocLargeChunkOnNode here. + // Tweak the max allocation to avoid allocating megablocks. Splitting slop + // below doesn't work with megablocks (freeGroup can't free only a portion + // of a megablock so we can't allocate megablocks and free some parts of + // them). 
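The slop computation that follows in allocAlignedGroupOnNode is easier to see with concrete numbers. Below is a standalone sketch of the same arithmetic with an illustrative block size and a made-up start address; the real code goes on to free the slop with split_block_high/split_block_low rather than merely computing it.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define BLOCK_SIZE 4096u   /* illustrative */

int main(void)
{
    uintptr_t n          = 8;                /* blocks per aligned group */
    uintptr_t group_size = n * BLOCK_SIZE;
    uintptr_t num_blocks = 2*n - 1;          /* enough to contain an aligned n-block run */
    uintptr_t start      = 0x12345000;       /* pretend block-aligned allocation start */

    uintptr_t slop_low = 0;
    if (start % group_size != 0)
        slop_low = group_size - (start % group_size);
    uintptr_t slop_high = num_blocks * BLOCK_SIZE - group_size - slop_low;

    uintptr_t aligned = start + slop_low;
    assert(aligned % group_size == 0);
    assert(slop_low % BLOCK_SIZE == 0 && slop_high % BLOCK_SIZE == 0);
    assert(slop_low / BLOCK_SIZE + slop_high / BLOCK_SIZE + n == num_blocks);

    /* An interior pointer p recovers the group start by rounding down,
     * cf. the "p % (BLOCK_SIZE*n)" remark above. */
    uintptr_t p = aligned + 5 * BLOCK_SIZE + 123;
    assert(p - (p % group_size) == aligned);

    printf("slop_low=%lu blocks, slop_high=%lu blocks\n",
           (unsigned long)(slop_low / BLOCK_SIZE),
           (unsigned long)(slop_high / BLOCK_SIZE));
    return 0;
}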
+ W_ max_blocks = stg_min(num_blocks * 3, BLOCKS_PER_MBLOCK - 1); + bdescr *bd = allocLargeChunkOnNode(node, num_blocks, max_blocks); + // We may allocate more than num_blocks, so update it + num_blocks = bd->blocks; + + // slop on the low side + W_ slop_low = 0; + if ((uintptr_t)bd->start % group_size != 0) { + slop_low = group_size - ((uintptr_t)bd->start % group_size); + } + + W_ slop_high = (num_blocks * BLOCK_SIZE) - group_size - slop_low; + + ASSERT((slop_low % BLOCK_SIZE) == 0); + ASSERT((slop_high % BLOCK_SIZE) == 0); + + W_ slop_low_blocks = slop_low / BLOCK_SIZE; + W_ slop_high_blocks = slop_high / BLOCK_SIZE; + + ASSERT(slop_low_blocks + slop_high_blocks + n == num_blocks); + +#if defined(DEBUG) + checkFreeListSanity(); + W_ free_before = countFreeList(); +#endif + + if (slop_low_blocks != 0) { + bd = split_block_high(bd, num_blocks - slop_low_blocks); + ASSERT(countBlocks(bd) == num_blocks - slop_low_blocks); + } + +#if defined(DEBUG) + ASSERT(countFreeList() == free_before + slop_low_blocks); + checkFreeListSanity(); +#endif + + // At this point the bd should be aligned, but we may have slop on the high side + ASSERT((uintptr_t)bd->start % group_size == 0); + +#if defined(DEBUG) + free_before = countFreeList(); +#endif + + if (slop_high_blocks != 0) { + bd = split_block_low(bd, n); + ASSERT(bd->blocks == n); + } + +#if defined(DEBUG) + ASSERT(countFreeList() == free_before + slop_high_blocks); + checkFreeListSanity(); +#endif + + // Should still be aligned + ASSERT((uintptr_t)bd->start % group_size == 0); + + // Just to make sure I get this right + ASSERT(Bdescr(bd->start) == bd); + + return bd; +} + STATIC_INLINE uint32_t nodeWithLeastBlocks (void) { diff --git a/rts/sm/CNF.c b/rts/sm/CNF.c index f85b3904145e1c24bf342f4b906bb328890437d9..87d1d84f50ad71395a7ba059bb4840e52469b452 100644 --- a/rts/sm/CNF.c +++ b/rts/sm/CNF.c @@ -276,7 +276,10 @@ compactFree(StgCompactNFData *str) for ( ; block; block = next) { next = block->next; bd = Bdescr((StgPtr)block); - ASSERT((bd->flags & BF_EVACUATED) == 0); + ASSERT(RtsFlags.GcFlags.useNonmoving || ((bd->flags & BF_EVACUATED) == 0)); + // When using the non-moving collector we leave compact object + // evacuated to the oldset gen as BF_EVACUATED to avoid evacuating + // objects in the non-moving heap. freeGroup(bd); } } diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c index 53a473d26cf8d86a7ea42b9749e4bbed3fed0edd..521fd4eef4540def56c91390f89a0240d5343eb3 100644 --- a/rts/sm/Evac.c +++ b/rts/sm/Evac.c @@ -27,6 +27,7 @@ #include "LdvProfile.h" #include "CNF.h" #include "Scav.h" +#include "NonMoving.h" #if defined(THREADED_RTS) && !defined(PARALLEL_GC) #define evacuate(p) evacuate1(p) @@ -39,7 +40,19 @@ copy_tag(p, info, src, size, stp, tag) #endif -/* Used to avoid long recursion due to selector thunks +/* Note [Selector optimisation depth limit] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * MAX_THUNK_SELECTOR_DEPTH is used to avoid long recursion of + * eval_thunk_selector due to nested selector thunks. Note that this *only* + * counts nested selector thunks, e.g. `fst (fst (... (fst x)))`. The collector + * will traverse interleaved selector-constructor pairs without limit, e.g. + * + * a = (fst b, _) + * b = (fst c, _) + * c = (fst d, _) + * d = (x, _) + * */ #define MAX_THUNK_SELECTOR_DEPTH 16 @@ -50,9 +63,12 @@ STATIC_INLINE void evacuate_large(StgPtr p); Allocate some space in which to copy an object. 
-------------------------------------------------------------------------- */ +/* size is in words */ STATIC_INLINE StgPtr alloc_for_copy (uint32_t size, uint32_t gen_no) { + ASSERT(gen_no < RtsFlags.GcFlags.generations); + StgPtr to; gen_workspace *ws; @@ -69,6 +85,36 @@ alloc_for_copy (uint32_t size, uint32_t gen_no) } } + if (RTS_UNLIKELY(RtsFlags.GcFlags.useNonmoving)) { + /* See Note [Deadlock detection under nonmoving collector]. */ + if (deadlock_detect_gc) + gen_no = oldest_gen->no; + + if (gen_no == oldest_gen->no) { + gct->copied += size; + to = nonmovingAllocate(gct->cap, size); + + // Add segment to the todo list unless it's already there + // current->todo_link == NULL means not in todo list + struct NonmovingSegment *seg = nonmovingGetSegment(to); + if (!seg->todo_link) { + gen_workspace *ws = &gct->gens[oldest_gen->no]; + seg->todo_link = ws->todo_seg; + ws->todo_seg = seg; + } + + // The object which refers to this closure may have been aged (i.e. + // retained in a younger generation). Consequently, we must add the + // closure to the mark queue to ensure that it will be marked. + // + // However, if we are in a deadlock detection GC then we disable aging + // so there is no need. + if (major_gc && !deadlock_detect_gc) + markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, (StgClosure *) to); + return to; + } + } + ws = &gct->gens[gen_no]; // zero memory references here /* chain a new block onto the to-space for the destination gen if @@ -88,6 +134,7 @@ alloc_for_copy (uint32_t size, uint32_t gen_no) The evacuate() code -------------------------------------------------------------------------- */ +/* size is in words */ STATIC_INLINE GNUC_ATTR_HOT void copy_tag(StgClosure **p, const StgInfoTable *info, StgClosure *src, uint32_t size, uint32_t gen_no, StgWord tag) @@ -284,7 +331,10 @@ evacuate_large(StgPtr p) */ new_gen_no = bd->dest_no; - if (new_gen_no < gct->evac_gen_no) { + if (RTS_UNLIKELY(deadlock_detect_gc)) { + /* See Note [Deadlock detection under nonmoving collector]. */ + new_gen_no = oldest_gen->no; + } else if (new_gen_no < gct->evac_gen_no) { if (gct->eager_promotion) { new_gen_no = gct->evac_gen_no; } else { @@ -296,6 +346,9 @@ evacuate_large(StgPtr p) new_gen = &generations[new_gen_no]; bd->flags |= BF_EVACUATED; + if (RTS_UNLIKELY(RtsFlags.GcFlags.useNonmoving && new_gen == oldest_gen)) { + bd->flags |= BF_NONMOVING; + } initBdescr(bd, new_gen, new_gen->to); // If this is a block of pinned or compact objects, we don't have to scan @@ -330,6 +383,13 @@ evacuate_large(StgPtr p) STATIC_INLINE void evacuate_static_object (StgClosure **link_field, StgClosure *q) { + if (RTS_UNLIKELY(RtsFlags.GcFlags.useNonmoving)) { + // See Note [Static objects under the nonmoving collector] in Storage.c. + if (major_gc && !deadlock_detect_gc) + markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q); + return; + } + StgWord link = (StgWord)*link_field; // See Note [STATIC_LINK fields] for how the link field bits work @@ -376,12 +436,22 @@ evacuate_compact (StgPtr p) bd = Bdescr((StgPtr)str); gen_no = bd->gen_no; + if (bd->flags & BF_NONMOVING) { + // We may have evacuated the block to the nonmoving generation. If so + // we need to make sure it is added to the mark queue since the only + // reference to it may be from the moving heap. + if (major_gc && !deadlock_detect_gc) + markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, (StgClosure *) str); + return; + } + // already evacuated? 
(we're about to do the same check, // but we avoid taking the spin-lock) if (bd->flags & BF_EVACUATED) { /* Don't forget to set the gct->failed_to_evac flag if we didn't get * the desired destination (see comments in evacuate()). */ + debugTrace(DEBUG_compact, "Compact %p already evacuated", str); if (gen_no < gct->evac_gen_no) { gct->failed_to_evac = true; TICK_GC_FAILED_PROMOTION(); @@ -430,9 +500,15 @@ evacuate_compact (StgPtr p) // for that - the only code touching the generation of the block is // in the GC, and that should never see blocks other than the first) bd->flags |= BF_EVACUATED; + if (RTS_UNLIKELY(RtsFlags.GcFlags.useNonmoving && new_gen == oldest_gen)) { + bd->flags |= BF_NONMOVING; + } initBdescr(bd, new_gen, new_gen->to); if (str->hash) { + // If there is a hash-table for sharing preservation then we need to add + // the compact to the scavenging work list to ensure that the hashtable + // is scavenged. gen_workspace *ws = &gct->gens[new_gen_no]; bd->link = ws->todo_large_objects; ws->todo_large_objects = bd; @@ -563,7 +639,18 @@ loop: bd = Bdescr((P_)q); - if ((bd->flags & (BF_LARGE | BF_MARKED | BF_EVACUATED | BF_COMPACT)) != 0) { + if ((bd->flags & (BF_LARGE | BF_MARKED | BF_EVACUATED | BF_COMPACT | BF_NONMOVING)) != 0) { + // Pointer to non-moving heap. Non-moving heap is collected using + // mark-sweep so this object should be marked and then retained in sweep. + if (RTS_UNLIKELY(bd->flags & BF_NONMOVING)) { + // NOTE: large objects in nonmoving heap are also marked with + // BF_NONMOVING. Those are moved to scavenged_large_objects list in + // mark phase. + if (major_gc && !deadlock_detect_gc) + markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q); + return; + } + // pointer into to-space: just return it. It might be a pointer // into a generation that we aren't collecting (> N), or it // might just be a pointer into to-space. The latter doesn't @@ -594,6 +681,13 @@ loop: */ if (bd->flags & BF_LARGE) { evacuate_large((P_)q); + + // We may have evacuated the block to the nonmoving generation. If so + // we need to make sure it is added to the mark queue since the only + // reference to it may be from the moving heap. + if (major_gc && bd->flags & BF_NONMOVING && !deadlock_detect_gc) { + markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q); + } return; } @@ -894,6 +988,12 @@ evacuate_BLACKHOLE(StgClosure **p) // blackholes can't be in a compact ASSERT((bd->flags & BF_COMPACT) == 0); + if (RTS_UNLIKELY(bd->flags & BF_NONMOVING)) { + if (major_gc && !deadlock_detect_gc) + markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q); + return; + } + // blackholes *can* be in a large object: when raiseAsync() creates an // AP_STACK the payload might be large enough to create a large object. // See #14497. @@ -1044,7 +1144,7 @@ selector_chain: // save any space in any case, and updating with an indirection is // trickier in a non-collected gen: we would have to update the // mutable list. - if (bd->flags & BF_EVACUATED) { + if (bd->flags & (BF_EVACUATED | BF_NONMOVING)) { unchain_thunk_selectors(prev_thunk_selector, (StgClosure *)p); *q = (StgClosure *)p; // shortcut, behave as for: if (evac) evacuate(q); @@ -1257,6 +1357,7 @@ selector_loop: // recursively evaluate this selector. We don't want to // recurse indefinitely, so we impose a depth bound. + // See Note [Selector optimisation depth limit]. 
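A condensed sketch of the evacuate() cases added above for objects already resident in the nonmoving heap: they are never copied, and during a major, non-deadlock-detection collection they are instead pushed onto the mark queue so the concurrent marker will visit them. Types, flag values, and helper names here are illustrative stand-ins.

#include <stdbool.h>

#define BF_NONMOVING 0x400u                      /* illustrative bit, not the RTS value */

typedef struct { unsigned flags; } BlockDesc;

static void mark_queue_push(void *obj) { (void)obj; }  /* stands in for markQueuePushClosureGC */
static void copy_object(void **p)      { (void)p;   }  /* the ordinary copying path */

static void evac_sketch(void **p, BlockDesc *bd, bool major_gc, bool deadlock_detect_gc)
{
    if (bd->flags & BF_NONMOVING) {
        /* Never copy out of the nonmoving heap; just make sure the
         * concurrent marker will reach the object. */
        if (major_gc && !deadlock_detect_gc)
            mark_queue_push(*p);
        return;                                  /* the pointer is left unchanged */
    }
    copy_object(p);
}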
if (gct->thunk_selector_depth >= MAX_THUNK_SELECTOR_DEPTH) { goto bale_out; } diff --git a/rts/sm/GC.c b/rts/sm/GC.c index 76237f35c2d32ed8deaf31507e92d93fe5d286bc..83e9c97bd90ea87ea5cea41778d2b0d443422a5f 100644 --- a/rts/sm/GC.c +++ b/rts/sm/GC.c @@ -51,6 +51,7 @@ #include "CheckUnload.h" #include "CNF.h" #include "RtsFlags.h" +#include "NonMoving.h" #include <string.h> // for memset() #include <unistd.h> @@ -103,6 +104,7 @@ */ uint32_t N; bool major_gc; +bool deadlock_detect_gc; /* Data used for allocation area sizing. */ @@ -159,7 +161,6 @@ static void mark_root (void *user, StgClosure **root); static void prepare_collected_gen (generation *gen); static void prepare_uncollected_gen (generation *gen); static void init_gc_thread (gc_thread *t); -static void resize_generations (void); static void resize_nursery (void); static void start_gc_threads (void); static void scavenge_until_all_done (void); @@ -193,7 +194,8 @@ StgPtr mark_sp; // pointer to the next unallocated mark stack entry void GarbageCollect (uint32_t collect_gen, - bool do_heap_census, + const bool do_heap_census, + const bool deadlock_detect, uint32_t gc_type USED_IF_THREADS, Capability *cap, bool idle_cap[]) @@ -263,7 +265,25 @@ GarbageCollect (uint32_t collect_gen, N = collect_gen; major_gc = (N == RtsFlags.GcFlags.generations-1); - if (major_gc) { + /* See Note [Deadlock detection under nonmoving collector]. */ + deadlock_detect_gc = deadlock_detect; + +#if defined(THREADED_RTS) + if (major_gc && RtsFlags.GcFlags.useNonmoving && concurrent_coll_running) { + /* If there is already a concurrent major collection running then + * there is no benefit to starting another. + * TODO: Catch heap-size runaway. + */ + N--; + collect_gen--; + major_gc = false; + } +#endif + + /* N.B. The nonmoving collector works a bit differently. See + * Note [Static objects under the nonmoving collector]. + */ + if (major_gc && !RtsFlags.GcFlags.useNonmoving) { prev_static_flag = static_flag; static_flag = static_flag == STATIC_FLAG_A ? STATIC_FLAG_B : STATIC_FLAG_A; @@ -572,7 +592,7 @@ GarbageCollect (uint32_t collect_gen, gen = &generations[g]; // for generations we collected... - if (g <= N) { + if (g <= N && !(RtsFlags.GcFlags.useNonmoving && gen == oldest_gen)) { /* free old memory and shift to-space into from-space for all * the collected generations (except the allocation area). These @@ -710,8 +730,55 @@ GarbageCollect (uint32_t collect_gen, } } // for all generations - // update the max size of older generations after a major GC - resize_generations(); + // Flush the update remembered set. See Note [Eager update remembered set + // flushing] in NonMovingMark.c + if (RtsFlags.GcFlags.useNonmoving) { + RELEASE_SM_LOCK; + nonmovingAddUpdRemSetBlocks(&gct->cap->upd_rem_set.queue); + ACQUIRE_SM_LOCK; + } + + // Mark and sweep the oldest generation. + // N.B. This can only happen after we've moved + // oldest_gen->scavenged_large_objects back to oldest_gen->large_objects. + ASSERT(oldest_gen->scavenged_large_objects == NULL); + if (RtsFlags.GcFlags.useNonmoving && major_gc) { + // All threads in non-moving heap should be found to be alive, becuase + // threads in the non-moving generation's list should live in the + // non-moving heap, and we consider non-moving objects alive during + // preparation. + ASSERT(oldest_gen->old_threads == END_TSO_QUEUE); + // For weaks, remember that we evacuated all weaks to the non-moving heap + // in markWeakPtrList(), and then moved the weak_ptr_list list to + // old_weak_ptr_list. 
We then moved weaks with live keys to the + // weak_ptr_list again. Then, in collectDeadWeakPtrs() we moved weaks in + // old_weak_ptr_list to dead_weak_ptr_list. So at this point + // old_weak_ptr_list should be empty. + ASSERT(oldest_gen->old_weak_ptr_list == NULL); + + // we may need to take the lock to allocate mark queue blocks + RELEASE_SM_LOCK; + // dead_weak_ptr_list contains weak pointers with dead keys. Those need to + // be kept alive because we'll use them in finalizeSchedulers(). Similarly + // resurrected_threads are also going to be used in resurrectedThreads() + // so we need to mark those too. + // Note that in sequential case these lists will be appended with more + // weaks and threads found to be dead in mark. +#if !defined(THREADED_RTS) + // In the non-threaded runtime this is the only time we push to the + // upd_rem_set + nonmovingAddUpdRemSetBlocks(&gct->cap->upd_rem_set.queue); +#endif + nonmovingCollect(&dead_weak_ptr_list, &resurrected_threads); + ACQUIRE_SM_LOCK; + } + + // Update the max size of older generations after a major GC: + // We can't resize here in the case of the concurrent collector since we + // don't yet know how much live data we have. This will be instead done + // once we finish marking. + if (major_gc && RtsFlags.GcFlags.generations > 1 && ! RtsFlags.GcFlags.useNonmoving) + resizeGenerations(); // Free the mark stack. if (mark_stack_top_bd != NULL) { @@ -735,7 +802,7 @@ GarbageCollect (uint32_t collect_gen, // mark the garbage collected CAFs as dead #if defined(DEBUG) - if (major_gc) { gcCAFs(); } + if (major_gc && !RtsFlags.GcFlags.useNonmoving) { gcCAFs(); } #endif // Update the stable name hash table @@ -768,8 +835,9 @@ GarbageCollect (uint32_t collect_gen, // check sanity after GC // before resurrectThreads(), because that might overwrite some // closures, which will cause problems with THREADED where we don't - // fill slop. - IF_DEBUG(sanity, checkSanity(true /* after GC */, major_gc)); + // fill slop. If we are using the nonmoving collector then we can't claim to + // be *after* the major GC; it's now running concurrently. + IF_DEBUG(sanity, checkSanity(true /* after GC */, major_gc && !RtsFlags.GcFlags.useNonmoving)); // If a heap census is due, we need to do it before // resurrectThreads(), for the same reason as checkSanity above: @@ -942,6 +1010,7 @@ new_gc_thread (uint32_t n, gc_thread *t) ws->todo_overflow = NULL; ws->n_todo_overflow = 0; ws->todo_large_objects = NULL; + ws->todo_seg = END_NONMOVING_TODO_LIST; ws->part_list = NULL; ws->n_part_blocks = 0; @@ -1320,6 +1389,18 @@ releaseGCThreads (Capability *cap USED_IF_THREADS, bool idle_cap[]) } #endif +/* ---------------------------------------------------------------------------- + Save the mutable lists in saved_mut_lists where it will be scavenged + during GC + ------------------------------------------------------------------------- */ + +static void +stash_mut_list (Capability *cap, uint32_t gen_no) +{ + cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no]; + cap->mut_lists[gen_no] = allocBlockOnNode_sync(cap->node); +} + /* ---------------------------------------------------------------------------- Initialise a generation that is to be collected ------------------------------------------------------------------------- */ @@ -1331,11 +1412,17 @@ prepare_collected_gen (generation *gen) gen_workspace *ws; bdescr *bd, *next; - // Throw away the current mutable list. 
Invariant: the mutable - // list always has at least one block; this means we can avoid a - // check for NULL in recordMutable(). g = gen->no; - if (g != 0) { + + if (RtsFlags.GcFlags.useNonmoving && g == oldest_gen->no) { + // Nonmoving heap's mutable list is always a root. + for (i = 0; i < n_capabilities; i++) { + stash_mut_list(capabilities[i], g); + } + } else if (g != 0) { + // Otherwise throw away the current mutable list. Invariant: the + // mutable list always has at least one block; this means we can avoid + // a check for NULL in recordMutable(). for (i = 0; i < n_capabilities; i++) { freeChain(capabilities[i]->mut_lists[g]); capabilities[i]->mut_lists[g] = @@ -1351,13 +1438,17 @@ prepare_collected_gen (generation *gen) gen->old_threads = gen->threads; gen->threads = END_TSO_QUEUE; - // deprecate the existing blocks - gen->old_blocks = gen->blocks; - gen->n_old_blocks = gen->n_blocks; - gen->blocks = NULL; - gen->n_blocks = 0; - gen->n_words = 0; - gen->live_estimate = 0; + // deprecate the existing blocks (except in the case of the nonmoving + // collector since these will be preserved in nonmovingCollect for the + // concurrent GC). + if (!(RtsFlags.GcFlags.useNonmoving && g == oldest_gen->no)) { + gen->old_blocks = gen->blocks; + gen->n_old_blocks = gen->n_blocks; + gen->blocks = NULL; + gen->n_blocks = 0; + gen->n_words = 0; + gen->live_estimate = 0; + } // initialise the large object queues. ASSERT(gen->scavenged_large_objects == NULL); @@ -1451,18 +1542,6 @@ prepare_collected_gen (generation *gen) } } - -/* ---------------------------------------------------------------------------- - Save the mutable lists in saved_mut_lists - ------------------------------------------------------------------------- */ - -static void -stash_mut_list (Capability *cap, uint32_t gen_no) -{ - cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no]; - cap->mut_lists[gen_no] = allocBlockOnNode_sync(cap->node); -} - /* ---------------------------------------------------------------------------- Initialise a generation that is *not* to be collected ------------------------------------------------------------------------- */ @@ -1531,31 +1610,57 @@ collect_gct_blocks (void) } /* ----------------------------------------------------------------------------- - During mutation, any blocks that are filled by allocatePinned() are - stashed on the local pinned_object_blocks list, to avoid needing to - take a global lock. Here we collect those blocks from the - cap->pinned_object_blocks lists and put them on the - main g0->large_object list. + During mutation, any blocks that are filled by allocatePinned() are stashed + on the local pinned_object_blocks list, to avoid needing to take a global + lock. Here we collect those blocks from the cap->pinned_object_blocks lists + and put them on the g0->large_object or oldest_gen->large_objects. + + How to decide which list to put them on? + + - When non-moving heap is enabled and this is a major GC, we put them on + oldest_gen. This is because after preparation we really want no + old-to-young references, and we want to be able to reset mut_lists. For + this we need to promote every potentially live object to the oldest gen. + + - Otherwise we put them on g0. 
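A sketch of the stash_mut_list path taken above for the nonmoving oldest generation: its per-capability mutable list is saved (it remains a root for the concurrent mark) and replaced with a fresh block, instead of being freed as for ordinary collected generations. Toy types; the block allocator is a stub.

#define MAX_GENS 4   /* illustrative */

typedef struct Block Block;
typedef struct {
    Block *mut_lists[MAX_GENS];
    Block *saved_mut_lists[MAX_GENS];
} Cap;

static Block *fresh_block(void) { return 0; }   /* stands in for allocBlockOnNode_sync */

static void stash_mut_list_sketch(Cap *cap, unsigned gen_no)
{
    /* Keep the old list reachable: it is scavenged as a root during this GC. */
    cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no];
    /* The mutator continues with an empty list. */
    cap->mut_lists[gen_no] = fresh_block();
}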
-------------------------------------------------------------------------- */ static void collect_pinned_object_blocks (void) { - uint32_t n; - bdescr *bd, *prev; + generation *gen; + const bool use_nonmoving = RtsFlags.GcFlags.useNonmoving; + if (use_nonmoving && major_gc) { + gen = oldest_gen; + } else { + gen = g0; + } - for (n = 0; n < n_capabilities; n++) { - prev = NULL; - for (bd = capabilities[n]->pinned_object_blocks; bd != NULL; bd = bd->link) { - prev = bd; + for (uint32_t n = 0; n < n_capabilities; n++) { + bdescr *last = NULL; + if (use_nonmoving && gen == oldest_gen) { + // Mark objects as belonging to the nonmoving heap + for (bdescr *bd = capabilities[n]->pinned_object_blocks; bd != NULL; bd = bd->link) { + bd->flags |= BF_NONMOVING; + bd->gen = oldest_gen; + bd->gen_no = oldest_gen->no; + oldest_gen->n_large_words += bd->free - bd->start; + oldest_gen->n_large_blocks += bd->blocks; + last = bd; + } + } else { + for (bdescr *bd = capabilities[n]->pinned_object_blocks; bd != NULL; bd = bd->link) { + last = bd; + } } - if (prev != NULL) { - prev->link = g0->large_objects; - if (g0->large_objects != NULL) { - g0->large_objects->u.back = prev; + + if (last != NULL) { + last->link = gen->large_objects; + if (gen->large_objects != NULL) { + gen->large_objects->u.back = last; } - g0->large_objects = capabilities[n]->pinned_object_blocks; - capabilities[n]->pinned_object_blocks = 0; + gen->large_objects = capabilities[n]->pinned_object_blocks; + capabilities[n]->pinned_object_blocks = NULL; } } } @@ -1614,98 +1719,100 @@ mark_root(void *user USED_IF_THREADS, StgClosure **root) percentage of the maximum heap size available to allocate into. ------------------------------------------------------------------------- */ -static void -resize_generations (void) +void +resizeGenerations (void) { uint32_t g; + W_ live, size, min_alloc, words; + const W_ max = RtsFlags.GcFlags.maxHeapSize; + const W_ gens = RtsFlags.GcFlags.generations; - if (major_gc && RtsFlags.GcFlags.generations > 1) { - W_ live, size, min_alloc, words; - const W_ max = RtsFlags.GcFlags.maxHeapSize; - const W_ gens = RtsFlags.GcFlags.generations; - - // live in the oldest generations - if (oldest_gen->live_estimate != 0) { - words = oldest_gen->live_estimate; - } else { - words = oldest_gen->n_words; - } - live = (words + BLOCK_SIZE_W - 1) / BLOCK_SIZE_W + - oldest_gen->n_large_blocks + - oldest_gen->n_compact_blocks; + // live in the oldest generations + if (oldest_gen->live_estimate != 0) { + words = oldest_gen->live_estimate; + } else { + words = oldest_gen->n_words; + } + live = (words + BLOCK_SIZE_W - 1) / BLOCK_SIZE_W + + oldest_gen->n_large_blocks + + oldest_gen->n_compact_blocks; - // default max size for all generations except zero - size = stg_max(live * RtsFlags.GcFlags.oldGenFactor, - RtsFlags.GcFlags.minOldGenSize); + // default max size for all generations except zero + size = stg_max(live * RtsFlags.GcFlags.oldGenFactor, + RtsFlags.GcFlags.minOldGenSize); - if (RtsFlags.GcFlags.heapSizeSuggestionAuto) { - if (max > 0) { - RtsFlags.GcFlags.heapSizeSuggestion = stg_min(max, size); - } else { - RtsFlags.GcFlags.heapSizeSuggestion = size; - } + if (RtsFlags.GcFlags.heapSizeSuggestionAuto) { + if (max > 0) { + RtsFlags.GcFlags.heapSizeSuggestion = stg_min(max, size); + } else { + RtsFlags.GcFlags.heapSizeSuggestion = size; } + } - // minimum size for generation zero - min_alloc = stg_max((RtsFlags.GcFlags.pcFreeHeap * max) / 200, - RtsFlags.GcFlags.minAllocAreaSize - * (W_)n_capabilities); - - // Auto-enable 
compaction when the residency reaches a - // certain percentage of the maximum heap size (default: 30%). - if (RtsFlags.GcFlags.compact || - (max > 0 && - oldest_gen->n_blocks > - (RtsFlags.GcFlags.compactThreshold * max) / 100)) { - oldest_gen->mark = 1; - oldest_gen->compact = 1; + // minimum size for generation zero + min_alloc = stg_max((RtsFlags.GcFlags.pcFreeHeap * max) / 200, + RtsFlags.GcFlags.minAllocAreaSize + * (W_)n_capabilities); + + // Auto-enable compaction when the residency reaches a + // certain percentage of the maximum heap size (default: 30%). + // Except when non-moving GC is enabled. + if (!RtsFlags.GcFlags.useNonmoving && + (RtsFlags.GcFlags.compact || + (max > 0 && + oldest_gen->n_blocks > + (RtsFlags.GcFlags.compactThreshold * max) / 100))) { + oldest_gen->mark = 1; + oldest_gen->compact = 1; // debugBelch("compaction: on\n", live); - } else { - oldest_gen->mark = 0; - oldest_gen->compact = 0; + } else { + oldest_gen->mark = 0; + oldest_gen->compact = 0; // debugBelch("compaction: off\n", live); - } + } - if (RtsFlags.GcFlags.sweep) { - oldest_gen->mark = 1; - } + if (RtsFlags.GcFlags.sweep) { + oldest_gen->mark = 1; + } - // if we're going to go over the maximum heap size, reduce the - // size of the generations accordingly. The calculation is - // different if compaction is turned on, because we don't need - // to double the space required to collect the old generation. - if (max != 0) { + // if we're going to go over the maximum heap size, reduce the + // size of the generations accordingly. The calculation is + // different if compaction is turned on, because we don't need + // to double the space required to collect the old generation. + if (max != 0) { + + // this test is necessary to ensure that the calculations + // below don't have any negative results - we're working + // with unsigned values here. + if (max < min_alloc) { + heapOverflow(); + } - // this test is necessary to ensure that the calculations - // below don't have any negative results - we're working - // with unsigned values here. 
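The clamping formulas just below are easiest to check with concrete, made-up numbers: with compaction enabled the old generations need size + (size - 1)*(gens - 2)*2 blocks on top of min_alloc, so size is reduced until that total fits under max. A runnable worked example (all figures illustrative):

#include <stdio.h>

int main(void)
{
    unsigned long max = 4096, min_alloc = 256, gens = 3;  /* blocks, made up */
    unsigned long size = 2048;                            /* proposed per-generation max */

    if (size + (size - 1) * (gens - 2) * 2 + min_alloc > max)
        size = (max - min_alloc) / ((gens - 1) * 2 - 1);

    printf("clamped per-generation max: %lu blocks\n", size);  /* prints 1280 */
    return 0;
}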
- if (max < min_alloc) { - heapOverflow(); + if (oldest_gen->compact) { + if ( (size + (size - 1) * (gens - 2) * 2) + min_alloc > max ) { + size = (max - min_alloc) / ((gens - 1) * 2 - 1); } - - if (oldest_gen->compact) { - if ( (size + (size - 1) * (gens - 2) * 2) + min_alloc > max ) { - size = (max - min_alloc) / ((gens - 1) * 2 - 1); - } - } else { - if ( (size * (gens - 1) * 2) + min_alloc > max ) { - size = (max - min_alloc) / ((gens - 1) * 2); - } + } else { + if ( (size * (gens - 1) * 2) + min_alloc > max ) { + size = (max - min_alloc) / ((gens - 1) * 2); } + } - if (size < live) { - heapOverflow(); - } + if (size < live) { + heapOverflow(); } + } #if 0 - debugBelch("live: %d, min_alloc: %d, size : %d, max = %d\n", live, - min_alloc, size, max); + debugBelch("live: %d, min_alloc: %d, size : %d, max = %d\n", live, + min_alloc, size, max); + debugBelch("resize_gen: n_blocks: %lu, n_large_block: %lu, n_compact_blocks: %lu\n", + oldest_gen->n_blocks, oldest_gen->n_large_blocks, oldest_gen->n_compact_blocks); + debugBelch("resize_gen: max_blocks: %lu -> %lu\n", oldest_gen->max_blocks, oldest_gen->n_blocks); #endif - for (g = 0; g < gens; g++) { - generations[g].max_blocks = size; - } + for (g = 0; g < gens; g++) { + generations[g].max_blocks = size; } } @@ -1841,21 +1948,16 @@ resize_nursery (void) #if defined(DEBUG) -static void gcCAFs(void) +void gcCAFs(void) { - StgIndStatic *p, *prev; + uint32_t i = 0; + StgIndStatic *prev = NULL; - const StgInfoTable *info; - uint32_t i; - - i = 0; - p = debug_caf_list; - prev = NULL; - - for (p = debug_caf_list; p != (StgIndStatic*)END_OF_CAF_LIST; - p = (StgIndStatic*)p->saved_info) { - - info = get_itbl((StgClosure*)p); + for (StgIndStatic *p = debug_caf_list; + p != (StgIndStatic*) END_OF_CAF_LIST; + p = (StgIndStatic*) p->saved_info) + { + const StgInfoTable *info = get_itbl((StgClosure*)p); ASSERT(info->type == IND_STATIC); // See Note [STATIC_LINK fields] in Storage.h diff --git a/rts/sm/GC.h b/rts/sm/GC.h index 43cc4ca8a16ac3357ff1d2a08e8255b89618f6fd..bde006913b83059a6c5d67912fd2bc417730a062 100644 --- a/rts/sm/GC.h +++ b/rts/sm/GC.h @@ -17,9 +17,12 @@ #include "HeapAlloc.h" -void GarbageCollect (uint32_t force_major_gc, +void GarbageCollect (uint32_t collect_gen, bool do_heap_census, - uint32_t gc_type, Capability *cap, bool idle_cap[]); + bool deadlock_detect, + uint32_t gc_type, + Capability *cap, + bool idle_cap[]); typedef void (*evac_fn)(void *user, StgClosure **root); @@ -30,6 +33,8 @@ bool doIdleGCWork(Capability *cap, bool all); extern uint32_t N; extern bool major_gc; +/* See Note [Deadlock detection under nonmoving collector]. */ +extern bool deadlock_detect_gc; extern bdescr *mark_stack_bd; extern bdescr *mark_stack_top_bd; @@ -55,6 +60,8 @@ void gcWorkerThread (Capability *cap); void initGcThreads (uint32_t from, uint32_t to); void freeGcThreads (void); +void resizeGenerations (void); + #if defined(THREADED_RTS) void waitForGcThreads (Capability *cap, bool idle_cap[]); void releaseGCThreads (Capability *cap, bool idle_cap[]); diff --git a/rts/sm/GCAux.c b/rts/sm/GCAux.c index 650dc2c1dfbab2bc0d6c103a27522a502e2a1d2a..11080c1f22cf4aaf244e70b6e3b0d11357bc3a8f 100644 --- a/rts/sm/GCAux.c +++ b/rts/sm/GCAux.c @@ -60,6 +60,14 @@ isAlive(StgClosure *p) // ignore closures in generations that we're not collecting. bd = Bdescr((P_)q); + // isAlive is used when scavenging moving generations, before the mark + // phase. 
Because we don't know alive-ness of objects before the mark phase + // we have to conservatively treat objects in the non-moving generation as + // alive here. + if (bd->flags & BF_NONMOVING) { + return p; + } + // if it's a pointer into to-space, then we're done if (bd->flags & BF_EVACUATED) { return p; @@ -140,14 +148,14 @@ markCAFs (evac_fn evac, void *user) StgIndStatic *c; for (c = dyn_caf_list; - c != (StgIndStatic*)END_OF_CAF_LIST; + ((StgWord) c | STATIC_FLAG_LIST) != (StgWord)END_OF_CAF_LIST; c = (StgIndStatic *)c->static_link) { c = (StgIndStatic *)UNTAG_STATIC_LIST_PTR(c); evac(user, &c->indirectee); } for (c = revertible_caf_list; - c != (StgIndStatic*)END_OF_CAF_LIST; + ((StgWord) c | STATIC_FLAG_LIST) != (StgWord)END_OF_CAF_LIST; c = (StgIndStatic *)c->static_link) { c = (StgIndStatic *)UNTAG_STATIC_LIST_PTR(c); diff --git a/rts/sm/GCThread.h b/rts/sm/GCThread.h index 66f7a7f84fb7bd5855a0800599532ece48c6e1e9..3012f52f2822b4aff7f5edc74de0e000fd36a4bc 100644 --- a/rts/sm/GCThread.h +++ b/rts/sm/GCThread.h @@ -83,6 +83,7 @@ typedef struct gen_workspace_ { bdescr * todo_bd; StgPtr todo_free; // free ptr for todo_bd StgPtr todo_lim; // lim for todo_bd + struct NonmovingSegment *todo_seg; // only available for oldest gen workspace WSDeque * todo_q; bdescr * todo_overflow; @@ -100,9 +101,6 @@ typedef struct gen_workspace_ { bdescr * part_list; StgWord n_part_blocks; // count of above StgWord n_part_words; - - StgWord pad[1]; - } gen_workspace ATTRIBUTE_ALIGNED(64); // align so that computing gct->gens[n] is a shift, not a multiply // fails if the size is <64, which is why we need the pad above diff --git a/rts/sm/NonMoving.c b/rts/sm/NonMoving.c new file mode 100644 index 0000000000000000000000000000000000000000..50cf784aab61f35cb06e96b3aceec6ca785e656a --- /dev/null +++ b/rts/sm/NonMoving.c @@ -0,0 +1,1390 @@ +/* ----------------------------------------------------------------------------- + * + * (c) The GHC Team, 1998-2018 + * + * Non-moving garbage collector and allocator + * + * ---------------------------------------------------------------------------*/ + +#include "Rts.h" +#include "RtsUtils.h" +#include "Capability.h" +#include "Printer.h" +#include "Storage.h" +// We call evacuate, which expects the thread-local gc_thread to be valid; +// This is sometimes declared as a register variable therefore it is necessary +// to include the declaration so that the compiler doesn't clobber the register. +#include "GCThread.h" +#include "GCTDecl.h" +#include "Schedule.h" + +#include "NonMoving.h" +#include "NonMovingMark.h" +#include "NonMovingSweep.h" +#include "NonMovingCensus.h" +#include "StablePtr.h" // markStablePtrTable +#include "Schedule.h" // markScheduler +#include "Weak.h" // dead_weak_ptr_list + +struct NonmovingHeap nonmovingHeap; + +uint8_t nonmovingMarkEpoch = 1; + +static void nonmovingBumpEpoch(void) { + nonmovingMarkEpoch = nonmovingMarkEpoch == 1 ? 2 : 1; +} + +#if defined(THREADED_RTS) +/* + * This mutex ensures that only one non-moving collection is active at a time. + */ +Mutex nonmoving_collection_mutex; + +OSThreadId mark_thread; +bool concurrent_coll_running = false; +Condition concurrent_coll_finished; +Mutex concurrent_coll_finished_lock; +#endif + +/* + * Note [Non-moving garbage collector] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * The sources rts/NonMoving*.c implement GHC's non-moving garbage collector + * for the oldest generation. 
In contrast to the throughput-oriented moving collector, the non-moving
+ * collector is designed to achieve low GC latencies on large heaps. It
+ * achieves low latencies by way of a concurrent mark-and-sweep collection
+ * strategy on a specially-designed heap structure. While the design is
+ * described in detail in the design document found in
+ * docs/storage/nonmoving-gc, we briefly summarize the structure here.
+ *
+ *
+ * === Heap Structure ===
+ *
+ * The nonmoving heap (embodied by struct NonmovingHeap) consists of a family
+ * of allocators, each serving a range of allocation sizes. Each allocator
+ * consists of a set of *segments*, each of which contains fixed-size *blocks*
+ * (not to be confused with "blocks" provided by GHC's block allocator; this is
+ * admittedly an unfortunate overlap in terminology). These blocks are the
+ * backing store for the allocator. In addition to blocks, the segment also
+ * contains some header information (see struct NonmovingSegment in
+ * NonMoving.h). This header contains a *bitmap* encoding one byte per block
+ * (used by the collector to record liveness), as well as the index of the next
+ * unallocated block (and a *snapshot* of this field, which will be described in
+ * the next section).
+ *
+ * Each allocator maintains three sets of segments:
+ *
+ *  - A *current* segment for each capability; this is the segment which that
+ *    capability will allocate into.
+ *
+ *  - A pool of *active* segments, each of which contains at least one
+ *    unallocated block. The allocator will take a segment from this pool when
+ *    it fills its *current* segment.
+ *
+ *  - A set of *filled* segments, which contain no unallocated blocks and will
+ *    be collected during the next major GC cycle.
+ *
+ * Storage for segments is allocated from the block allocator as an aligned
+ * group of NONMOVING_SEGMENT_BLOCKS blocks. This makes the task of locating
+ * the segment header for a closure a simple matter of bit-masking (as
+ * implemented by nonmovingGetSegment).
+ *
+ * In addition, to relieve pressure on the block allocator we keep a small pool
+ * of free segments around (nonmovingHeap.free) which can be pushed/popped
+ * to/from in a lock-free manner.
+ *
+ *
+ * === Allocation ===
+ *
+ * The allocator (as implemented by nonmovingAllocate) starts by identifying
+ * which allocator the request should be made against. It then allocates into
+ * its local current segment and bumps the next_free pointer to point to the
+ * next unallocated block (as indicated by the bitmap). If it finds the current
+ * segment is now full, it moves it to the filled list and looks for a new
+ * segment to make current from a few sources:
+ *
+ *  1. the allocator's active list (see pop_active_segment)
+ *  2. the nonmoving heap's free segment pool (see nonmovingPopFreeSegment)
+ *  3. allocate a new segment from the block allocator (see
+ *     nonmovingAllocSegment)
+ *
+ * Note that allocation does *not* involve modifying the bitmap. The bitmap is
+ * only modified by the collector.
+ *
+ *
+ * === Snapshot invariant ===
+ *
+ * To safely collect in a concurrent setting, the collector relies on the
+ * notion of a *snapshot*. The snapshot is a hypothetical frozen state of the
+ * heap topology taken at the beginning of the major collection cycle.
+ * With this definition we require the following property of the mark phase, + * which we call the *snapshot invariant*, + * + * All objects that were reachable at the time the snapshot was collected + * must have their mark bits set at the end of the mark phase. + * + * As the mutator might change the topology of the heap while we are marking + * this property requires some cooperation from the mutator to maintain. + * Specifically, we rely on a write barrier as described in Note [Update + * remembered set]. + * + * To determine which objects were existent when the snapshot was taken we + * record a snapshot of each segments next_free pointer at the beginning of + * collection. + * + * + * === Collection === + * + * Collection happens in a few phases some of which occur during a + * stop-the-world period (marked with [STW]) and others which can occur + * concurrently with mutation and minor collection (marked with [CONC]): + * + * 1. [STW] Preparatory GC: Here we do a standard minor collection of the + * younger generations (which may evacuate things to the nonmoving heap). + * References from younger generations into the nonmoving heap are recorded + * in the mark queue (see Note [Aging under the non-moving collector] in + * this file). + * + * 2. [STW] Snapshot update: Here we update the segment snapshot metadata + * (see nonmovingPrepareMark) and move the filled segments to + * nonmovingHeap.sweep_list, which is the set of segments which we will + * sweep this GC cycle. + * + * 3. [STW] Root collection: Here we walk over a variety of root sources + * and add them to the mark queue (see nonmovingCollect). + * + * 4. [CONC] Concurrent marking: Here we do the majority of marking concurrently + * with mutator execution (but with the write barrier enabled; see + * Note [Update remembered set]). + * + * 5. [STW] Final sync: Here we interrupt the mutators, ask them to + * flush their final update remembered sets, and mark any new references + * we find. + * + * 6. [CONC] Sweep: Here we walk over the nonmoving segments on sweep_list + * and place them back on either the active, current, or filled list, + * depending upon how much live data they contain. + * + * + * === Marking === + * + * Ignoring large and static objects, marking a closure is fairly + * straightforward (implemented in NonMovingMark.c:mark_closure): + * + * 1. Check whether the closure is in the non-moving generation; if not then + * we ignore it. + * 2. Find the segment containing the closure's block. + * 3. Check whether the closure's block is above $seg->next_free_snap; if so + * then the block was not allocated when we took the snapshot and therefore + * we don't need to mark it. + * 4. Check whether the block's bitmap bits is equal to nonmovingMarkEpoch. If + * so then we can stop as we have already marked it. + * 5. Push the closure's pointers to the mark queue. + * 6. Set the blocks bitmap bits to nonmovingMarkEpoch. + * + * Note that the ordering of (5) and (6) is rather important, as described in + * Note [StgStack dirtiness flags and concurrent marking]. 
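+ *
+ * To make these steps concrete, here is a compressed sketch (for illustration
+ * only; the real NonMovingMark.c:mark_closure also handles large, compact,
+ * and static objects, and "push_fields" below merely stands for pushing each
+ * of the closure's pointer fields to the mark queue):
+ *
+ *     if (!(Bdescr((P_)p)->flags & BF_NONMOVING)) return;           // (1)
+ *     seg = nonmovingGetSegment((StgPtr)p);                         // (2)
+ *     idx = nonmovingGetBlockIdx((StgPtr)p);
+ *     if (idx >= nonmovingSegmentInfo(seg)->next_free_snap) return; // (3)
+ *     if (nonmovingGetMark(seg, idx) == nonmovingMarkEpoch) return; // (4)
+ *     push_fields(queue, p);                                        // (5)
+ *     nonmovingSetMark(seg, idx);                                   // (6)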
+ *
+ *
+ * === Other references ===
+ *
+ * Apart from the design document in docs/storage/nonmoving-gc and the Ueno
+ * 2016 paper (TODO citation) from which it drew inspiration, there are a
+ * variety of other relevant Notes scattered throughout the tree:
+ *
+ * - Note [Concurrent non-moving collection] (NonMoving.c) describes
+ *   concurrency control of the nonmoving collector
+ *
+ * - Note [Live data accounting in nonmoving collector] (NonMoving.c)
+ *   describes how we track the quantity of live data in the nonmoving
+ *   generation.
+ *
+ * - Note [Aging under the non-moving collector] (NonMoving.c) describes how
+ *   we accommodate aging
+ *
+ * - Note [Large objects in the non-moving collector] (NonMovingMark.c)
+ *   describes how we track large objects.
+ *
+ * - Note [Update remembered set] (NonMovingMark.c) describes the function and
+ *   implementation of the update remembered set used to realize the concurrent
+ *   write barrier.
+ *
+ * - Note [Concurrent read barrier on deRefWeak#] (NonMovingMark.c) describes
+ *   the read barrier on Weak# objects.
+ *
+ * - Note [Unintentional marking in resurrectThreads] (NonMovingMark.c) describes
+ *   a tricky interaction between the update remembered set flush and weak
+ *   finalization.
+ *
+ * - Note [Origin references in the nonmoving collector] (NonMovingMark.h)
+ *   describes how we implement indirection short-cutting and the selector
+ *   optimisation.
+ *
+ * - Note [StgStack dirtiness flags and concurrent marking] (TSO.h) describes
+ *   the protocol for concurrent marking of stacks.
+ *
+ * - Note [Static objects under the nonmoving collector] (Storage.c) describes
+ *   treatment of static objects.
+ *
+ *
+ * Note [Concurrent non-moving collection]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * Concurrency control of non-moving garbage collection is a bit tricky. There
+ * are a few things to keep in mind:
+ *
+ * - Only one non-moving collection may be active at a time. This is enforced by the
+ *   concurrent_coll_running flag, which is set when a collection is ongoing. If
+ *   we attempt to initiate a new collection while this is set we wait on the
+ *   concurrent_coll_finished condition variable, which signals when the
+ *   active collection finishes.
+ *
+ * - In between the mark and sweep phases the non-moving collector must synchronize
+ *   with mutator threads to collect and mark their final update remembered
+ *   sets. This is accomplished using
+ *   stopAllCapabilitiesWith(SYNC_FLUSH_UPD_REM_SET). Capabilities are held
+ *   until the final mark has concluded.
+ *
+ * Note that the possibility of concurrent minor and non-moving collections
+ * requires that we handle static objects a bit specially. See
+ * Note [Static objects under the nonmoving collector] in Storage.c
+ * for details.
+ *
+ *
+ * Note [Aging under the non-moving collector]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * The initial design of the non-moving collector mandated that all live data
+ * be evacuated to the non-moving heap prior to a major collection. This
+ * simplified certain bits of implementation and eased reasoning. However, it
+ * was (unsurprisingly) also found to result in significant amounts of
+ * unnecessary copying.
+ *
+ * Consequently, we now allow aging. Aging allows the preparatory GC leading up
+ * to a major collection to evacuate some objects into the young generation.
+ * However, this introduces the following tricky case that might arise after + * we have finished the preparatory GC: + * + * moving heap ┆ non-moving heap + * ───────────────┆────────────────── + * ┆ + * B â†â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€ A â†â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€ root + * │ ┆ ↖─────────────── gen1 mut_list + * ╰───────────────→ C + * ┆ + * + * In this case C is clearly live, but the non-moving collector can only see + * this by walking through B, which lives in the moving heap. However, doing so + * would require that we synchronize with the mutator/minor GC to ensure that it + * isn't in the middle of moving B. What to do? + * + * The solution we use here is to teach the preparatory moving collector to + * "evacuate" objects it encounters in the non-moving heap by adding them to + * the mark queue. This is implemented by pushing the object to the update + * remembered set of the capability held by the evacuating gc_thread + * (implemented by markQueuePushClosureGC) + * + * Consequently collection of the case above would proceed as follows: + * + * 1. Initial state: + * * A lives in the non-moving heap and is reachable from the root set + * * A is on the oldest generation's mut_list, since it contains a pointer + * to B, which lives in a younger generation + * * B lives in the moving collector's from space + * * C lives in the non-moving heap + * + * 2. Preparatory GC: Scavenging mut_lists: + * + * The mut_list of the oldest generation is scavenged, resulting in B being + * evacuated (aged) into the moving collector's to-space. + * + * 3. Preparatory GC: Scavenge B + * + * B (now in to-space) is scavenged, resulting in evacuation of C. + * evacuate(C) pushes a reference to C to the mark queue. + * + * 4. Non-moving GC: C is marked + * + * The non-moving collector will come to C in the mark queue and mark it. + * + * + * Note [Deadlock detection under the non-moving collector] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * In GHC the garbage collector is responsible for identifying deadlocked + * programs. Providing for this responsibility is slightly tricky in the + * non-moving collector due to the existence of aging. In particular, the + * non-moving collector cannot traverse objects living in a young generation + * but reachable from the non-moving generation, as described in Note [Aging + * under the non-moving collector]. + * + * However, this can pose trouble for deadlock detection since it means that we + * may conservatively mark dead closures as live. Consider this case: + * + * moving heap ┆ non-moving heap + * ───────────────┆────────────────── + * ┆ + * MVAR_QUEUE â†â”€â”€â”€â”€â”€ TSO â†â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€ gen1 mut_list + * ↑ │ ╰────────↗ │ + * │ │ ┆ │ + * │ │ ┆ ↓ + * │ ╰──────────→ MVAR + * ╰─────────────────╯ + * ┆ + * + * In this case we have a TSO blocked on a dead MVar. Because the MVAR_TSO_QUEUE on + * which it is blocked lives in the moving heap, the TSO is necessarily on the + * oldest generation's mut_list. As in Note [Aging under the non-moving + * collector], the MVAR_TSO_QUEUE will be evacuated. If MVAR_TSO_QUEUE is aged + * (e.g. evacuated to the young generation) then the MVAR will be added to the + * mark queue. Consequently, we will falsely conclude that the MVAR is still + * alive and fail to spot the deadlock. + * + * To avoid this sort of situation we disable aging when we are starting a + * major GC specifically for deadlock detection (as done by + * scheduleDetectDeadlock). 
This condition is recorded by the + * deadlock_detect_gc global variable declared in GC.h. Setting this has a few + * effects on the preparatory GC: + * + * - Evac.c:alloc_for_copy forces evacuation to the non-moving generation. + * + * - The evacuation logic usually responsible for pushing objects living in + * the non-moving heap to the mark queue is disabled. This is safe because + * we know that all live objects will be in the non-moving heap by the end + * of the preparatory moving collection. + * + * + * Note [Live data accounting in nonmoving collector] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * The nonmoving collector uses an approximate heuristic for reporting live + * data quantity. Specifically, during mark we record how much live data we + * find in nonmoving_live_words. At the end of mark we declare this amount to + * be how much live data we have on in the nonmoving heap (by setting + * oldest_gen->live_estimate). + * + * In addition, we update oldest_gen->live_estimate every time we fill a + * segment. This, as well, is quite approximate: we assume that all blocks + * above next_free_next are newly-allocated. In principle we could refer to the + * bitmap to count how many blocks we actually allocated but this too would be + * approximate due to concurrent collection and ultimately seems more costly + * than the problem demands. + * + */ + +memcount nonmoving_live_words = 0; + +#if defined(THREADED_RTS) +static void* nonmovingConcurrentMark(void *mark_queue); +#endif +static void nonmovingClearBitmap(struct NonmovingSegment *seg); +static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO **resurrected_threads); + +static void nonmovingInitSegment(struct NonmovingSegment *seg, uint8_t log_block_size) +{ + bdescr *bd = Bdescr((P_) seg); + seg->link = NULL; + seg->todo_link = NULL; + seg->next_free = 0; + nonmovingClearBitmap(seg); + bd->nonmoving_segment.log_block_size = log_block_size; + bd->nonmoving_segment.next_free_snap = 0; + bd->u.scan = nonmovingSegmentGetBlock(seg, 0); +} + +// Add a segment to the free list. +void nonmovingPushFreeSegment(struct NonmovingSegment *seg) +{ + // See Note [Live data accounting in nonmoving collector]. + if (nonmovingHeap.n_free > NONMOVING_MAX_FREE) { + bdescr *bd = Bdescr((StgPtr) seg); + ACQUIRE_SM_LOCK; + ASSERT(oldest_gen->n_blocks >= bd->blocks); + ASSERT(oldest_gen->n_words >= BLOCK_SIZE_W * bd->blocks); + oldest_gen->n_blocks -= bd->blocks; + oldest_gen->n_words -= BLOCK_SIZE_W * bd->blocks; + freeGroup(bd); + RELEASE_SM_LOCK; + return; + } + + while (true) { + struct NonmovingSegment *old = nonmovingHeap.free; + seg->link = old; + if (cas((StgVolatilePtr) &nonmovingHeap.free, (StgWord) old, (StgWord) seg) == (StgWord) old) + break; + } + __sync_add_and_fetch(&nonmovingHeap.n_free, 1); +} + +static struct NonmovingSegment *nonmovingPopFreeSegment(void) +{ + while (true) { + struct NonmovingSegment *seg = nonmovingHeap.free; + if (seg == NULL) { + return NULL; + } + if (cas((StgVolatilePtr) &nonmovingHeap.free, + (StgWord) seg, + (StgWord) seg->link) == (StgWord) seg) { + __sync_sub_and_fetch(&nonmovingHeap.n_free, 1); + return seg; + } + } +} + +unsigned int nonmovingBlockCountFromSize(uint8_t log_block_size) +{ + // We compute the overwhelmingly common size cases directly to avoid a very + // expensive integer division. 
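+    // Each case below passes a compile-time-constant argument to the inline
+    // nonmovingBlockCount, so the compiler can fold its division away; only
+    // unusual block sizes fall through to the general default case.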
+ switch (log_block_size) { + case 3: return nonmovingBlockCount(3); + case 4: return nonmovingBlockCount(4); + case 5: return nonmovingBlockCount(5); + case 6: return nonmovingBlockCount(6); + case 7: return nonmovingBlockCount(7); + default: return nonmovingBlockCount(log_block_size); + } +} + +/* + * Request a fresh segment from the free segment list or allocate one of the + * given node. + * + * Caller must hold SM_MUTEX (although we take the gc_alloc_block_sync spinlock + * under the assumption that we are in a GC context). + */ +static struct NonmovingSegment *nonmovingAllocSegment(uint32_t node) +{ + // First try taking something off of the free list + struct NonmovingSegment *ret; + ret = nonmovingPopFreeSegment(); + + // Nothing in the free list, allocate a new segment... + if (ret == NULL) { + // Take gc spinlock: another thread may be scavenging a moving + // generation and call `todo_block_full` + ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); + bdescr *bd = allocAlignedGroupOnNode(node, NONMOVING_SEGMENT_BLOCKS); + // See Note [Live data accounting in nonmoving collector]. + oldest_gen->n_blocks += bd->blocks; + oldest_gen->n_words += BLOCK_SIZE_W * bd->blocks; + RELEASE_SPIN_LOCK(&gc_alloc_block_sync); + + for (StgWord32 i = 0; i < bd->blocks; ++i) { + initBdescr(&bd[i], oldest_gen, oldest_gen); + bd[i].flags = BF_NONMOVING; + } + ret = (struct NonmovingSegment *)bd->start; + } + + // Check alignment + ASSERT(((uintptr_t)ret % NONMOVING_SEGMENT_SIZE) == 0); + return ret; +} + +static inline unsigned long log2_floor(unsigned long x) +{ + return sizeof(unsigned long)*8 - 1 - __builtin_clzl(x); +} + +static inline unsigned long log2_ceil(unsigned long x) +{ + unsigned long log = log2_floor(x); + return (x - (1 << log)) ? log + 1 : log; +} + +// Advance a segment's next_free pointer. Returns true if segment if full. +static bool advance_next_free(struct NonmovingSegment *seg, const unsigned int blk_count) +{ + const uint8_t *bitmap = seg->bitmap; + ASSERT(blk_count == nonmovingSegmentBlockCount(seg)); +#if defined(NAIVE_ADVANCE_FREE) + // reference implementation + for (unsigned int i = seg->next_free+1; i < blk_count; i++) { + if (!bitmap[i]) { + seg->next_free = i; + return false; + } + } + seg->next_free = blk_count; + return true; +#else + const uint8_t *c = memchr(&bitmap[seg->next_free+1], 0, blk_count - seg->next_free - 1); + if (c == NULL) { + seg->next_free = blk_count; + return true; + } else { + seg->next_free = c - bitmap; + return false; + } +#endif +} + +static struct NonmovingSegment *pop_active_segment(struct NonmovingAllocator *alloca) +{ + while (true) { + struct NonmovingSegment *seg = alloca->active; + if (seg == NULL) { + return NULL; + } + if (cas((StgVolatilePtr) &alloca->active, + (StgWord) seg, + (StgWord) seg->link) == (StgWord) seg) { + return seg; + } + } +} + +/* Allocate a block in the nonmoving heap. Caller must hold SM_MUTEX. sz is in words */ +GNUC_ATTR_HOT +void *nonmovingAllocate(Capability *cap, StgWord sz) +{ + unsigned int log_block_size = log2_ceil(sz * sizeof(StgWord)); + unsigned int block_count = nonmovingBlockCountFromSize(log_block_size); + + // The max we ever allocate is 3276 bytes (anything larger is a large + // object and not moved) which is covered by allocator 9. 
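+    // (Sizes are rounded up to a power of two, so a 3276-byte request maps to
+    // log_block_size 12, i.e. allocator index 12 - NONMOVING_ALLOCA0 = 9.)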
+ ASSERT(log_block_size < NONMOVING_ALLOCA0 + NONMOVING_ALLOCA_CNT); + + struct NonmovingAllocator *alloca = nonmovingHeap.allocators[log_block_size - NONMOVING_ALLOCA0]; + + // Allocate into current segment + struct NonmovingSegment *current = alloca->current[cap->no]; + ASSERT(current); // current is never NULL + void *ret = nonmovingSegmentGetBlock_(current, log_block_size, current->next_free); + ASSERT(GET_CLOSURE_TAG(ret) == 0); // check alignment + + // Advance the current segment's next_free or allocate a new segment if full + bool full = advance_next_free(current, block_count); + if (full) { + // Current segment is full: update live data estimate link it to + // filled, take an active segment if one exists, otherwise allocate a + // new segment. + + // Update live data estimate. + // See Note [Live data accounting in nonmoving collector]. + unsigned int new_blocks = block_count - nonmovingSegmentInfo(current)->next_free_snap; + unsigned int block_size = 1 << log_block_size; + atomic_inc(&oldest_gen->live_estimate, new_blocks * block_size / sizeof(W_)); + + // push the current segment to the filled list + nonmovingPushFilledSegment(current); + + // first look for a new segment in the active list + struct NonmovingSegment *new_current = pop_active_segment(alloca); + + // there are no active segments, allocate new segment + if (new_current == NULL) { + new_current = nonmovingAllocSegment(cap->node); + nonmovingInitSegment(new_current, log_block_size); + } + + // make it current + new_current->link = NULL; + alloca->current[cap->no] = new_current; + } + + return ret; +} + +/* Allocate a nonmovingAllocator */ +static struct NonmovingAllocator *alloc_nonmoving_allocator(uint32_t n_caps) +{ + size_t allocator_sz = + sizeof(struct NonmovingAllocator) + + sizeof(void*) * n_caps; // current segment pointer for each capability + struct NonmovingAllocator *alloc = + stgMallocBytes(allocator_sz, "nonmovingInit"); + memset(alloc, 0, allocator_sz); + return alloc; +} + +static void free_nonmoving_allocator(struct NonmovingAllocator *alloc) +{ + stgFree(alloc); +} + +void nonmovingInit(void) +{ + if (! RtsFlags.GcFlags.useNonmoving) return; +#if defined(THREADED_RTS) + initMutex(&nonmoving_collection_mutex); + initCondition(&concurrent_coll_finished); + initMutex(&concurrent_coll_finished_lock); +#endif + for (unsigned int i = 0; i < NONMOVING_ALLOCA_CNT; i++) { + nonmovingHeap.allocators[i] = alloc_nonmoving_allocator(n_capabilities); + } + nonmovingMarkInitUpdRemSet(); +} + +// Stop any nonmoving collection in preparation for RTS shutdown. +void nonmovingStop(void) +{ + if (! RtsFlags.GcFlags.useNonmoving) return; +#if defined(THREADED_RTS) + if (mark_thread) { + debugTrace(DEBUG_nonmoving_gc, + "waiting for nonmoving collector thread to terminate"); + ACQUIRE_LOCK(&concurrent_coll_finished_lock); + waitCondition(&concurrent_coll_finished, &concurrent_coll_finished_lock); + } +#endif +} + +void nonmovingExit(void) +{ + if (! RtsFlags.GcFlags.useNonmoving) return; + + // First make sure collector is stopped before we tear things down. + nonmovingStop(); + +#if defined(THREADED_RTS) + closeMutex(&concurrent_coll_finished_lock); + closeCondition(&concurrent_coll_finished); + closeMutex(&nonmoving_collection_mutex); +#endif + + for (unsigned int i = 0; i < NONMOVING_ALLOCA_CNT; i++) { + free_nonmoving_allocator(nonmovingHeap.allocators[i]); + } +} + +/* + * Assumes that no garbage collector or mutator threads are running to safely + * resize the nonmoving_allocators. + * + * Must hold sm_mutex. 
+ */ +void nonmovingAddCapabilities(uint32_t new_n_caps) +{ + unsigned int old_n_caps = nonmovingHeap.n_caps; + struct NonmovingAllocator **allocs = nonmovingHeap.allocators; + + for (unsigned int i = 0; i < NONMOVING_ALLOCA_CNT; i++) { + struct NonmovingAllocator *old = allocs[i]; + allocs[i] = alloc_nonmoving_allocator(new_n_caps); + + // Copy the old state + allocs[i]->filled = old->filled; + allocs[i]->active = old->active; + for (unsigned int j = 0; j < old_n_caps; j++) { + allocs[i]->current[j] = old->current[j]; + } + stgFree(old); + + // Initialize current segments for the new capabilities + for (unsigned int j = old_n_caps; j < new_n_caps; j++) { + allocs[i]->current[j] = nonmovingAllocSegment(capabilities[j]->node); + nonmovingInitSegment(allocs[i]->current[j], NONMOVING_ALLOCA0 + i); + allocs[i]->current[j]->link = NULL; + } + } + nonmovingHeap.n_caps = new_n_caps; +} + +static inline void nonmovingClearBitmap(struct NonmovingSegment *seg) +{ + unsigned int n = nonmovingSegmentBlockCount(seg); + memset(seg->bitmap, 0, n); +} + +/* Prepare the heap bitmaps and snapshot metadata for a mark */ +static void nonmovingPrepareMark(void) +{ + // See Note [Static objects under the nonmoving collector]. + prev_static_flag = static_flag; + static_flag = + static_flag == STATIC_FLAG_A ? STATIC_FLAG_B : STATIC_FLAG_A; + + // Should have been cleared by the last sweep + ASSERT(nonmovingHeap.sweep_list == NULL); + + nonmovingBumpEpoch(); + for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) { + struct NonmovingAllocator *alloca = nonmovingHeap.allocators[alloca_idx]; + + // Update current segments' snapshot pointers + for (uint32_t cap_n = 0; cap_n < n_capabilities; ++cap_n) { + struct NonmovingSegment *seg = alloca->current[cap_n]; + nonmovingSegmentInfo(seg)->next_free_snap = seg->next_free; + } + + // Update filled segments' snapshot pointers and move to sweep_list + uint32_t n_filled = 0; + struct NonmovingSegment *const filled = alloca->filled; + alloca->filled = NULL; + if (filled) { + struct NonmovingSegment *seg = filled; + while (true) { + n_filled++; + prefetchForRead(seg->link); + // Clear bitmap + prefetchForWrite(seg->link->bitmap); + nonmovingClearBitmap(seg); + // Set snapshot + nonmovingSegmentInfo(seg)->next_free_snap = seg->next_free; + if (seg->link) + seg = seg->link; + else + break; + } + // add filled segments to sweep_list + seg->link = nonmovingHeap.sweep_list; + nonmovingHeap.sweep_list = filled; + } + + // N.B. It's not necessary to update snapshot pointers of active segments; + // they were set after they were swept and haven't seen any allocation + // since. 
+ } + + // Clear large object bits of existing large objects + for (bdescr *bd = nonmoving_large_objects; bd; bd = bd->link) { + bd->flags &= ~BF_MARKED; + } + + // Add newly promoted large objects and clear mark bits + bdescr *next; + ASSERT(oldest_gen->scavenged_large_objects == NULL); + for (bdescr *bd = oldest_gen->large_objects; bd; bd = next) { + next = bd->link; + bd->flags |= BF_NONMOVING_SWEEPING; + bd->flags &= ~BF_MARKED; + dbl_link_onto(bd, &nonmoving_large_objects); + } + n_nonmoving_large_blocks += oldest_gen->n_large_blocks; + oldest_gen->large_objects = NULL; + oldest_gen->n_large_words = 0; + oldest_gen->n_large_blocks = 0; + nonmoving_live_words = 0; + + // Clear compact object mark bits + for (bdescr *bd = nonmoving_compact_objects; bd; bd = bd->link) { + bd->flags &= ~BF_MARKED; + } + + // Move new compact objects from younger generations to nonmoving_compact_objects + for (bdescr *bd = oldest_gen->compact_objects; bd; bd = next) { + next = bd->link; + bd->flags |= BF_NONMOVING_SWEEPING; + bd->flags &= ~BF_MARKED; + dbl_link_onto(bd, &nonmoving_compact_objects); + } + n_nonmoving_compact_blocks += oldest_gen->n_compact_blocks; + oldest_gen->n_compact_blocks = 0; + oldest_gen->compact_objects = NULL; + // TODO (osa): what about "in import" stuff?? + + + +#if defined(DEBUG) + debug_caf_list_snapshot = debug_caf_list; + debug_caf_list = (StgIndStatic*)END_OF_CAF_LIST; +#endif +} + +// Mark weak pointers in the non-moving heap. They'll either end up in +// dead_weak_ptr_list or stay in weak_ptr_list. Either way they need to be kept +// during sweep. See `MarkWeak.c:markWeakPtrList` for the moving heap variant +// of this. +static void nonmovingMarkWeakPtrList(MarkQueue *mark_queue, StgWeak *dead_weak_ptr_list) +{ + for (StgWeak *w = oldest_gen->weak_ptr_list; w; w = w->link) { + markQueuePushClosure_(mark_queue, (StgClosure*)w); + // Do not mark finalizers and values here, those fields will be marked + // in `nonmovingMarkDeadWeaks` (for dead weaks) or + // `nonmovingTidyWeaks` (for live weaks) + } + + // We need to mark dead_weak_ptr_list too. This is subtle: + // + // - By the beginning of this GC we evacuated all weaks to the non-moving + // heap (in `markWeakPtrList`) + // + // - During the scavenging of the moving heap we discovered that some of + // those weaks are dead and moved them to `dead_weak_ptr_list`. Note that + // because of the fact above _all weaks_ are in the non-moving heap at + // this point. + // + // - So, to be able to traverse `dead_weak_ptr_list` and run finalizers we + // need to mark it. + for (StgWeak *w = dead_weak_ptr_list; w; w = w->link) { + markQueuePushClosure_(mark_queue, (StgClosure*)w); + nonmovingMarkDeadWeak(mark_queue, w); + } +} + +void nonmovingCollect(StgWeak **dead_weaks, StgTSO **resurrected_threads) +{ +#if defined(THREADED_RTS) + // We can't start a new collection until the old one has finished + // We also don't run in final GC + if (concurrent_coll_running || sched_state > SCHED_RUNNING) { + return; + } +#endif + + trace(TRACE_nonmoving_gc, "Starting nonmoving GC preparation"); + resizeGenerations(); + + nonmovingPrepareMark(); + + // N.B. These should have been cleared at the end of the last sweep. 
+ ASSERT(nonmoving_marked_large_objects == NULL); + ASSERT(n_nonmoving_marked_large_blocks == 0); + ASSERT(nonmoving_marked_compact_objects == NULL); + ASSERT(n_nonmoving_marked_compact_blocks == 0); + + MarkQueue *mark_queue = stgMallocBytes(sizeof(MarkQueue), "mark queue"); + initMarkQueue(mark_queue); + current_mark_queue = mark_queue; + + // Mark roots + trace(TRACE_nonmoving_gc, "Marking roots for nonmoving GC"); + markCAFs((evac_fn)markQueueAddRoot, mark_queue); + for (unsigned int n = 0; n < n_capabilities; ++n) { + markCapability((evac_fn)markQueueAddRoot, mark_queue, + capabilities[n], true/*don't mark sparks*/); + } + markScheduler((evac_fn)markQueueAddRoot, mark_queue); + nonmovingMarkWeakPtrList(mark_queue, *dead_weaks); + markStablePtrTable((evac_fn)markQueueAddRoot, mark_queue); + + // Mark threads resurrected during moving heap scavenging + for (StgTSO *tso = *resurrected_threads; tso != END_TSO_QUEUE; tso = tso->global_link) { + markQueuePushClosure_(mark_queue, (StgClosure*)tso); + } + trace(TRACE_nonmoving_gc, "Finished marking roots for nonmoving GC"); + + // Roots marked, mark threads and weak pointers + + // At this point all threads are moved to threads list (from old_threads) + // and all weaks are moved to weak_ptr_list (from old_weak_ptr_list) by + // the previous scavenge step, so we need to move them to "old" lists + // again. + + // Fine to override old_threads because any live or resurrected threads are + // moved to threads or resurrected_threads lists. + ASSERT(oldest_gen->old_threads == END_TSO_QUEUE); + ASSERT(nonmoving_old_threads == END_TSO_QUEUE); + nonmoving_old_threads = oldest_gen->threads; + oldest_gen->threads = END_TSO_QUEUE; + + // Make sure we don't lose any weak ptrs here. Weaks in old_weak_ptr_list + // will either be moved to `dead_weaks` (if dead) or `weak_ptr_list` (if + // alive). + ASSERT(oldest_gen->old_weak_ptr_list == NULL); + ASSERT(nonmoving_old_weak_ptr_list == NULL); + nonmoving_old_weak_ptr_list = oldest_gen->weak_ptr_list; + oldest_gen->weak_ptr_list = NULL; + trace(TRACE_nonmoving_gc, "Finished nonmoving GC preparation"); + + // We are now safe to start concurrent marking + + // Note that in concurrent mark we can't use dead_weaks and + // resurrected_threads from the preparation to add new weaks and threads as + // that would cause races between minor collection and mark. So we only pass + // those lists to mark function in sequential case. In concurrent case we + // allocate fresh lists. + +#if defined(THREADED_RTS) + // If we're interrupting or shutting down, do not let this capability go and + // run a STW collection. Reason: we won't be able to acquire this capability + // again for the sync if we let it go, because it'll immediately start doing + // a major GC, becuase that's what we do when exiting scheduler (see + // exitScheduler()). + if (sched_state == SCHED_RUNNING) { + concurrent_coll_running = true; + nonmoving_write_barrier_enabled = true; + debugTrace(DEBUG_nonmoving_gc, "Starting concurrent mark thread"); + createOSThread(&mark_thread, "non-moving mark thread", + nonmovingConcurrentMark, mark_queue); + } else { + nonmovingConcurrentMark(mark_queue); + } +#else + // Use the weak and thread lists from the preparation for any new weaks and + // threads found to be dead in mark. 
+ nonmovingMark_(mark_queue, dead_weaks, resurrected_threads); +#endif +} + +/* Mark mark queue, threads, and weak pointers until no more weaks have been + * resuscitated + */ +static void nonmovingMarkThreadsWeaks(MarkQueue *mark_queue) +{ + while (true) { + // Propagate marks + nonmovingMark(mark_queue); + + // Tidy threads and weaks + nonmovingTidyThreads(); + + if (! nonmovingTidyWeaks(mark_queue)) + return; + } +} + +#if defined(THREADED_RTS) +static void* nonmovingConcurrentMark(void *data) +{ + MarkQueue *mark_queue = (MarkQueue*)data; + StgWeak *dead_weaks = NULL; + StgTSO *resurrected_threads = (StgTSO*)&stg_END_TSO_QUEUE_closure; + nonmovingMark_(mark_queue, &dead_weaks, &resurrected_threads); + return NULL; +} + +// TODO: Not sure where to put this function. +// Append w2 to the end of w1. +static void appendWeakList( StgWeak **w1, StgWeak *w2 ) +{ + while (*w1) { + w1 = &(*w1)->link; + } + *w1 = w2; +} +#endif + +static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO **resurrected_threads) +{ + ACQUIRE_LOCK(&nonmoving_collection_mutex); + debugTrace(DEBUG_nonmoving_gc, "Starting mark..."); + + // Do concurrent marking; most of the heap will get marked here. + nonmovingMarkThreadsWeaks(mark_queue); + +#if defined(THREADED_RTS) + Task *task = newBoundTask(); + + // If at this point if we've decided to exit then just return + if (sched_state > SCHED_RUNNING) { + // Note that we break our invariants here and leave segments in + // nonmovingHeap.sweep_list, don't free nonmoving_large_objects etc. + // However because we won't be running mark-sweep in the final GC this + // is OK. + + // This is a RTS shutdown so we need to move our copy (snapshot) of + // weaks (nonmoving_old_weak_ptr_list and nonmoving_weak_ptr_list) to + // oldest_gen->threads to be able to run C finalizers in hs_exit_. Note + // that there may be more weaks added to oldest_gen->threads since we + // started mark, so we need to append our list to the tail of + // oldest_gen->threads. + appendWeakList(&nonmoving_old_weak_ptr_list, nonmoving_weak_ptr_list); + appendWeakList(&oldest_gen->weak_ptr_list, nonmoving_old_weak_ptr_list); + // These lists won't be used again so this is not necessary, but still + nonmoving_old_weak_ptr_list = NULL; + nonmoving_weak_ptr_list = NULL; + + goto finish; + } + + // We're still running, request a sync + nonmovingBeginFlush(task); + + bool all_caps_syncd; + do { + all_caps_syncd = nonmovingWaitForFlush(); + nonmovingMarkThreadsWeaks(mark_queue); + } while (!all_caps_syncd); +#endif + + nonmovingResurrectThreads(mark_queue, resurrected_threads); + + // No more resurrecting threads after this point + + // Do last marking of weak pointers + while (true) { + // Propagate marks + nonmovingMark(mark_queue); + + if (!nonmovingTidyWeaks(mark_queue)) + break; + } + + nonmovingMarkDeadWeaks(mark_queue, dead_weaks); + + // Propagate marks + nonmovingMark(mark_queue); + + // Now remove all dead objects from the mut_list to ensure that a younger + // generation collection doesn't attempt to look at them after we've swept. + nonmovingSweepMutLists(); + + debugTrace(DEBUG_nonmoving_gc, + "Done marking, resurrecting threads before releasing capabilities"); + + + // Schedule finalizers and resurrect threads +#if defined(THREADED_RTS) + // Just pick a random capability. Not sure if this is a good idea -- we use + // only one capability for all finalizers. + scheduleFinalizers(capabilities[0], *dead_weaks); + // Note that this mutates heap and causes running write barriers. 
+ // See Note [Unintentional marking in resurrectThreads] in NonMovingMark.c + // for how we deal with this. + resurrectThreads(*resurrected_threads); +#endif + +#if defined(DEBUG) + // Zap CAFs that we will sweep + nonmovingGcCafs(); +#endif + + ASSERT(mark_queue->top->head == 0); + ASSERT(mark_queue->blocks->link == NULL); + + // Update oldest_gen thread and weak lists + // Note that we need to append these lists as a concurrent minor GC may have + // added stuff to them while we're doing mark-sweep concurrently + { + StgTSO **threads = &oldest_gen->threads; + while (*threads != END_TSO_QUEUE) { + threads = &(*threads)->global_link; + } + *threads = nonmoving_threads; + nonmoving_threads = END_TSO_QUEUE; + nonmoving_old_threads = END_TSO_QUEUE; + } + + { + StgWeak **weaks = &oldest_gen->weak_ptr_list; + while (*weaks) { + weaks = &(*weaks)->link; + } + *weaks = nonmoving_weak_ptr_list; + nonmoving_weak_ptr_list = NULL; + nonmoving_old_weak_ptr_list = NULL; + } + + // Everything has been marked; allow the mutators to proceed +#if defined(THREADED_RTS) + nonmoving_write_barrier_enabled = false; + nonmovingFinishFlush(task); +#endif + + current_mark_queue = NULL; + freeMarkQueue(mark_queue); + stgFree(mark_queue); + + oldest_gen->live_estimate = nonmoving_live_words; + oldest_gen->n_old_blocks = 0; + resizeGenerations(); + + /**************************************************** + * Sweep + ****************************************************/ + + traceConcSweepBegin(); + + // Because we can't mark large object blocks (no room for mark bit) we + // collect them in a map in mark_queue and we pass it here to sweep large + // objects + nonmovingSweepLargeObjects(); + nonmovingSweepCompactObjects(); + nonmovingSweepStableNameTable(); + + nonmovingSweep(); + ASSERT(nonmovingHeap.sweep_list == NULL); + debugTrace(DEBUG_nonmoving_gc, "Finished sweeping."); + traceConcSweepEnd(); +#if defined(DEBUG) + if (RtsFlags.DebugFlags.nonmoving_gc) + nonmovingPrintAllocatorCensus(); +#endif + + // TODO: Remainder of things done by GarbageCollect (update stats) + +#if defined(THREADED_RTS) +finish: + boundTaskExiting(task); + + // We are done... + mark_thread = 0; + + // Signal that the concurrent collection is finished, allowing the next + // non-moving collection to proceed + concurrent_coll_running = false; + signalCondition(&concurrent_coll_finished); + RELEASE_LOCK(&nonmoving_collection_mutex); +#endif +} + +#if defined(DEBUG) + +// Use this with caution: this doesn't work correctly during scavenge phase +// when we're doing parallel scavenging. Use it in mark phase or later (where +// we don't allocate more anymore). 
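+// It accepts pointers that are not heap-allocated, large objects that carry
+// BF_NONMOVING (or are still a capability's pinned_object_block), and
+// addresses falling within one of the nonmoving heap's segments (sweep_list,
+// or an allocator's current, active, or filled lists); anything else barfs.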
+void assert_in_nonmoving_heap(StgPtr p) +{ + if (!HEAP_ALLOCED_GC(p)) + return; + + bdescr *bd = Bdescr(p); + if (bd->flags & BF_LARGE) { + // It should be in a capability (if it's not filled yet) or in non-moving heap + for (uint32_t cap = 0; cap < n_capabilities; ++cap) { + if (bd == capabilities[cap]->pinned_object_block) { + return; + } + } + ASSERT(bd->flags & BF_NONMOVING); + return; + } + + // Search snapshot segments + for (struct NonmovingSegment *seg = nonmovingHeap.sweep_list; seg; seg = seg->link) { + if (p >= (P_)seg && p < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) { + return; + } + } + + for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) { + struct NonmovingAllocator *alloca = nonmovingHeap.allocators[alloca_idx]; + // Search current segments + for (uint32_t cap_idx = 0; cap_idx < n_capabilities; ++cap_idx) { + struct NonmovingSegment *seg = alloca->current[cap_idx]; + if (p >= (P_)seg && p < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) { + return; + } + } + + // Search active segments + int seg_idx = 0; + struct NonmovingSegment *seg = alloca->active; + while (seg) { + if (p >= (P_)seg && p < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) { + return; + } + seg_idx++; + seg = seg->link; + } + + // Search filled segments + seg_idx = 0; + seg = alloca->filled; + while (seg) { + if (p >= (P_)seg && p < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) { + return; + } + seg_idx++; + seg = seg->link; + } + } + + // We don't search free segments as they're unused + + barf("%p is not in nonmoving heap\n", (void*)p); +} + +void nonmovingPrintSegment(struct NonmovingSegment *seg) +{ + int num_blocks = nonmovingSegmentBlockCount(seg); + uint8_t log_block_size = nonmovingSegmentLogBlockSize(seg); + + debugBelch("Segment with %d blocks of size 2^%d (%d bytes, %u words, scan: %p)\n", + num_blocks, + log_block_size, + 1 << log_block_size, + (unsigned int) ROUNDUP_BYTES_TO_WDS(1 << log_block_size), + (void*)Bdescr((P_)seg)->u.scan); + + for (nonmoving_block_idx p_idx = 0; p_idx < seg->next_free; ++p_idx) { + StgClosure *p = (StgClosure*)nonmovingSegmentGetBlock(seg, p_idx); + if (nonmovingGetMark(seg, p_idx) != 0) { + debugBelch("%d (%p)* :\t", p_idx, (void*)p); + } else { + debugBelch("%d (%p) :\t", p_idx, (void*)p); + } + printClosure(p); + } + + debugBelch("End of segment\n\n"); +} + +void nonmovingPrintAllocator(struct NonmovingAllocator *alloc) +{ + debugBelch("Allocator at %p\n", (void*)alloc); + debugBelch("Filled segments:\n"); + for (struct NonmovingSegment *seg = alloc->filled; seg != NULL; seg = seg->link) { + debugBelch("%p ", (void*)seg); + } + debugBelch("\nActive segments:\n"); + for (struct NonmovingSegment *seg = alloc->active; seg != NULL; seg = seg->link) { + debugBelch("%p ", (void*)seg); + } + debugBelch("\nCurrent segments:\n"); + for (uint32_t i = 0; i < n_capabilities; ++i) { + debugBelch("%p ", alloc->current[i]); + } + debugBelch("\n"); +} + +void locate_object(P_ obj) +{ + // Search allocators + for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) { + struct NonmovingAllocator *alloca = nonmovingHeap.allocators[alloca_idx]; + for (uint32_t cap = 0; cap < n_capabilities; ++cap) { + struct NonmovingSegment *seg = alloca->current[cap]; + if (obj >= (P_)seg && obj < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) { + debugBelch("%p is in current segment of capability %d of allocator %d at %p\n", obj, cap, alloca_idx, (void*)seg); + return; + } + } + int seg_idx = 0; + struct NonmovingSegment *seg = alloca->active; + while (seg) { + if (obj >= (P_)seg && 
obj < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) { + debugBelch("%p is in active segment %d of allocator %d at %p\n", obj, seg_idx, alloca_idx, (void*)seg); + return; + } + seg_idx++; + seg = seg->link; + } + + seg_idx = 0; + seg = alloca->filled; + while (seg) { + if (obj >= (P_)seg && obj < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) { + debugBelch("%p is in filled segment %d of allocator %d at %p\n", obj, seg_idx, alloca_idx, (void*)seg); + return; + } + seg_idx++; + seg = seg->link; + } + } + + struct NonmovingSegment *seg = nonmovingHeap.free; + int seg_idx = 0; + while (seg) { + if (obj >= (P_)seg && obj < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) { + debugBelch("%p is in free segment %d at %p\n", obj, seg_idx, (void*)seg); + return; + } + seg_idx++; + seg = seg->link; + } + + // Search nurseries + for (uint32_t nursery_idx = 0; nursery_idx < n_nurseries; ++nursery_idx) { + for (bdescr* nursery_block = nurseries[nursery_idx].blocks; nursery_block; nursery_block = nursery_block->link) { + if (obj >= nursery_block->start && obj <= nursery_block->start + nursery_block->blocks*BLOCK_SIZE_W) { + debugBelch("%p is in nursery %d\n", obj, nursery_idx); + return; + } + } + } + + // Search generations + for (uint32_t g = 0; g < RtsFlags.GcFlags.generations - 1; ++g) { + generation *gen = &generations[g]; + for (bdescr *blk = gen->blocks; blk; blk = blk->link) { + if (obj >= blk->start && obj < blk->free) { + debugBelch("%p is in generation %" FMT_Word32 " blocks\n", obj, g); + return; + } + } + for (bdescr *blk = gen->old_blocks; blk; blk = blk->link) { + if (obj >= blk->start && obj < blk->free) { + debugBelch("%p is in generation %" FMT_Word32 " old blocks\n", obj, g); + return; + } + } + } + + // Search large objects + for (uint32_t g = 0; g < RtsFlags.GcFlags.generations - 1; ++g) { + generation *gen = &generations[g]; + for (bdescr *large_block = gen->large_objects; large_block; large_block = large_block->link) { + if ((P_)large_block->start == obj) { + debugBelch("%p is in large blocks of generation %d\n", obj, g); + return; + } + } + } + + for (bdescr *large_block = nonmoving_large_objects; large_block; large_block = large_block->link) { + if ((P_)large_block->start == obj) { + debugBelch("%p is in nonmoving_large_objects\n", obj); + return; + } + } + + for (bdescr *large_block = nonmoving_marked_large_objects; large_block; large_block = large_block->link) { + if ((P_)large_block->start == obj) { + debugBelch("%p is in nonmoving_marked_large_objects\n", obj); + return; + } + } + + // Search workspaces FIXME only works in non-threaded runtime +#if !defined(THREADED_RTS) + for (uint32_t g = 0; g < RtsFlags.GcFlags.generations - 1; ++ g) { + gen_workspace *ws = &gct->gens[g]; + for (bdescr *blk = ws->todo_bd; blk; blk = blk->link) { + if (obj >= blk->start && obj < blk->free) { + debugBelch("%p is in generation %" FMT_Word32 " todo bds\n", obj, g); + return; + } + } + for (bdescr *blk = ws->scavd_list; blk; blk = blk->link) { + if (obj >= blk->start && obj < blk->free) { + debugBelch("%p is in generation %" FMT_Word32 " scavd bds\n", obj, g); + return; + } + } + for (bdescr *blk = ws->todo_large_objects; blk; blk = blk->link) { + if (obj >= blk->start && obj < blk->free) { + debugBelch("%p is in generation %" FMT_Word32 " todo large bds\n", obj, g); + return; + } + } + } +#endif +} + +void nonmovingPrintSweepList() +{ + debugBelch("==== SWEEP LIST =====\n"); + int i = 0; + for (struct NonmovingSegment *seg = nonmovingHeap.sweep_list; seg; seg = seg->link) { + debugBelch("%d: %p\n", i++, 
(void*)seg); + } + debugBelch("= END OF SWEEP LIST =\n"); +} + +void check_in_mut_list(StgClosure *p) +{ + for (uint32_t cap_n = 0; cap_n < n_capabilities; ++cap_n) { + for (bdescr *bd = capabilities[cap_n]->mut_lists[oldest_gen->no]; bd; bd = bd->link) { + for (StgPtr q = bd->start; q < bd->free; ++q) { + if (*((StgPtr**)q) == (StgPtr*)p) { + debugBelch("Object is in mut list of cap %d: %p\n", cap_n, capabilities[cap_n]->mut_lists[oldest_gen->no]); + return; + } + } + } + } + + debugBelch("Object is not in a mut list\n"); +} + +void print_block_list(bdescr* bd) +{ + while (bd) { + debugBelch("%p, ", (void*)bd); + bd = bd->link; + } + debugBelch("\n"); +} + +void print_thread_list(StgTSO* tso) +{ + while (tso != END_TSO_QUEUE) { + printClosure((StgClosure*)tso); + tso = tso->global_link; + } +} + +#endif diff --git a/rts/sm/NonMoving.h b/rts/sm/NonMoving.h new file mode 100644 index 0000000000000000000000000000000000000000..b3d4e14065cd718327b2fd16fa84b12a81e9a93b --- /dev/null +++ b/rts/sm/NonMoving.h @@ -0,0 +1,335 @@ +/* ----------------------------------------------------------------------------- + * + * (c) The GHC Team, 1998-2018 + * + * Non-moving garbage collector and allocator + * + * ---------------------------------------------------------------------------*/ + +#pragma once + +#if !defined(CMINUSMINUS) + +#include <string.h> +#include "HeapAlloc.h" +#include "NonMovingMark.h" + +#include "BeginPrivate.h" + +// Segments +#define NONMOVING_SEGMENT_BITS 15 // 2^15 = 32kByte +// Mask to find base of segment +#define NONMOVING_SEGMENT_MASK ((1 << NONMOVING_SEGMENT_BITS) - 1) +// In bytes +#define NONMOVING_SEGMENT_SIZE (1 << NONMOVING_SEGMENT_BITS) +// In words +#define NONMOVING_SEGMENT_SIZE_W ((1 << NONMOVING_SEGMENT_BITS) / SIZEOF_VOID_P) +// In blocks +#define NONMOVING_SEGMENT_BLOCKS (NONMOVING_SEGMENT_SIZE / BLOCK_SIZE) + +_Static_assert(NONMOVING_SEGMENT_SIZE % BLOCK_SIZE == 0, + "non-moving segment size must be multiple of block size"); + +// The index of a block within a segment +typedef uint16_t nonmoving_block_idx; + +// A non-moving heap segment +struct NonmovingSegment { + struct NonmovingSegment *link; // for linking together segments into lists + struct NonmovingSegment *todo_link; // NULL when not in todo list + nonmoving_block_idx next_free; // index of the next unallocated block + uint8_t bitmap[]; // liveness bitmap + // After the liveness bitmap comes the data blocks. Note that we need to + // ensure that the size of this struct (including the bitmap) is a multiple + // of the word size since GHC assumes that all object pointers are + // so-aligned. + + // N.B. There are also bits of information which are stored in the + // NonmovingBlockInfo stored in the segment's block descriptor. Namely: + // + // * the block size can be found in nonmovingBlockInfo(seg)->log_block_size. + // * the next_free snapshot can be found in + // nonmovingBlockInfo(seg)->next_free_snap. + // + // This allows us to mark a nonmoving closure without bringing the + // NonmovingSegment header into cache. +}; + +// This is how we mark end of todo lists. Not NULL because todo_link == NULL +// means segment is not in list. 
+#define END_NONMOVING_TODO_LIST ((struct NonmovingSegment*)1) + +// A non-moving allocator for a particular block size +struct NonmovingAllocator { + struct NonmovingSegment *filled; + struct NonmovingSegment *active; + // indexed by capability number + struct NonmovingSegment *current[]; +}; + +// first allocator is of size 2^NONMOVING_ALLOCA0 (in bytes) +#define NONMOVING_ALLOCA0 3 + +// allocators cover block sizes of 2^NONMOVING_ALLOCA0 to +// 2^(NONMOVING_ALLOCA0 + NONMOVING_ALLOCA_CNT) (in bytes) +#define NONMOVING_ALLOCA_CNT 12 + +// maximum number of free segments to hold on to +#define NONMOVING_MAX_FREE 16 + +struct NonmovingHeap { + struct NonmovingAllocator *allocators[NONMOVING_ALLOCA_CNT]; + // free segment list. This is a cache where we keep up to + // NONMOVING_MAX_FREE segments to avoid thrashing the block allocator. + // Note that segments in this list are still counted towards + // oldest_gen->n_blocks. + struct NonmovingSegment *free; + // how many segments in free segment list? accessed atomically. + unsigned int n_free; + + // records the current length of the nonmovingAllocator.current arrays + unsigned int n_caps; + + // The set of segments being swept in this GC. Segments are moved here from + // the filled list during preparation and moved back to either the filled, + // active, or free lists during sweep. Should be NULL before mark and + // after sweep. + struct NonmovingSegment *sweep_list; +}; + +extern struct NonmovingHeap nonmovingHeap; + +extern memcount nonmoving_live_words; + +#if defined(THREADED_RTS) +extern bool concurrent_coll_running; +#endif + +void nonmovingInit(void); +void nonmovingStop(void); +void nonmovingExit(void); + + +// dead_weaks and resurrected_threads lists are used for two things: +// +// - The weaks and threads in those lists are found to be dead during +// preparation, but the weaks will be used for finalization and threads will +// be scheduled again (aka. resurrection) so we need to keep them alive in the +// non-moving heap as well. So we treat them as roots and mark them. +// +// - In non-threaded runtime we add weaks and threads found to be dead in the +// non-moving heap to those lists so that they'll be finalized and scheduled +// as other weaks and threads. In threaded runtime we can't do this as that'd +// cause races between a minor collection and non-moving collection. Instead +// in non-moving heap we finalize the weaks and resurrect the threads +// directly, but in a pause. +// +void nonmovingCollect(StgWeak **dead_weaks, + StgTSO **resurrected_threads); + +void *nonmovingAllocate(Capability *cap, StgWord sz); +void nonmovingAddCapabilities(uint32_t new_n_caps); +void nonmovingPushFreeSegment(struct NonmovingSegment *seg); + + +INLINE_HEADER struct NonmovingSegmentInfo *nonmovingSegmentInfo(struct NonmovingSegment *seg) { + return &Bdescr((StgPtr) seg)->nonmoving_segment; +} + +INLINE_HEADER uint8_t nonmovingSegmentLogBlockSize(struct NonmovingSegment *seg) { + return nonmovingSegmentInfo(seg)->log_block_size; +} + +// Add a segment to the appropriate active list. 
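+// The push is lock-free: read the list head, link the segment in front of it,
+// and retry if the CAS finds that another thread has raced us.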
+INLINE_HEADER void nonmovingPushActiveSegment(struct NonmovingSegment *seg) +{ + struct NonmovingAllocator *alloc = + nonmovingHeap.allocators[nonmovingSegmentLogBlockSize(seg) - NONMOVING_ALLOCA0]; + while (true) { + struct NonmovingSegment *current_active = (struct NonmovingSegment*)VOLATILE_LOAD(&alloc->active); + seg->link = current_active; + if (cas((StgVolatilePtr) &alloc->active, (StgWord) current_active, (StgWord) seg) == (StgWord) current_active) { + break; + } + } +} + +// Add a segment to the appropriate filled list. +INLINE_HEADER void nonmovingPushFilledSegment(struct NonmovingSegment *seg) +{ + struct NonmovingAllocator *alloc = + nonmovingHeap.allocators[nonmovingSegmentLogBlockSize(seg) - NONMOVING_ALLOCA0]; + while (true) { + struct NonmovingSegment *current_filled = (struct NonmovingSegment*)VOLATILE_LOAD(&alloc->filled); + seg->link = current_filled; + if (cas((StgVolatilePtr) &alloc->filled, (StgWord) current_filled, (StgWord) seg) == (StgWord) current_filled) { + break; + } + } +} +// Assert that the pointer can be traced by the non-moving collector (e.g. in +// mark phase). This means one of the following: +// +// - A static object +// - A large object +// - An object in the non-moving heap (e.g. in one of the segments) +// +void assert_in_nonmoving_heap(StgPtr p); + +// The block size of a given segment in bytes. +INLINE_HEADER unsigned int nonmovingSegmentBlockSize(struct NonmovingSegment *seg) +{ + return 1 << nonmovingSegmentLogBlockSize(seg); +} + +// How many blocks does a segment with the given block size have? +INLINE_HEADER unsigned int nonmovingBlockCount(uint8_t log_block_size) +{ + unsigned int segment_data_size = NONMOVING_SEGMENT_SIZE - sizeof(struct NonmovingSegment); + segment_data_size -= segment_data_size % SIZEOF_VOID_P; + unsigned int blk_size = 1 << log_block_size; + // N.B. +1 accounts for the byte in the mark bitmap. + return segment_data_size / (blk_size + 1); +} + +unsigned int nonmovingBlockCountFromSize(uint8_t log_block_size); + +// How many blocks does the given segment contain? Also the size of the bitmap. +INLINE_HEADER unsigned int nonmovingSegmentBlockCount(struct NonmovingSegment *seg) +{ + return nonmovingBlockCountFromSize(nonmovingSegmentLogBlockSize(seg)); +} + +// Get a pointer to the given block index assuming that the block size is as +// given (avoiding a potential cache miss when this information is already +// available). The log_block_size argument must be equal to seg->block_size. +INLINE_HEADER void *nonmovingSegmentGetBlock_(struct NonmovingSegment *seg, uint8_t log_block_size, nonmoving_block_idx i) +{ + ASSERT(log_block_size == nonmovingSegmentLogBlockSize(seg)); + // Block size in bytes + unsigned int blk_size = 1 << log_block_size; + // Bitmap size in bytes + W_ bitmap_size = nonmovingBlockCountFromSize(log_block_size) * sizeof(uint8_t); + // Where the actual data starts (address of the first block). + // Use ROUNDUP_BYTES_TO_WDS to align to word size. Note that + // ROUNDUP_BYTES_TO_WDS returns in _words_, not in _bytes_, so convert it back + // back to bytes by multiplying with word size. + W_ data = ROUNDUP_BYTES_TO_WDS(((W_)seg) + sizeof(struct NonmovingSegment) + bitmap_size) * sizeof(W_); + return (void*)(data + i*blk_size); +} + +// Get a pointer to the given block index. +INLINE_HEADER void *nonmovingSegmentGetBlock(struct NonmovingSegment *seg, nonmoving_block_idx i) +{ + return nonmovingSegmentGetBlock_(seg, nonmovingSegmentLogBlockSize(seg), i); +} + +// Get the segment which a closure resides in. 
Assumes that pointer points into +// non-moving heap. +INLINE_HEADER struct NonmovingSegment *nonmovingGetSegment_unchecked(StgPtr p) +{ + const uintptr_t mask = ~NONMOVING_SEGMENT_MASK; + return (struct NonmovingSegment *) (((uintptr_t) p) & mask); +} + +INLINE_HEADER struct NonmovingSegment *nonmovingGetSegment(StgPtr p) +{ + ASSERT(HEAP_ALLOCED_GC(p) && (Bdescr(p)->flags & BF_NONMOVING)); + return nonmovingGetSegment_unchecked(p); +} + +INLINE_HEADER nonmoving_block_idx nonmovingGetBlockIdx(StgPtr p) +{ + ASSERT(HEAP_ALLOCED_GC(p) && (Bdescr(p)->flags & BF_NONMOVING)); + struct NonmovingSegment *seg = nonmovingGetSegment(p); + ptrdiff_t blk0 = (ptrdiff_t)nonmovingSegmentGetBlock(seg, 0); + ptrdiff_t offset = (ptrdiff_t)p - blk0; + return (nonmoving_block_idx) (offset >> nonmovingSegmentLogBlockSize(seg)); +} + +// TODO: Eliminate this +extern uint8_t nonmovingMarkEpoch; + +INLINE_HEADER void nonmovingSetMark(struct NonmovingSegment *seg, nonmoving_block_idx i) +{ + seg->bitmap[i] = nonmovingMarkEpoch; +} + +INLINE_HEADER uint8_t nonmovingGetMark(struct NonmovingSegment *seg, nonmoving_block_idx i) +{ + return seg->bitmap[i]; +} + +INLINE_HEADER void nonmovingSetClosureMark(StgPtr p) +{ + nonmovingSetMark(nonmovingGetSegment(p), nonmovingGetBlockIdx(p)); +} + +// TODO: Audit the uses of these +/* Was the given closure marked this major GC cycle? */ +INLINE_HEADER bool nonmovingClosureMarkedThisCycle(StgPtr p) +{ + struct NonmovingSegment *seg = nonmovingGetSegment(p); + nonmoving_block_idx blk_idx = nonmovingGetBlockIdx(p); + return nonmovingGetMark(seg, blk_idx) == nonmovingMarkEpoch; +} + +INLINE_HEADER bool nonmovingClosureMarked(StgPtr p) +{ + struct NonmovingSegment *seg = nonmovingGetSegment(p); + nonmoving_block_idx blk_idx = nonmovingGetBlockIdx(p); + return nonmovingGetMark(seg, blk_idx) != 0; +} + +// Can be called during a major collection to determine whether a particular +// segment is in the set of segments that will be swept this collection cycle. +INLINE_HEADER bool nonmovingSegmentBeingSwept(struct NonmovingSegment *seg) +{ + struct NonmovingSegmentInfo *seginfo = nonmovingSegmentInfo(seg); + unsigned int n = nonmovingBlockCountFromSize(seginfo->log_block_size); + return seginfo->next_free_snap >= n; +} + +// Can be called during a major collection to determine whether a particular +// closure lives in a segment that will be swept this collection cycle. +// Note that this returns true for both large and normal objects. 
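+
+// A rough worked example of the address arithmetic above (illustrative
+// addresses only): segments are 2^15 bytes = 32 kB and 32 kB-aligned, so
+// for a closure at p = 0x42009a40, nonmovingGetSegment_unchecked clears the
+// low 15 bits and yields seg = 0x42008000. If that segment holds 256-byte
+// blocks (log_block_size = 8) and its block 0 happens to start at
+// 0x42008098 (segment header plus mark bitmap, rounded up to a word), then
+// nonmovingGetBlockIdx gives (0x42009a40 - 0x42008098) >> 8 = 25 and
+// nonmovingSetClosureMark sets seg->bitmap[25] = nonmovingMarkEpoch.
+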
+INLINE_HEADER bool nonmovingClosureBeingSwept(StgClosure *p) +{ + bdescr *bd = Bdescr((StgPtr) p); + if (HEAP_ALLOCED_GC(p)) { + if (bd->flags & BF_NONMOVING_SWEEPING) { + return true; + } else if (bd->flags & BF_NONMOVING) { + struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) p); + return nonmovingSegmentBeingSwept(seg); + } else { + // outside of the nonmoving heap + return false; + } + } else { + // a static object + return true; + } +} + +INLINE_HEADER bool isNonmovingClosure(StgClosure *p) +{ + return !HEAP_ALLOCED_GC(p) || Bdescr((P_)p)->flags & BF_NONMOVING; +} + +#if defined(DEBUG) + +void nonmovingPrintSegment(struct NonmovingSegment *seg); +void nonmovingPrintAllocator(struct NonmovingAllocator *alloc); +void locate_object(P_ obj); +void nonmovingPrintSweepList(void); +// Check if the object is in one of non-moving heap mut_lists +void check_in_mut_list(StgClosure *p); +void print_block_list(bdescr *bd); +void print_thread_list(StgTSO* tso); + +#endif + +#include "EndPrivate.h" + +#endif // CMINUSMINUS diff --git a/rts/sm/NonMovingCensus.c b/rts/sm/NonMovingCensus.c new file mode 100644 index 0000000000000000000000000000000000000000..670d51263cf8898f779bfc5e1464a17c9c3cf4a4 --- /dev/null +++ b/rts/sm/NonMovingCensus.c @@ -0,0 +1,129 @@ +/* ----------------------------------------------------------------------------- + * + * (c) The GHC Team, 1998-2018 + * + * Non-moving garbage collector and allocator: Accounting census + * + * This is a simple space accounting census useful for characterising + * fragmentation in the nonmoving heap. + * + * ---------------------------------------------------------------------------*/ + +#include "Rts.h" +#include "NonMoving.h" +#include "Trace.h" +#include "NonMovingCensus.h" + +// N.B. This may miss segments in the event of concurrent mutation (e.g. if a +// mutator retires its current segment to the filled list). +// +// all_stopped is whether we can guarantee that all mutators and minor GCs are +// stopped. In this case is safe to look at active and current segments so we can +// also collect statistics on live words. 
+static inline struct NonmovingAllocCensus +nonmovingAllocatorCensus_(struct NonmovingAllocator *alloc, bool collect_live_words) +{ + struct NonmovingAllocCensus census = {0, 0, 0, 0}; + + for (struct NonmovingSegment *seg = alloc->filled; + seg != NULL; + seg = seg->link) + { + unsigned int n = nonmovingSegmentBlockCount(seg); + census.n_filled_segs++; + census.n_live_blocks += n; + if (collect_live_words) { + for (unsigned int i=0; i < n; i++) { + StgClosure *c = (StgClosure *) nonmovingSegmentGetBlock(seg, i); + census.n_live_words += closure_sizeW(c); + } + } + } + + for (struct NonmovingSegment *seg = alloc->active; + seg != NULL; + seg = seg->link) + { + census.n_active_segs++; + unsigned int n = nonmovingSegmentBlockCount(seg); + for (unsigned int i=0; i < n; i++) { + if (nonmovingGetMark(seg, i)) { + StgClosure *c = (StgClosure *) nonmovingSegmentGetBlock(seg, i); + if (collect_live_words) + census.n_live_words += closure_sizeW(c); + census.n_live_blocks++; + } + } + } + + for (unsigned int cap=0; cap < n_capabilities; cap++) + { + struct NonmovingSegment *seg = alloc->current[cap]; + unsigned int n = nonmovingSegmentBlockCount(seg); + for (unsigned int i=0; i < n; i++) { + if (nonmovingGetMark(seg, i)) { + StgClosure *c = (StgClosure *) nonmovingSegmentGetBlock(seg, i); + if (collect_live_words) + census.n_live_words += closure_sizeW(c); + census.n_live_blocks++; + } + } + } + return census; +} + +/* This must not be used when mutators are active since it assumes that + * all blocks in nonmoving heap are valid closures. + */ +struct NonmovingAllocCensus +nonmovingAllocatorCensusWithWords(struct NonmovingAllocator *alloc) +{ + return nonmovingAllocatorCensus_(alloc, true); +} + +struct NonmovingAllocCensus +nonmovingAllocatorCensus(struct NonmovingAllocator *alloc) +{ + return nonmovingAllocatorCensus_(alloc, false); +} + + +void nonmovingPrintAllocatorCensus() +{ + if (!RtsFlags.GcFlags.useNonmoving) + return; + + for (int i=0; i < NONMOVING_ALLOCA_CNT; i++) { + struct NonmovingAllocCensus census = + nonmovingAllocatorCensus(nonmovingHeap.allocators[i]); + + uint32_t blk_size = 1 << (i + NONMOVING_ALLOCA0); + // We define occupancy as the fraction of space that is used for useful + // data (that is, live and not slop). 
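+        // For instance (illustrative numbers, not measured data): for the
+        // 16-byte class on a 64-bit machine, 1000 live blocks holding 1500
+        // live words give 100.0 * 1500 * 8 / (1000 * 16) = 75% occupancy;
+        // the remaining 25% is slop from rounding closure sizes up to the
+        // block size.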
+ double occupancy = 100.0 * census.n_live_words * sizeof(W_) + / (census.n_live_blocks * blk_size); + if (census.n_live_blocks == 0) occupancy = 100; + (void) occupancy; // silence warning if !DEBUG + debugTrace(DEBUG_nonmoving_gc, "Allocator %d (%d bytes - %d bytes): " + "%d active segs, %d filled segs, %d live blocks, %d live words " + "(%2.1f%% occupancy)", + i, 1 << (i + NONMOVING_ALLOCA0 - 1), 1 << (i + NONMOVING_ALLOCA0), + census.n_active_segs, census.n_filled_segs, census.n_live_blocks, census.n_live_words, + occupancy); + } +} + +void nonmovingTraceAllocatorCensus() +{ +#if defined(TRACING) + if (!RtsFlags.GcFlags.useNonmoving && !TRACE_nonmoving_gc) + return; + + for (int i=0; i < NONMOVING_ALLOCA_CNT; i++) { + const struct NonmovingAllocCensus census = + nonmovingAllocatorCensus(nonmovingHeap.allocators[i]); + const uint32_t log_blk_size = i + NONMOVING_ALLOCA0; + traceNonmovingHeapCensus(log_blk_size, &census); + } +#endif +} diff --git a/rts/sm/NonMovingCensus.h b/rts/sm/NonMovingCensus.h new file mode 100644 index 0000000000000000000000000000000000000000..7a66dc9b699064b86a212d0b54da8063e76a428b --- /dev/null +++ b/rts/sm/NonMovingCensus.h @@ -0,0 +1,28 @@ +/* ----------------------------------------------------------------------------- + * + * (c) The GHC Team, 1998-2018 + * + * Non-moving garbage collector and allocator: Accounting census + * + * ---------------------------------------------------------------------------*/ + +#pragma once + +#include "NonMoving.h" + +struct NonmovingAllocCensus { + uint32_t n_active_segs; + uint32_t n_filled_segs; + uint32_t n_live_blocks; + uint32_t n_live_words; +}; + + +struct NonmovingAllocCensus +nonmovingAllocatorCensusWithWords(struct NonmovingAllocator *alloc); + +struct NonmovingAllocCensus +nonmovingAllocatorCensus(struct NonmovingAllocator *alloc); + +void nonmovingPrintAllocatorCensus(void); +void nonmovingTraceAllocatorCensus(void); diff --git a/rts/sm/NonMovingMark.c b/rts/sm/NonMovingMark.c new file mode 100644 index 0000000000000000000000000000000000000000..03e342806a6786ba42de5e430943ef08f0eb4c1b --- /dev/null +++ b/rts/sm/NonMovingMark.c @@ -0,0 +1,1958 @@ +/* ----------------------------------------------------------------------------- + * + * (c) The GHC Team, 1998-2018 + * + * Non-moving garbage collector and allocator: Mark phase + * + * ---------------------------------------------------------------------------*/ + +#include "Rts.h" +// We call evacuate, which expects the thread-local gc_thread to be valid; +// This is sometimes declared as a register variable therefore it is necessary +// to include the declaration so that the compiler doesn't clobber the register. +#include "NonMovingMark.h" +#include "NonMovingShortcut.h" +#include "NonMoving.h" +#include "BlockAlloc.h" /* for countBlocks */ +#include "HeapAlloc.h" +#include "Task.h" +#include "Trace.h" +#include "HeapUtils.h" +#include "Printer.h" +#include "Schedule.h" +#include "Weak.h" +#include "STM.h" +#include "MarkWeak.h" +#include "sm/Storage.h" +#include "CNF.h" + +static void mark_closure (MarkQueue *queue, const StgClosure *p, StgClosure **origin); +static void mark_tso (MarkQueue *queue, StgTSO *tso); +static void mark_stack (MarkQueue *queue, StgStack *stack); +static void mark_PAP_payload (MarkQueue *queue, + StgClosure *fun, + StgClosure **payload, + StgWord size); + +// How many Array# entries to add to the mark queue at once? 
+#define MARK_ARRAY_CHUNK_LENGTH 128 + +/* Note [Large objects in the non-moving collector] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * The nonmoving collector keeps a separate list of its large objects, apart from + * oldest_gen->large_objects. There are two reasons for this: + * + * 1. oldest_gen is mutated by minor collections, which happen concurrently with + * marking + * 2. the non-moving collector needs a consistent picture + * + * At the beginning of a major collection, nonmovingCollect takes the objects in + * oldest_gen->large_objects (which includes all large objects evacuated by the + * moving collector) and adds them to nonmoving_large_objects. This is the set + * of large objects that will being collected in the current major GC cycle. + * + * As the concurrent mark phase proceeds, the large objects in + * nonmoving_large_objects that are found to be live are moved to + * nonmoving_marked_large_objects. During sweep we discard all objects that remain + * in nonmoving_large_objects and move everything in nonmoving_marked_larged_objects + * back to nonmoving_large_objects. + * + * During minor collections large objects will accumulate on + * oldest_gen->large_objects, where they will be picked up by the nonmoving + * collector and moved to nonmoving_large_objects during the next major GC. + * When this happens the block gets its BF_NONMOVING_SWEEPING flag set to + * indicate that it is part of the snapshot and consequently should be marked by + * the nonmoving mark phase.. + */ + +bdescr *nonmoving_large_objects = NULL; +bdescr *nonmoving_marked_large_objects = NULL; +memcount n_nonmoving_large_blocks = 0; +memcount n_nonmoving_marked_large_blocks = 0; + +bdescr *nonmoving_compact_objects = NULL; +bdescr *nonmoving_marked_compact_objects = NULL; +memcount n_nonmoving_compact_blocks = 0; +memcount n_nonmoving_marked_compact_blocks = 0; + +#if defined(THREADED_RTS) +/* Protects everything above. Furthermore, we only set the BF_MARKED bit of + * large object blocks when this is held. This ensures that the write barrier + * (e.g. finish_upd_rem_set_mark) and the collector (mark_closure) don't try to + * move the same large object to nonmoving_marked_large_objects more than once. + */ +static Mutex nonmoving_large_objects_mutex; +// Note that we don't need a similar lock for compact objects becuase we never +// mark a compact object eagerly in a write barrier; all compact objects are +// marked by the mark thread, so there can't be any races here. +#endif + +/* + * Where we keep our threads during collection since we must have a snapshot of + * the threads that lived in the nonmoving heap at the time that the snapshot + * was taken to safely resurrect. + */ +StgTSO *nonmoving_old_threads = END_TSO_QUEUE; +/* Same for weak pointers */ +StgWeak *nonmoving_old_weak_ptr_list = NULL; +/* Because we can "tidy" thread and weak lists concurrently with a minor GC we + * need to move marked threads and weaks to these lists until we pause for sync. + * Then we move them to oldest_gen lists. */ +StgTSO *nonmoving_threads = END_TSO_QUEUE; +StgWeak *nonmoving_weak_ptr_list = NULL; + +#if defined(DEBUG) +// TODO (osa): Document +StgIndStatic *debug_caf_list_snapshot = (StgIndStatic*)END_OF_CAF_LIST; +#endif + +/* Note [Update remembered set] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * The concurrent non-moving collector uses a remembered set to ensure + * that its marking is consistent with the snapshot invariant defined in + * the design. 
This remembered set, known as the update remembered set, + * records all pointers that have been overwritten since the beginning + * of the concurrent mark. This ensures that concurrent mutation cannot hide + * pointers to live objects from the nonmoving garbage collector. + * + * The update remembered set is maintained via a write barrier that + * is enabled whenever a concurrent mark is active. This write barrier + * can be found in a number of places: + * + * - In rts/Primops.cmm in primops responsible for modifying mutable closures + * (e.g. MVARs, MUT_VARs, etc.) + * + * - In rts/STM.c, where + * + * - In the dirty_* functions found in rts/Storage.c where we dirty MVARs, + * MUT_VARs, TSOs and STACKs. STACK is a somewhat special case, as described + * in Note [StgStack dirtiness flags and concurrent marking] in TSO.h. + * + * - In the code generated by the STG code generator for pointer array writes + * + * - In thunk updates (e.g. updateWithIndirection) to ensure that the free + * variables of the original thunk remain reachable. + * + * There is also a read barrier to handle weak references, as described in + * Note [Concurrent read barrier on deRefWeak#]. + * + * The representation of the update remembered set is the same as that of + * the mark queue. For efficiency, each capability maintains its own local + * accumulator of remembered set entries. When a capability fills its + * accumulator it is linked in to the global remembered set + * (upd_rem_set_block_list), where it is consumed by the mark phase. + * + * The mark phase is responsible for freeing update remembered set block + * allocations. + * + * Note that we eagerly flush update remembered sets during minor GCs as + * described in Note [Eager update remembered set flushing]. + * + * + * Note [Eager update remembered set flushing] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * We eagerly flush update remembered sets during minor GCs to avoid scenarios + * like the following which could result in long sync pauses: + * + * 1. We start a major GC, all thread stacks are added to the mark queue. + * 2. The concurrent mark thread starts. + * 3. The mutator is allowed to resume. One mutator thread T is scheduled and marks its + * stack to its local update remembered set. + * 4. The mark thread eventually encounters the mutator thread's stack but + * sees that it has already been marked; skips it. + * 5. Thread T continues running but does not push enough to its update + * remembered set to require a flush. + * 6. Eventually the mark thread finished marking and requests a final sync. + * 7. The thread T flushes its update remembered set. + * 8. We find that a large fraction of the heap (namely the subset that is + * only reachable from the thread T's stack) needs to be marked, incurring + * a large sync pause + * + * We avoid this by periodically (during minor GC) forcing a flush of the + * update remembered set. + * + * A better (but more complex) approach that would be worthwhile trying in the + * future would be to rather do the following: + * + * 1. Concurrent mark phase is on-going + * 2. Mark thread runs out of things to mark + * 3. Mark thread sends a signal to capabilities requesting that they send + * their update remembered sets without suspending their execution + * 4. The mark thread marks everything it was sent; runs out of things to mark + * 5. Mark thread initiates a sync + * 6. Capabilities send their final update remembered sets and suspend execution + * 7. Mark thread marks everything is was sent + * 8. 
Mark thead allows capabilities to resume. + * + * However, this is obviously a fair amount of complexity and so far the + * periodic eager flushing approach has been sufficient. + * + * + * Note [Concurrent read barrier on deRefWeak#] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * In general the non-moving GC assumes that all pointers reachable from a + * marked object are themselves marked (or in the mark queue). However, + * weak pointers are an obvious exception to this rule. In particular, + * deRefWeakPtr# allows the mutator to turn a weak reference into a strong + * reference. This interacts badly with concurrent collection. For + * instance, consider this program: + * + * f :: a -> b -> IO b + * f k v = do + * -- assume that k and v are the only references to the + * -- closures to which they refer. + * weak <- mkWeakPtr k v Nothing + * + * -- N.B. k is now technically dead since the only reference to it is + * -- weak, but we've not yet had a chance to tombstone the WeakPtr + * -- (which will happen in the course of major GC). + * performMajorGC + * -- Now we are running concurrently with the mark... + + * Just x <- deRefWeak weak + * -- We have now introduced a reference to `v`, which will + * -- not be marked as the only reference to `v` when the snapshot was + * -- taken is via a WeakPtr. + * return x + * + */ +static Mutex upd_rem_set_lock; +bdescr *upd_rem_set_block_list = NULL; + +#if defined(THREADED_RTS) +/* Used during the mark/sweep phase transition to track how many capabilities + * have pushed their update remembered sets. Protected by upd_rem_set_lock. + */ +static volatile StgWord upd_rem_set_flush_count = 0; +#endif + + +/* Signaled by each capability when it has flushed its update remembered set */ +static Condition upd_rem_set_flushed_cond; + +/* Indicates to mutators that the write barrier must be respected. Set while + * concurrent mark is running. + */ +StgWord nonmoving_write_barrier_enabled = false; + +/* Used to provide the current mark queue to the young generation + * collector for scavenging. + */ +MarkQueue *current_mark_queue = NULL; + +/* Initialise update remembered set data structures */ +void nonmovingMarkInitUpdRemSet() { + initMutex(&upd_rem_set_lock); + initCondition(&upd_rem_set_flushed_cond); +#if defined(THREADED_RTS) + initMutex(&nonmoving_large_objects_mutex); +#endif +} + +#if defined(THREADED_RTS) && defined(DEBUG) +static uint32_t markQueueLength(MarkQueue *q); +#endif +static void init_mark_queue_(MarkQueue *queue); + +/* Transfers the given capability's update-remembered set to the global + * remembered set. + * + * Really the argument type should be UpdRemSet* but this would be rather + * inconvenient without polymorphism. + */ +void nonmovingAddUpdRemSetBlocks(MarkQueue *rset) +{ + if (markQueueIsEmpty(rset)) return; + + // find the tail of the queue + bdescr *start = rset->blocks; + bdescr *end = start; + while (end->link != NULL) + end = end->link; + + // add the blocks to the global remembered set + ACQUIRE_LOCK(&upd_rem_set_lock); + end->link = upd_rem_set_block_list; + upd_rem_set_block_list = start; + RELEASE_LOCK(&upd_rem_set_lock); + + // Reset remembered set + ACQUIRE_SM_LOCK; + init_mark_queue_(rset); + rset->is_upd_rem_set = true; + RELEASE_SM_LOCK; +} + +#if defined(THREADED_RTS) +/* Called by capabilities to flush their update remembered sets when + * synchronising with the non-moving collector as it transitions from mark to + * sweep phase. 
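+ *
+ * A sketch of the handshake, as implemented below and in
+ * nonmovingWaitForFlush:
+ *
+ *   capability: nonmovingAddUpdRemSetBlocks(&cap->upd_rem_set.queue);
+ *               atomic_inc(&upd_rem_set_flush_count, 1);
+ *               signalCondition(&upd_rem_set_flushed_cond);
+ *   collector:  wait until upd_rem_set_flush_count == n_capabilities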
+ */ +void nonmovingFlushCapUpdRemSetBlocks(Capability *cap) +{ + debugTrace(DEBUG_nonmoving_gc, + "Capability %d flushing update remembered set: %d", + cap->no, markQueueLength(&cap->upd_rem_set.queue)); + traceConcUpdRemSetFlush(cap); + nonmovingAddUpdRemSetBlocks(&cap->upd_rem_set.queue); + atomic_inc(&upd_rem_set_flush_count, 1); + signalCondition(&upd_rem_set_flushed_cond); + // After this mutation will remain suspended until nonmovingFinishFlush + // releases its capabilities. +} + +/* Request that all capabilities flush their update remembered sets and suspend + * execution until the further notice. + */ +void nonmovingBeginFlush(Task *task) +{ + debugTrace(DEBUG_nonmoving_gc, "Starting update remembered set flush..."); + traceConcSyncBegin(); + upd_rem_set_flush_count = 0; + stopAllCapabilitiesWith(NULL, task, SYNC_FLUSH_UPD_REM_SET); + + // XXX: We may have been given a capability via releaseCapability (i.e. a + // task suspended due to a foreign call) in which case our requestSync + // logic won't have been hit. Make sure that everyone so far has flushed. + // Ideally we want to mark asynchronously with syncing. + for (uint32_t i = 0; i < n_capabilities; i++) { + nonmovingFlushCapUpdRemSetBlocks(capabilities[i]); + } +} + +/* Wait until a capability has flushed its update remembered set. Returns true + * if all capabilities have flushed. + */ +bool nonmovingWaitForFlush() +{ + ACQUIRE_LOCK(&upd_rem_set_lock); + debugTrace(DEBUG_nonmoving_gc, "Flush count %d", upd_rem_set_flush_count); + bool finished = upd_rem_set_flush_count == n_capabilities; + if (!finished) { + waitCondition(&upd_rem_set_flushed_cond, &upd_rem_set_lock); + } + RELEASE_LOCK(&upd_rem_set_lock); + return finished; +} + +/* Note [Unintentional marking in resurrectThreads] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * In both moving and non-moving collectors threads found to be unreachable are + * evacuated/marked and then resurrected with resurrectThreads. resurrectThreads + * raises an exception in the unreachable thread via raiseAsync, which does + * mutations on the heap. These mutations cause adding stuff to UpdRemSet of the + * thread's capability. Here's an example backtrace where this happens: + * + * #0 updateRemembSetPushClosure + * #1 0x000000000072b363 in dirty_TVAR + * #2 0x00000000007162e5 in remove_watch_queue_entries_for_trec + * #3 0x0000000000717098 in stmAbortTransaction + * #4 0x000000000070c6eb in raiseAsync + * #5 0x000000000070b473 in throwToSingleThreaded__ + * #6 0x000000000070b4ab in throwToSingleThreaded + * #7 0x00000000006fce82 in resurrectThreads + * #8 0x00000000007215db in nonmovingMark_ + * #9 0x0000000000721438 in nonmovingConcurrentMark + * #10 0x00007f1ee81cd6db in start_thread + * #11 0x00007f1ee850688f in clone + * + * However we don't really want to run write barriers when calling + * resurrectThreads here, because we're in a GC pause, and overwritten values + * are definitely gone forever (as opposed to being inserted in a marked object + * or kept in registers and used later). + * + * When this happens, if we don't reset the UpdRemSets, what happens is in the + * next mark we see these objects that were added in previous mark's + * resurrectThreads in UpdRemSets, and mark those. This causes keeping + * unreachable objects alive, and effects weak finalization and thread resurrect + * (which rely on things become unreachable). 
As an example, stm048 fails when + * we get this wrong, because when we do raiseAsync on a thread that was blocked + * on an STM transaction we mutate a TVAR_WATCH_QUEUE, which has a reference to + * the TSO that was running the STM transaction. If the TSO becomes unreachable + * again in the next GC we don't realize this, because it was added to an + * UpdRemSet in the previous GC's mark phase, because of raiseAsync. + * + * To fix this we clear all UpdRemSets in nonmovingFinishFlush, right before + * releasing capabilities. This is somewhat inefficient (we allow adding objects + * to UpdRemSets, only to later reset them), but the only case where we add to + * UpdRemSets during mark is resurrectThreads, and I don't think we do so many + * resurrection in a thread that we fill UpdRemSets and allocate new blocks. So + * pushing an UpdRemSet in this case is really fast, and resetting is even + * faster (we just update a pointer). + * + * TODO (osa): What if we actually marked UpdRemSets in this case, in the mark + * loop? Would that work? Or what would break? + */ + +/* Notify capabilities that the synchronisation is finished; they may resume + * execution. + */ +void nonmovingFinishFlush(Task *task) +{ + // See Note [Unintentional marking in resurrectThreads] + for (uint32_t i = 0; i < n_capabilities; i++) { + reset_upd_rem_set(&capabilities[i]->upd_rem_set); + } + // Also reset upd_rem_set_block_list in case some of the UpdRemSets were + // filled and we flushed them. + freeChain_lock(upd_rem_set_block_list); + upd_rem_set_block_list = NULL; + + debugTrace(DEBUG_nonmoving_gc, "Finished update remembered set flush..."); + traceConcSyncEnd(); + releaseAllCapabilities(n_capabilities, NULL, task); +} +#endif + +/********************************************************* + * Pushing to either the mark queue or remembered set + *********************************************************/ + +STATIC_INLINE void +push (MarkQueue *q, const MarkQueueEnt *ent) +{ + // Are we at the end of the block? + if (q->top->head == MARK_QUEUE_BLOCK_ENTRIES) { + // Yes, this block is full. + if (q->is_upd_rem_set) { + nonmovingAddUpdRemSetBlocks(q); + } else { + // allocate a fresh block. + ACQUIRE_SM_LOCK; + bdescr *bd = allocGroup(MARK_QUEUE_BLOCKS); + bd->link = q->blocks; + q->blocks = bd; + q->top = (MarkQueueBlock *) bd->start; + q->top->head = 0; + RELEASE_SM_LOCK; + } + } + + q->top->entries[q->top->head] = *ent; + q->top->head++; +} + +/* A variant of push to be used by the minor GC when it encounters a reference + * to an object in the non-moving heap. In contrast to the other push + * operations this uses the gc_alloc_block_sync spinlock instead of the + * SM_LOCK to allocate new blocks in the event that the mark queue is full. + */ +void +markQueuePushClosureGC (MarkQueue *q, StgClosure *p) +{ + /* We should not make it here if we are doing a deadlock detect GC. + * See Note [Deadlock detection under nonmoving collector]. + */ + ASSERT(!deadlock_detect_gc); + + // Are we at the end of the block? + if (q->top->head == MARK_QUEUE_BLOCK_ENTRIES) { + // Yes, this block is full. + // allocate a fresh block. 
+ ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); + bdescr *bd = allocGroup(MARK_QUEUE_BLOCKS); + bd->link = q->blocks; + q->blocks = bd; + q->top = (MarkQueueBlock *) bd->start; + q->top->head = 0; + RELEASE_SPIN_LOCK(&gc_alloc_block_sync); + } + + MarkQueueEnt ent = { + .mark_closure = { + .p = UNTAG_CLOSURE(p), + .origin = NULL, + } + }; + q->top->entries[q->top->head] = ent; + q->top->head++; +} + +static inline +void push_closure (MarkQueue *q, + StgClosure *p, + StgClosure **origin) +{ +#if defined(DEBUG) + ASSERT(!HEAP_ALLOCED_GC(p) || (Bdescr((StgPtr) p)->gen == oldest_gen)); + ASSERT(LOOKS_LIKE_CLOSURE_PTR(p)); + // Commenting out: too slow + // if (RtsFlags.DebugFlags.sanity) { + // assert_in_nonmoving_heap((P_)p); + // if (origin) + // assert_in_nonmoving_heap((P_)origin); + // } +#endif + + // This must be true as origin points to a pointer and therefore must be + // word-aligned. However, we check this as otherwise we would confuse this + // with a mark_array entry + ASSERT(((uintptr_t) origin & 0x3) == 0); + + MarkQueueEnt ent = { + .mark_closure = { + .p = p, + .origin = origin, + } + }; + push(q, &ent); +} + +static +void push_array (MarkQueue *q, + const StgMutArrPtrs *array, + StgWord start_index) +{ + // TODO: Push this into callers where they already have the Bdescr + if (HEAP_ALLOCED_GC(array) && (Bdescr((StgPtr) array)->gen != oldest_gen)) + return; + + MarkQueueEnt ent = { + .mark_array = { + .array = array, + .start_index = (start_index << 16) | 0x3, + } + }; + push(q, &ent); +} + +static +void push_thunk_srt (MarkQueue *q, const StgInfoTable *info) +{ + const StgThunkInfoTable *thunk_info = itbl_to_thunk_itbl(info); + if (thunk_info->i.srt) { + push_closure(q, (StgClosure*)GET_SRT(thunk_info), NULL); + } +} + +static +void push_fun_srt (MarkQueue *q, const StgInfoTable *info) +{ + const StgFunInfoTable *fun_info = itbl_to_fun_itbl(info); + if (fun_info->i.srt) { + push_closure(q, (StgClosure*)GET_FUN_SRT(fun_info), NULL); + } +} + +/********************************************************* + * Pushing to the update remembered set + * + * upd_rem_set_push_* functions are directly called by + * mutators and need to check whether the value is in + * non-moving heap. + *********************************************************/ + +// Check if the object is traced by the non-moving collector. This holds in two +// conditions: +// +// - Object is in non-moving heap +// - Object is a large (BF_LARGE) and marked as BF_NONMOVING +// - Object is static (HEAP_ALLOCED_GC(obj) == false) +// +static +bool check_in_nonmoving_heap(StgClosure *p) { + if (HEAP_ALLOCED_GC(p)) { + // This works for both large and small objects: + return Bdescr((P_)p)->flags & BF_NONMOVING; + } else { + return true; // a static object + } +} + +/* Push the free variables of a (now-evaluated) thunk to the + * update remembered set. + */ +inline void updateRemembSetPushThunk(Capability *cap, StgThunk *thunk) +{ + const StgInfoTable *info; + do { + info = get_volatile_itbl((StgClosure *) thunk); + } while (info->type == WHITEHOLE); + updateRemembSetPushThunkEager(cap, (StgThunkInfoTable *) info, thunk); +} + +/* Push the free variables of a thunk to the update remembered set. + * This is called by the thunk update code (e.g. updateWithIndirection) before + * we update the indirectee to ensure that the thunk's free variables remain + * visible to the concurrent collector. + * + * See Note [Update rememembered set]. 
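+ *
+ * As a rough illustration (a sketch, not real RTS code): for a thunk
+ *
+ *     t = THUNK { payload = [x, y] }
+ *
+ * that is about to be overwritten with an indirection to its value, the
+ * barrier first pushes x and y (when they live in the nonmoving heap) onto
+ * the capability's update remembered set. The snapshot taken at the start
+ * of the mark may only reach x and y through t, so without the push the
+ * overwrite could hide them from the concurrent mark.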
+ */ +void updateRemembSetPushThunkEager(Capability *cap, + const StgThunkInfoTable *info, + StgThunk *thunk) +{ + /* N.B. info->i.type mustn't be WHITEHOLE */ + MarkQueue *queue = &cap->upd_rem_set.queue; + switch (info->i.type) { + case THUNK: + case THUNK_1_0: + case THUNK_0_1: + case THUNK_2_0: + case THUNK_1_1: + case THUNK_0_2: + { + push_thunk_srt(queue, &info->i); + + for (StgWord i = 0; i < info->i.layout.payload.ptrs; i++) { + if (check_in_nonmoving_heap(thunk->payload[i])) { + // Don't bother to push origin; it makes the barrier needlessly + // expensive with little benefit. + push_closure(queue, thunk->payload[i], NULL); + } + } + break; + } + case AP: + { + StgAP *ap = (StgAP *) thunk; + if (check_in_nonmoving_heap(ap->fun)) { + push_closure(queue, ap->fun, NULL); + } + mark_PAP_payload(queue, ap->fun, ap->payload, ap->n_args); + break; + } + case THUNK_SELECTOR: + case BLACKHOLE: + // TODO: This is right, right? + break; + // The selector optimization performed by the nonmoving mark may have + // overwritten a thunk which we are updating with an indirection. + case IND: + { + StgInd *ind = (StgInd *) thunk; + if (check_in_nonmoving_heap(ind->indirectee)) { + push_closure(queue, ind->indirectee, NULL); + } + break; + } + default: + barf("updateRemembSetPushThunk: invalid thunk pushed: p=%p, type=%d", + thunk, info->i.type); + } +} + +void updateRemembSetPushThunk_(StgRegTable *reg, StgThunk *p) +{ + updateRemembSetPushThunk(regTableToCapability(reg), p); +} + +inline void updateRemembSetPushClosure(Capability *cap, StgClosure *p) +{ + if (check_in_nonmoving_heap(p)) { + MarkQueue *queue = &cap->upd_rem_set.queue; + push_closure(queue, p, NULL); + } +} + +void updateRemembSetPushClosure_(StgRegTable *reg, struct StgClosure_ *p) +{ + updateRemembSetPushClosure(regTableToCapability(reg), p); +} + +STATIC_INLINE bool needs_upd_rem_set_mark(StgClosure *p) +{ + // TODO: Deduplicate with mark_closure + bdescr *bd = Bdescr((StgPtr) p); + if (bd->gen != oldest_gen) { + return false; + } else if (bd->flags & BF_LARGE) { + if (! (bd->flags & BF_NONMOVING_SWEEPING)) { + return false; + } else { + return ! (bd->flags & BF_MARKED); + } + } else { + struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) p); + nonmoving_block_idx block_idx = nonmovingGetBlockIdx((StgPtr) p); + return nonmovingGetMark(seg, block_idx) != nonmovingMarkEpoch; + } +} + +/* Set the mark bit; only to be called *after* we have fully marked the closure */ +STATIC_INLINE void finish_upd_rem_set_mark(StgClosure *p) +{ + bdescr *bd = Bdescr((StgPtr) p); + if (bd->flags & BF_LARGE) { + // Someone else may have already marked it. + ACQUIRE_LOCK(&nonmoving_large_objects_mutex); + if (! (bd->flags & BF_MARKED)) { + bd->flags |= BF_MARKED; + dbl_link_remove(bd, &nonmoving_large_objects); + dbl_link_onto(bd, &nonmoving_marked_large_objects); + n_nonmoving_large_blocks -= bd->blocks; + n_nonmoving_marked_large_blocks += bd->blocks; + } + RELEASE_LOCK(&nonmoving_large_objects_mutex); + } else { + struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) p); + nonmoving_block_idx block_idx = nonmovingGetBlockIdx((StgPtr) p); + nonmovingSetMark(seg, block_idx); + } +} + +void updateRemembSetPushTSO(Capability *cap, StgTSO *tso) +{ + if (needs_upd_rem_set_mark((StgClosure *) tso)) { + debugTrace(DEBUG_nonmoving_gc, "upd_rem_set: TSO %p", tso); + mark_tso(&cap->upd_rem_set.queue, tso); + finish_upd_rem_set_mark((StgClosure *) tso); + } +} + +void updateRemembSetPushStack(Capability *cap, StgStack *stack) +{ + // N.B. 
caller responsible for checking nonmoving_write_barrier_enabled + if (needs_upd_rem_set_mark((StgClosure *) stack)) { + StgWord8 marking = stack->marking; + // See Note [StgStack dirtiness flags and concurrent marking] + if (cas_word8(&stack->marking, marking, nonmovingMarkEpoch) + != nonmovingMarkEpoch) { + // We have claimed the right to mark the stack. + debugTrace(DEBUG_nonmoving_gc, "upd_rem_set: STACK %p", stack->sp); + mark_stack(&cap->upd_rem_set.queue, stack); + finish_upd_rem_set_mark((StgClosure *) stack); + return; + } else { + // The concurrent GC has claimed the right to mark the stack. + // Wait until it finishes marking before proceeding with + // mutation. + while (needs_upd_rem_set_mark((StgClosure *) stack)); +#if defined(PARALLEL_GC) + busy_wait_nop(); // TODO: Spinning here is unfortunate +#endif + return; + } + } +} + +/********************************************************* + * Pushing to the mark queue + *********************************************************/ + +void markQueuePush (MarkQueue *q, const MarkQueueEnt *ent) +{ + push(q, ent); +} + +void markQueuePushClosure (MarkQueue *q, + StgClosure *p, + StgClosure **origin) +{ + // TODO: Push this into callers where they already have the Bdescr + if (check_in_nonmoving_heap(p)) { + push_closure(q, p, origin); + } +} + +/* TODO: Do we really never want to specify the origin here? */ +void markQueueAddRoot (MarkQueue* q, StgClosure** root) +{ + markQueuePushClosure(q, *root, NULL); +} + +/* Push a closure to the mark queue without origin information */ +void markQueuePushClosure_ (MarkQueue *q, StgClosure *p) +{ + markQueuePushClosure(q, p, NULL); +} + +void markQueuePushFunSrt (MarkQueue *q, const StgInfoTable *info) +{ + push_fun_srt(q, info); +} + +void markQueuePushThunkSrt (MarkQueue *q, const StgInfoTable *info) +{ + push_thunk_srt(q, info); +} + +void markQueuePushArray (MarkQueue *q, + const StgMutArrPtrs *array, + StgWord start_index) +{ + push_array(q, array, start_index); +} + +/********************************************************* + * Popping from the mark queue + *********************************************************/ + +// Returns invalid MarkQueueEnt if queue is empty. +static MarkQueueEnt markQueuePop_ (MarkQueue *q) +{ + MarkQueueBlock *top; + +again: + top = q->top; + + // Are we at the beginning of the block? + if (top->head == 0) { + // Is this the first block of the queue? + if (q->blocks->link == NULL) { + // Yes, therefore queue is empty... + MarkQueueEnt none = { .null_entry = { .p = NULL } }; + return none; + } else { + // No, unwind to the previous block and try popping again... + bdescr *old_block = q->blocks; + q->blocks = old_block->link; + q->top = (MarkQueueBlock*)q->blocks->start; + ACQUIRE_SM_LOCK; + freeGroup(old_block); // TODO: hold on to a block to avoid repeated allocation/deallocation? 
+ RELEASE_SM_LOCK; + goto again; + } + } + + top->head--; + MarkQueueEnt ent = top->entries[top->head]; + return ent; +} + +static MarkQueueEnt markQueuePop (MarkQueue *q) +{ +#if MARK_PREFETCH_QUEUE_DEPTH == 0 + return markQueuePop_(q); +#else + unsigned int i = q->prefetch_head; + while (nonmovingMarkQueueEntryType(&q->prefetch_queue[i]) == NULL_ENTRY) { + MarkQueueEnt new = markQueuePop_(q); + if (nonmovingMarkQueueEntryType(&new) == NULL_ENTRY) { + // Mark queue is empty; look for any valid entries in the prefetch + // queue + for (unsigned int j = (i+1) % MARK_PREFETCH_QUEUE_DEPTH; + j != i; + j = (j+1) % MARK_PREFETCH_QUEUE_DEPTH) + { + if (nonmovingMarkQueueEntryType(&q->prefetch_queue[j]) != NULL_ENTRY) { + i = j; + goto done; + } + } + return new; + } + + // The entry may not be a MARK_CLOSURE but it doesn't matter, our + // MarkQueueEnt encoding always places the pointer to the object to be + // marked first. + prefetchForRead(&new.mark_closure.p->header.info); + prefetchForRead(Bdescr((StgPtr) new.mark_closure.p)); + q->prefetch_queue[i] = new; + i = (i + 1) % MARK_PREFETCH_QUEUE_DEPTH; + } + + done: + ; + MarkQueueEnt ret = q->prefetch_queue[i]; + q->prefetch_queue[i].null_entry.p = NULL; + q->prefetch_head = i; + return ret; +#endif +} + +/********************************************************* + * Creating and destroying MarkQueues and UpdRemSets + *********************************************************/ + +/* Must hold sm_mutex. */ +static void init_mark_queue_ (MarkQueue *queue) +{ + bdescr *bd = allocGroup(MARK_QUEUE_BLOCKS); + queue->blocks = bd; + queue->top = (MarkQueueBlock *) bd->start; + queue->top->head = 0; +#if MARK_PREFETCH_QUEUE_DEPTH > 0 + memset(&queue->prefetch_queue, 0, sizeof(queue->prefetch_queue)); + queue->prefetch_head = 0; +#endif +} + +/* Must hold sm_mutex. */ +void initMarkQueue (MarkQueue *queue) +{ + init_mark_queue_(queue); + queue->is_upd_rem_set = false; +} + +/* Must hold sm_mutex. */ +void init_upd_rem_set (UpdRemSet *rset) +{ + init_mark_queue_(&rset->queue); + rset->queue.is_upd_rem_set = true; +} + +void reset_upd_rem_set (UpdRemSet *rset) +{ + // UpdRemSets always have one block for the mark queue. This assertion is to + // update this code if we change that. + ASSERT(rset->queue.blocks->link == NULL); + rset->queue.top->head = 0; +} + +void freeMarkQueue (MarkQueue *queue) +{ + freeChain_lock(queue->blocks); +} + +#if defined(THREADED_RTS) && defined(DEBUG) +static uint32_t +markQueueLength (MarkQueue *q) +{ + uint32_t n = 0; + for (bdescr *block = q->blocks; block; block = block->link) { + MarkQueueBlock *queue = (MarkQueueBlock*)block->start; + n += queue->head; + } + return n; +} +#endif + + +/********************************************************* + * Marking + *********************************************************/ + +/* + * N.B. Mutation of TRecHeaders is completely unprotected by any write + * barrier. Consequently it's quite important that we deeply mark + * any outstanding transactions. 
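+ *
+ * "Deeply" here means following everything reachable from the header;
+ * roughly (a sketch of the shape, not real code):
+ *
+ *     trec -> current_chunk -> entries[i] = { tvar, expected_value, new_value }
+ *          -> enclosing_trec -> ...
+ *
+ * i.e. mark_trec_header below walks the chain of enclosing transactions
+ * and, within each, the chunk list (via prev_chunk) and each entry's three
+ * pointers.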
+ */ +static void +mark_trec_header (MarkQueue *queue, StgTRecHeader *trec) +{ + while (trec != NO_TREC) { + StgTRecChunk *chunk = trec->current_chunk; + markQueuePushClosure_(queue, (StgClosure *) trec); + markQueuePushClosure_(queue, (StgClosure *) chunk); + while (chunk != END_STM_CHUNK_LIST) { + for (StgWord i=0; i < chunk->next_entry_idx; i++) { + TRecEntry *ent = &chunk->entries[i]; + markQueuePushClosure_(queue, (StgClosure *) ent->tvar); + markQueuePushClosure_(queue, ent->expected_value); + markQueuePushClosure_(queue, ent->new_value); + } + chunk = chunk->prev_chunk; + } + trec = trec->enclosing_trec; + } +} + +static void +mark_tso (MarkQueue *queue, StgTSO *tso) +{ + // TODO: Clear dirty if contains only old gen objects + + if (tso->bound != NULL) { + markQueuePushClosure_(queue, (StgClosure *) tso->bound->tso); + } + + markQueuePushClosure_(queue, (StgClosure *) tso->blocked_exceptions); + markQueuePushClosure_(queue, (StgClosure *) tso->bq); + mark_trec_header(queue, tso->trec); + markQueuePushClosure_(queue, (StgClosure *) tso->stackobj); + markQueuePushClosure_(queue, (StgClosure *) tso->_link); + if ( tso->why_blocked == BlockedOnMVar + || tso->why_blocked == BlockedOnMVarRead + || tso->why_blocked == BlockedOnBlackHole + || tso->why_blocked == BlockedOnMsgThrowTo + || tso->why_blocked == NotBlocked + ) { + markQueuePushClosure_(queue, tso->block_info.closure); + } +} + +static void +do_push_closure (StgClosure **p, void *user) +{ + MarkQueue *queue = (MarkQueue *) user; + // TODO: Origin? need reference to containing closure + markQueuePushClosure_(queue, *p); +} + +static void +mark_large_bitmap (MarkQueue *queue, + StgClosure **p, + StgLargeBitmap *large_bitmap, + StgWord size) +{ + walk_large_bitmap(do_push_closure, p, large_bitmap, size, queue); +} + +static void +mark_small_bitmap (MarkQueue *queue, StgClosure **p, StgWord size, StgWord bitmap) +{ + while (size > 0) { + if ((bitmap & 1) == 0) { + // TODO: Origin? + markQueuePushClosure(queue, *p, NULL); + } + p++; + bitmap = bitmap >> 1; + size--; + } +} + +static GNUC_ATTR_HOT +void mark_PAP_payload (MarkQueue *queue, + StgClosure *fun, + StgClosure **payload, + StgWord size) +{ + const StgFunInfoTable *fun_info = get_fun_itbl(UNTAG_CONST_CLOSURE(fun)); + ASSERT(fun_info->i.type != PAP); + StgPtr p = (StgPtr) payload; + + StgWord bitmap; + switch (fun_info->f.fun_type) { + case ARG_GEN: + bitmap = BITMAP_BITS(fun_info->f.b.bitmap); + goto small_bitmap; + case ARG_GEN_BIG: + mark_large_bitmap(queue, payload, GET_FUN_LARGE_BITMAP(fun_info), size); + break; + case ARG_BCO: + mark_large_bitmap(queue, payload, BCO_BITMAP(fun), size); + break; + default: + bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]); + small_bitmap: + mark_small_bitmap(queue, (StgClosure **) p, size, bitmap); + break; + } +} + +/* Helper for mark_stack; returns next stack frame. 
*/ +static StgPtr +mark_arg_block (MarkQueue *queue, const StgFunInfoTable *fun_info, StgClosure **args) +{ + StgWord bitmap, size; + + StgPtr p = (StgPtr)args; + switch (fun_info->f.fun_type) { + case ARG_GEN: + bitmap = BITMAP_BITS(fun_info->f.b.bitmap); + size = BITMAP_SIZE(fun_info->f.b.bitmap); + goto small_bitmap; + case ARG_GEN_BIG: + size = GET_FUN_LARGE_BITMAP(fun_info)->size; + mark_large_bitmap(queue, (StgClosure**)p, GET_FUN_LARGE_BITMAP(fun_info), size); + p += size; + break; + default: + bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]); + size = BITMAP_SIZE(stg_arg_bitmaps[fun_info->f.fun_type]); + small_bitmap: + mark_small_bitmap(queue, (StgClosure**)p, size, bitmap); + p += size; + break; + } + return p; +} + +static GNUC_ATTR_HOT void +mark_stack_ (MarkQueue *queue, StgPtr sp, StgPtr spBottom) +{ + ASSERT(sp <= spBottom); + + while (sp < spBottom) { + const StgRetInfoTable *info = get_ret_itbl((StgClosure *)sp); + switch (info->i.type) { + case UPDATE_FRAME: + { + // See Note [upd-black-hole] in rts/Scav.c + StgUpdateFrame *frame = (StgUpdateFrame *) sp; + markQueuePushClosure_(queue, frame->updatee); + sp += sizeofW(StgUpdateFrame); + continue; + } + + // small bitmap (< 32 entries, or 64 on a 64-bit machine) + case CATCH_STM_FRAME: + case CATCH_RETRY_FRAME: + case ATOMICALLY_FRAME: + case UNDERFLOW_FRAME: + case STOP_FRAME: + case CATCH_FRAME: + case RET_SMALL: + { + StgWord bitmap = BITMAP_BITS(info->i.layout.bitmap); + StgWord size = BITMAP_SIZE(info->i.layout.bitmap); + // NOTE: the payload starts immediately after the info-ptr, we + // don't have an StgHeader in the same sense as a heap closure. + sp++; + mark_small_bitmap(queue, (StgClosure **) sp, size, bitmap); + sp += size; + } + follow_srt: + if (info->i.srt) { + markQueuePushClosure_(queue, (StgClosure*)GET_SRT(info)); + } + continue; + + case RET_BCO: { + sp++; + markQueuePushClosure_(queue, *(StgClosure**)sp); + StgBCO *bco = (StgBCO *)*sp; + sp++; + StgWord size = BCO_BITMAP_SIZE(bco); + mark_large_bitmap(queue, (StgClosure **) sp, BCO_BITMAP(bco), size); + sp += size; + continue; + } + + // large bitmap (> 32 entries, or > 64 on a 64-bit machine) + case RET_BIG: + { + StgWord size; + + size = GET_LARGE_BITMAP(&info->i)->size; + sp++; + mark_large_bitmap(queue, (StgClosure **) sp, GET_LARGE_BITMAP(&info->i), size); + sp += size; + // and don't forget to follow the SRT + goto follow_srt; + } + + case RET_FUN: + { + StgRetFun *ret_fun = (StgRetFun *)sp; + const StgFunInfoTable *fun_info; + + markQueuePushClosure_(queue, ret_fun->fun); + fun_info = get_fun_itbl(UNTAG_CLOSURE(ret_fun->fun)); + sp = mark_arg_block(queue, fun_info, ret_fun->payload); + goto follow_srt; + } + + default: + barf("mark_stack: weird activation record found on stack: %d", (int)(info->i.type)); + } + } +} + +static GNUC_ATTR_HOT void +mark_stack (MarkQueue *queue, StgStack *stack) +{ + // TODO: Clear dirty if contains only old gen objects + + mark_stack_(queue, stack->sp, stack->stack + stack->stack_size); +} + +/* See Note [Static objects under the nonmoving collector]. + * + * Returns true if the object needs to be marked. 
+ */ +static bool +bump_static_flag(StgClosure **link_field, StgClosure *q STG_UNUSED) +{ + while (1) { + StgWord link = (StgWord) *link_field; + StgWord new = (link & ~STATIC_BITS) | static_flag; + if ((link & STATIC_BITS) == static_flag) + return false; + else if (cas((StgVolatilePtr) link_field, link, new) == link) { + return true; + } + } +} + +static GNUC_ATTR_HOT void +mark_closure (MarkQueue *queue, const StgClosure *p0, StgClosure **origin) +{ + StgClosure *p = (StgClosure*)p0; + + try_again: + ; + bdescr *bd = NULL; + StgClosure *p_next = NULL; + StgWord tag = GET_CLOSURE_TAG(p); + p = UNTAG_CLOSURE(p); + +# define PUSH_FIELD(obj, field) \ + markQueuePushClosure(queue, \ + (StgClosure *) (obj)->field, \ + (StgClosure **) &(obj)->field) + + if (!HEAP_ALLOCED_GC(p)) { + const StgInfoTable *info = get_itbl(p); + StgHalfWord type = info->type; + + if (type == CONSTR_0_1 || type == CONSTR_0_2 || type == CONSTR_NOCAF) { + // no need to put these on the static linked list, they don't need + // to be marked. + return; + } + + switch (type) { + + case THUNK_STATIC: + if (info->srt != 0) { + if (bump_static_flag(THUNK_STATIC_LINK((StgClosure *)p), p)) { + markQueuePushThunkSrt(queue, info); // TODO this function repeats the check above + } + } + goto done; + + case FUN_STATIC: + if (info->srt != 0 || info->layout.payload.ptrs != 0) { + if (bump_static_flag(STATIC_LINK(info, (StgClosure *)p), p)) { + markQueuePushFunSrt(queue, info); // TODO this function repeats the check above + + // a FUN_STATIC can also be an SRT, so it may have pointer + // fields. See Note [SRTs] in CmmBuildInfoTables, specifically + // the [FUN] optimisation. + // TODO (osa) I don't understand this comment + for (StgHalfWord i = 0; i < info->layout.payload.ptrs; ++i) { + PUSH_FIELD(p, payload[i]); + } + } + } + goto done; + + case IND_STATIC: + if (bump_static_flag(IND_STATIC_LINK((StgClosure *)p), p)) { + PUSH_FIELD((StgInd *) p, indirectee); + } + goto done; + + case CONSTR: + case CONSTR_1_0: + case CONSTR_2_0: + case CONSTR_1_1: + if (bump_static_flag(STATIC_LINK(info, (StgClosure *)p), p)) { + for (StgHalfWord i = 0; i < info->layout.payload.ptrs; ++i) { + PUSH_FIELD(p, payload[i]); + } + } + goto done; + + case WHITEHOLE: + while (get_volatile_itbl(p)->type == WHITEHOLE); + // busy_wait_nop(); // FIXME + goto try_again; + + default: + barf("mark_closure(static): strange closure type %d", (int)(info->type)); + } + } + + bd = Bdescr((StgPtr) p); + + if (bd->gen != oldest_gen) { + // Here we have an object living outside of the non-moving heap. While + // we likely evacuated nearly everything to the nonmoving heap during + // preparation there are nevertheless a few ways in which we might trace + // a reference into younger generations: + // + // * a mutable object might have been updated + // * we might have aged an object + goto done; + } + + ASSERTM(LOOKS_LIKE_CLOSURE_PTR(p), "invalid closure, info=%p", p->header.info); + + ASSERT(!IS_FORWARDING_PTR(p->header.info)); + + // N.B. only the first block of a compact region is guaranteed to carry + // BF_NONMOVING; conseqently we must separately check for BF_COMPACT. + if (bd->flags & (BF_COMPACT | BF_NONMOVING)) { + + if (bd->flags & BF_COMPACT) { + StgCompactNFData *str = objectGetCompact((StgClosure*)p); + bd = Bdescr((P_)str); + + if (! (bd->flags & BF_NONMOVING_SWEEPING)) { + // Not in the snapshot + return; + } + if (bd->flags & BF_MARKED) { + goto done; + } + } else if (bd->flags & BF_LARGE) { + if (! 
(bd->flags & BF_NONMOVING_SWEEPING)) { + // Not in the snapshot + goto done; + } + if (bd->flags & BF_MARKED) { + goto done; + } + + // Mark contents + p = (StgClosure*)bd->start; + } else { + struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) p); + nonmoving_block_idx block_idx = nonmovingGetBlockIdx((StgPtr) p); + + /* We don't mark blocks that, + * - were not live at the time that the snapshot was taken, or + * - we have already marked this cycle + */ + uint8_t mark = nonmovingGetMark(seg, block_idx); + /* Don't mark things we've already marked (since we may loop) */ + if (mark == nonmovingMarkEpoch) + goto done; + + StgClosure *snapshot_loc = + (StgClosure *) nonmovingSegmentGetBlock(seg, nonmovingSegmentInfo(seg)->next_free_snap); + if (p >= snapshot_loc && mark == 0) { + /* + * In this case we are looking at a block that wasn't allocated + * at the time that the snapshot was taken. We mustn't trace + * things above the allocation pointer that aren't marked since + * they may not be valid objects. + */ + goto done; + } + } + } + + // A pinned object that is still attached to a capability (because it's not + // filled yet). No need to trace it pinned objects can't contain poiners. + else if (bd->flags & BF_PINNED) { +#if defined(DEBUG) + bool found_it = false; + for (uint32_t i = 0; i < n_capabilities; ++i) { + if (capabilities[i]->pinned_object_block == bd) { + found_it = true; + break; + } + } + ASSERT(found_it); +#endif + return; // we don't update origin here! TODO(osa): explain this + } + + else { + barf("Strange closure in nonmoving mark: %p", p); + } + + ///////////////////////////////////////////////////// + // Trace pointers + ///////////////////////////////////////////////////// + + const StgInfoTable *info = get_itbl(p); + switch (info->type) { + + case MVAR_CLEAN: + case MVAR_DIRTY: { + StgMVar *mvar = (StgMVar *) p; + PUSH_FIELD(mvar, head); + PUSH_FIELD(mvar, tail); + PUSH_FIELD(mvar, value); + break; + } + + case TVAR: { + StgTVar *tvar = ((StgTVar *)p); + PUSH_FIELD(tvar, current_value); + PUSH_FIELD(tvar, first_watch_queue_entry); + break; + } + + case FUN_2_0: + markQueuePushFunSrt(queue, info); + PUSH_FIELD(p, payload[1]); + PUSH_FIELD(p, payload[0]); + break; + + case THUNK_2_0: { + StgThunk *thunk = (StgThunk *) p; + markQueuePushThunkSrt(queue, info); + PUSH_FIELD(thunk, payload[1]); + PUSH_FIELD(thunk, payload[0]); + break; + } + + case CONSTR_2_0: + PUSH_FIELD(p, payload[1]); + PUSH_FIELD(p, payload[0]); + break; + + case THUNK_1_0: + markQueuePushThunkSrt(queue, info); + PUSH_FIELD((StgThunk *) p, payload[0]); + break; + + case FUN_1_0: + markQueuePushFunSrt(queue, info); + PUSH_FIELD(p, payload[0]); + break; + + case CONSTR_1_0: + PUSH_FIELD(p, payload[0]); + break; + + case THUNK_0_1: + markQueuePushThunkSrt(queue, info); + break; + + case FUN_0_1: + markQueuePushFunSrt(queue, info); + break; + + case CONSTR_0_1: + case CONSTR_0_2: + break; + + case THUNK_0_2: + markQueuePushThunkSrt(queue, info); + break; + + case FUN_0_2: + markQueuePushFunSrt(queue, info); + break; + + case THUNK_1_1: + markQueuePushThunkSrt(queue, info); + PUSH_FIELD((StgThunk *) p, payload[0]); + break; + + case FUN_1_1: + markQueuePushFunSrt(queue, info); + PUSH_FIELD(p, payload[0]); + break; + + case CONSTR_1_1: + PUSH_FIELD(p, payload[0]); + break; + + case FUN: + markQueuePushFunSrt(queue, info); + goto gen_obj; + + case THUNK: { + markQueuePushThunkSrt(queue, info); + for (StgWord i = 0; i < info->layout.payload.ptrs; i++) { + StgClosure **field = &((StgThunk *) 
p)->payload[i]; + markQueuePushClosure(queue, *field, field); + } + break; + } + + gen_obj: + case CONSTR: + case CONSTR_NOCAF: + case WEAK: + case PRIM: + { + for (StgWord i = 0; i < info->layout.payload.ptrs; i++) { + StgClosure **field = &((StgClosure *) p)->payload[i]; + markQueuePushClosure(queue, *field, field); + } + break; + } + + case BCO: { + StgBCO *bco = (StgBCO *)p; + PUSH_FIELD(bco, instrs); + PUSH_FIELD(bco, literals); + PUSH_FIELD(bco, ptrs); + break; + } + + + case IND: { + PUSH_FIELD((StgInd *) p, indirectee); + if (origin != NULL) { + p_next = ((StgInd*)p)->indirectee; + } + break; + } + + case BLACKHOLE: { + PUSH_FIELD((StgInd *) p, indirectee); + StgClosure *indirectee = ((StgInd*)p)->indirectee; + if (GET_CLOSURE_TAG(indirectee) == 0 || origin == NULL) { + // do nothing + } else { + p_next = indirectee; + } + break; + } + + case MUT_VAR_CLEAN: + case MUT_VAR_DIRTY: + PUSH_FIELD((StgMutVar *)p, var); + break; + + case BLOCKING_QUEUE: { + StgBlockingQueue *bq = (StgBlockingQueue *)p; + PUSH_FIELD(bq, bh); + PUSH_FIELD(bq, owner); + PUSH_FIELD(bq, queue); + PUSH_FIELD(bq, link); + break; + } + + case THUNK_SELECTOR: + if (RtsFlags.GcFlags.nonmovingSelectorOpt) { + nonmoving_eval_thunk_selector(queue, (StgSelector*)p, origin); + } else { + PUSH_FIELD((StgSelector *) p, selectee); + } + break; + + case AP_STACK: { + StgAP_STACK *ap = (StgAP_STACK *)p; + PUSH_FIELD(ap, fun); + mark_stack_(queue, (StgPtr) ap->payload, (StgPtr) ap->payload + ap->size); + break; + } + + case PAP: { + StgPAP *pap = (StgPAP *) p; + PUSH_FIELD(pap, fun); + mark_PAP_payload(queue, pap->fun, pap->payload, pap->n_args); + break; + } + + case AP: { + StgAP *ap = (StgAP *) p; + PUSH_FIELD(ap, fun); + mark_PAP_payload(queue, ap->fun, ap->payload, ap->n_args); + break; + } + + case ARR_WORDS: + // nothing to follow + break; + + case MUT_ARR_PTRS_CLEAN: + case MUT_ARR_PTRS_DIRTY: + case MUT_ARR_PTRS_FROZEN_CLEAN: + case MUT_ARR_PTRS_FROZEN_DIRTY: + // TODO: Check this against Scav.c + markQueuePushArray(queue, (StgMutArrPtrs *) p, 0); + break; + + case SMALL_MUT_ARR_PTRS_CLEAN: + case SMALL_MUT_ARR_PTRS_DIRTY: + case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN: + case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY: { + StgSmallMutArrPtrs *arr = (StgSmallMutArrPtrs *) p; + for (StgWord i = 0; i < arr->ptrs; i++) { + StgClosure **field = &arr->payload[i]; + markQueuePushClosure(queue, *field, field); + } + break; + } + + case TSO: + mark_tso(queue, (StgTSO *) p); + break; + + case STACK: { + // See Note [StgStack dirtiness flags and concurrent marking] + StgStack *stack = (StgStack *) p; + StgWord8 marking = stack->marking; + + // N.B. stack->marking must be != nonmovingMarkEpoch unless + // someone has already marked it. + if (cas_word8(&stack->marking, marking, nonmovingMarkEpoch) + != nonmovingMarkEpoch) { + // We have claimed the right to mark the stack. + mark_stack(queue, stack); + } else { + // A mutator has already started marking the stack; we just let it + // do its thing and move on. There's no reason to wait; we know that + // the stack will be fully marked before we sweep due to the final + // post-mark synchronization. Most importantly, we do not set its + // mark bit, the mutator is responsible for this. + goto done; + } + break; + } + + case MUT_PRIM: { + for (StgHalfWord p_idx = 0; p_idx < info->layout.payload.ptrs; ++p_idx) { + StgClosure **field = &p->payload[p_idx]; + markQueuePushClosure(queue, *field, field); + } + break; + } + + case TREC_CHUNK: { + // TODO: Should we abort here? 
This should have already been marked + // when we dirtied the TSO + StgTRecChunk *tc = ((StgTRecChunk *) p); + PUSH_FIELD(tc, prev_chunk); + TRecEntry *end = &tc->entries[tc->next_entry_idx]; + for (TRecEntry *e = &tc->entries[0]; e < end; e++) { + markQueuePushClosure_(queue, (StgClosure *) e->tvar); + markQueuePushClosure_(queue, (StgClosure *) e->expected_value); + markQueuePushClosure_(queue, (StgClosure *) e->new_value); + } + break; + } + + case WHITEHOLE: + while (get_volatile_itbl(p)->type == WHITEHOLE); + goto try_again; + + case COMPACT_NFDATA: + break; + + default: + barf("mark_closure: unimplemented/strange closure type %d @ %p", + info->type, p); + } + +# undef PUSH_FIELD + + /* Set the mark bit: it's important that we do this only after we actually push + * the object's pointers since in the case of marking stacks there may be a + * mutator waiting for us to finish so it can start execution. + */ + if (bd->flags & BF_COMPACT) { + StgCompactNFData *str = objectGetCompact((StgClosure*)p); + dbl_link_remove(bd, &nonmoving_compact_objects); + dbl_link_onto(bd, &nonmoving_marked_compact_objects); + StgWord blocks = str->totalW / BLOCK_SIZE_W; + n_nonmoving_compact_blocks -= blocks; + n_nonmoving_marked_compact_blocks += blocks; + bd->flags |= BF_MARKED; + } else if (bd->flags & BF_LARGE) { + /* Marking a large object isn't idempotent since we move it to + * nonmoving_marked_large_objects; to ensure that we don't repeatedly + * mark a large object, we only set BF_MARKED on large objects in the + * nonmoving heap while holding nonmoving_large_objects_mutex + */ + ACQUIRE_LOCK(&nonmoving_large_objects_mutex); + if (! (bd->flags & BF_MARKED)) { + // Remove the object from nonmoving_large_objects and link it to + // nonmoving_marked_large_objects + dbl_link_remove(bd, &nonmoving_large_objects); + dbl_link_onto(bd, &nonmoving_marked_large_objects); + n_nonmoving_large_blocks -= bd->blocks; + n_nonmoving_marked_large_blocks += bd->blocks; + bd->flags |= BF_MARKED; + } + RELEASE_LOCK(&nonmoving_large_objects_mutex); + } else if (bd->flags & BF_NONMOVING) { + // TODO: Kill repetition + struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) p); + nonmoving_block_idx block_idx = nonmovingGetBlockIdx((StgPtr) p); + nonmovingSetMark(seg, block_idx); + nonmoving_live_words += nonmovingSegmentBlockSize(seg) / sizeof(W_); + } + + // If we found a indirection to shortcut keep going. + if (p_next) { + p = p_next; + goto try_again; + } + +done: + if (origin != NULL && (!HEAP_ALLOCED(p) || bd->flags & BF_NONMOVING)) { + if (UNTAG_CLOSURE((StgClosure*)p0) != p && *origin == p0) { + if (cas((StgVolatilePtr)origin, (StgWord)p0, (StgWord)TAG_CLOSURE(tag, p)) == (StgWord)p0) { + // debugBelch("Thunk optimization successful\n"); + } + } + } +} + +/* This is the main mark loop. + * Invariants: + * + * a. nonmovingPrepareMark has been called. + * b. the nursery has been fully evacuated into the non-moving generation. + * c. the mark queue has been seeded with a set of roots. 
+ * + */ +GNUC_ATTR_HOT void +nonmovingMark (MarkQueue *queue) +{ + traceConcMarkBegin(); + debugTrace(DEBUG_nonmoving_gc, "Starting mark pass"); + unsigned int count = 0; + while (true) { + count++; + MarkQueueEnt ent = markQueuePop(queue); + + switch (nonmovingMarkQueueEntryType(&ent)) { + case MARK_CLOSURE: + mark_closure(queue, ent.mark_closure.p, ent.mark_closure.origin); + break; + case MARK_ARRAY: { + const StgMutArrPtrs *arr = ent.mark_array.array; + StgWord start = ent.mark_array.start_index >> 16; + StgWord end = start + MARK_ARRAY_CHUNK_LENGTH; + if (end < arr->ptrs) { + markQueuePushArray(queue, ent.mark_array.array, end); + } else { + end = arr->ptrs; + } + for (StgWord i = start; i < end; i++) { + markQueuePushClosure_(queue, arr->payload[i]); + } + break; + } + case NULL_ENTRY: + // Perhaps the update remembered set has more to mark... + if (upd_rem_set_block_list) { + ACQUIRE_LOCK(&upd_rem_set_lock); + bdescr *old = queue->blocks; + queue->blocks = upd_rem_set_block_list; + queue->top = (MarkQueueBlock *) queue->blocks->start; + upd_rem_set_block_list = NULL; + RELEASE_LOCK(&upd_rem_set_lock); + + ACQUIRE_SM_LOCK; + freeGroup(old); + RELEASE_SM_LOCK; + } else { + // Nothing more to do + debugTrace(DEBUG_nonmoving_gc, "Finished mark pass: %d", count); + traceConcMarkEnd(count); + return; + } + } + } +} + +// A variant of `isAlive` that works for non-moving heap. Used for: +// +// - Collecting weak pointers; checking key of a weak pointer. +// - Resurrecting threads; checking if a thread is dead. +// - Sweeping object lists: large_objects, mut_list, stable_name_table. +// +// This may only be used after a full mark but before nonmovingSweep as it +// relies on the correctness of the next_free_snap and mark bitmaps. +bool nonmovingIsAlive (StgClosure *p) +{ + // Ignore static closures. See comments in `isAlive`. + if (!HEAP_ALLOCED_GC(p)) { + return true; + } + + bdescr *bd = Bdescr((P_)p); + + // All non-static objects in the non-moving heap should be marked as + // BF_NONMOVING + ASSERT(bd->flags & BF_NONMOVING); + + if (bd->flags & (BF_COMPACT | BF_LARGE)) { + if (bd->flags & BF_COMPACT) { + StgCompactNFData *str = objectGetCompact((StgClosure*)p); + bd = Bdescr((P_)str); + } + return (bd->flags & BF_NONMOVING_SWEEPING) == 0 + // the large object wasn't in the snapshot and therefore wasn't marked + || (bd->flags & BF_MARKED) != 0; + // The object was marked + } else { + struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) p); + nonmoving_block_idx i = nonmovingGetBlockIdx((StgPtr) p); + uint8_t mark = nonmovingGetMark(seg, i); + if (i >= nonmovingSegmentInfo(seg)->next_free_snap) { + // If the object is allocated after next_free_snap then one of the + // following must be true: + // + // * if its mark is 0 then the block was not allocated last time + // the segment was swept; however, it may have been allocated since + // then and therefore we must conclude that the block is alive. + // + // * if its mark is equal to nonmovingMarkEpoch then we found that + // the object was alive in the snapshot of the current GC (recall + // that this function may only be used after a mark). + // Consequently we must conclude that the object is still alive. + // + // * if its mark is not equal to nonmovingMarkEpoch then we found + // that the object was not reachable in the last snapshot. + // Assuming that the mark is complete we can conclude that the + // object is dead since the snapshot invariant guarantees that + // all objects alive in the snapshot would be marked. 
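        //
        // In short: a mark of 0 (the block was free at the last sweep, so
        // anything living there now must have been allocated after the
        // snapshot) or a mark equal to nonmovingMarkEpoch (marked this cycle)
        // counts as live; a stale mark from a previous cycle counts as dead.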
+ // + return mark == nonmovingMarkEpoch || mark == 0; + } else { + // If the object is below next_free_snap then the snapshot + // invariant guarantees that it is marked if reachable. + return mark == nonmovingMarkEpoch; + } + } +} + +// Check whether a snapshotted object is alive. That is for an object that we +// know to be in the snapshot, is its mark bit set. It is imperative that the +// object is in the snapshot (e.g. was in the nonmoving heap at the time that +// the snapshot was taken) since we assume that its mark bit reflects its +// reachability. +// +// This is used when +// +// - Collecting weak pointers; checking key of a weak pointer. +// - Resurrecting threads; checking if a thread is dead. +// - Sweeping object lists: large_objects, mut_list, stable_name_table. +// +static bool nonmovingIsNowAlive (StgClosure *p) +{ + // Ignore static closures. See comments in `isAlive`. + if (!HEAP_ALLOCED_GC(p)) { + return true; + } + + bdescr *bd = Bdescr((P_)p); + + // All non-static objects in the non-moving heap should be marked as + // BF_NONMOVING + ASSERT(bd->flags & BF_NONMOVING); + + if (bd->flags & BF_LARGE) { + return (bd->flags & BF_NONMOVING_SWEEPING) == 0 + // the large object wasn't in the snapshot and therefore wasn't marked + || (bd->flags & BF_MARKED) != 0; + // The object was marked + } else { + return nonmovingClosureMarkedThisCycle((P_)p); + } +} + +// Non-moving heap variant of `tidyWeakList` +bool nonmovingTidyWeaks (struct MarkQueue_ *queue) +{ + bool did_work = false; + + StgWeak **last_w = &nonmoving_old_weak_ptr_list; + StgWeak *next_w; + for (StgWeak *w = nonmoving_old_weak_ptr_list; w != NULL; w = next_w) { + if (w->header.info == &stg_DEAD_WEAK_info) { + // finalizeWeak# was called on the weak + next_w = w->link; + *last_w = next_w; + continue; + } + + // Otherwise it's a live weak + ASSERT(w->header.info == &stg_WEAK_info); + + if (nonmovingIsNowAlive(w->key)) { + nonmovingMarkLiveWeak(queue, w); + did_work = true; + + // remove this weak ptr from old_weak_ptr list + *last_w = w->link; + next_w = w->link; + + // and put it on the weak ptr list + w->link = nonmoving_weak_ptr_list; + nonmoving_weak_ptr_list = w; + } else { + last_w = &(w->link); + next_w = w->link; + } + } + + return did_work; +} + +void nonmovingMarkDeadWeak (struct MarkQueue_ *queue, StgWeak *w) +{ + if (w->cfinalizers != &stg_NO_FINALIZER_closure) { + markQueuePushClosure_(queue, w->value); + } + markQueuePushClosure_(queue, w->finalizer); +} + +void nonmovingMarkLiveWeak (struct MarkQueue_ *queue, StgWeak *w) +{ + ASSERT(nonmovingClosureMarkedThisCycle((P_)w)); + markQueuePushClosure_(queue, w->value); + markQueuePushClosure_(queue, w->finalizer); + markQueuePushClosure_(queue, w->cfinalizers); +} + +// When we're done with marking, any weak pointers with non-marked keys will be +// considered "dead". We mark values and finalizers of such weaks, and then +// schedule them for finalization in `scheduleFinalizers` (which we run during +// synchronization). 
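/* A minimal sketch of how the weak-pointer phases above fit together, assuming
 * a driver of roughly this shape elsewhere in the collector (the function name
 * here is illustrative and not part of the interface): marking a live weak's
 * value or finalizer may make further weak keys reachable, so tidying and
 * marking are iterated to a fixpoint before the remaining weaks are treated as
 * dead.
 */
static void example_weak_phase (MarkQueue *queue, StgWeak **dead_weaks)
{
    // Each round of tidying moves newly-live weaks onto nonmoving_weak_ptr_list
    // and pushes their values and finalizers onto the mark queue, so keep
    // alternating until tidying finds no more live keys.
    while (nonmovingTidyWeaks(queue)) {
        nonmovingMark(queue);
    }
    // Everything still on nonmoving_old_weak_ptr_list has an unmarked key:
    // mark its value and finalizer and hand it over for finalization.
    nonmovingMarkDeadWeaks(queue, dead_weaks);
    nonmovingMark(queue);
}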
+void nonmovingMarkDeadWeaks (struct MarkQueue_ *queue, StgWeak **dead_weaks) +{ + StgWeak *next_w; + for (StgWeak *w = nonmoving_old_weak_ptr_list; w; w = next_w) { + ASSERT(!nonmovingClosureMarkedThisCycle((P_)(w->key))); + nonmovingMarkDeadWeak(queue, w); + next_w = w ->link; + w->link = *dead_weaks; + *dead_weaks = w; + } +} + +// Non-moving heap variant of of `tidyThreadList` +void nonmovingTidyThreads () +{ + StgTSO *next; + StgTSO **prev = &nonmoving_old_threads; + for (StgTSO *t = nonmoving_old_threads; t != END_TSO_QUEUE; t = next) { + + next = t->global_link; + + // N.B. This thread is in old_threads, consequently we *know* it is in + // the snapshot and it is therefore safe to rely on the bitmap to + // determine its reachability. + if (nonmovingIsNowAlive((StgClosure*)t)) { + // alive + *prev = next; + + // move this thread onto threads list + t->global_link = nonmoving_threads; + nonmoving_threads = t; + } else { + // not alive (yet): leave this thread on the old_threads list + prev = &(t->global_link); + } + } +} + +void nonmovingResurrectThreads (struct MarkQueue_ *queue, StgTSO **resurrected_threads) +{ + StgTSO *next; + for (StgTSO *t = nonmoving_old_threads; t != END_TSO_QUEUE; t = next) { + next = t->global_link; + + switch (t->what_next) { + case ThreadKilled: + case ThreadComplete: + continue; + default: + markQueuePushClosure_(queue, (StgClosure*)t); + t->global_link = *resurrected_threads; + *resurrected_threads = t; + } + } +} + +#if defined(DEBUG) + +void printMarkQueueEntry (MarkQueueEnt *ent) +{ + switch(nonmovingMarkQueueEntryType(ent)) { + case MARK_CLOSURE: + debugBelch("Closure: "); + printClosure(ent->mark_closure.p); + break; + case MARK_ARRAY: + debugBelch("Array\n"); + break; + case NULL_ENTRY: + debugBelch("End of mark\n"); + break; + } +} + +void printMarkQueue (MarkQueue *q) +{ + debugBelch("======== MARK QUEUE ========\n"); + for (bdescr *block = q->blocks; block; block = block->link) { + MarkQueueBlock *queue = (MarkQueueBlock*)block->start; + for (uint32_t i = 0; i < queue->head; ++i) { + printMarkQueueEntry(&queue->entries[i]); + } + } + debugBelch("===== END OF MARK QUEUE ====\n"); +} + +#endif diff --git a/rts/sm/NonMovingMark.h b/rts/sm/NonMovingMark.h new file mode 100644 index 0000000000000000000000000000000000000000..fe150f47cb4971e575c0864b69f05e28d95e0293 --- /dev/null +++ b/rts/sm/NonMovingMark.h @@ -0,0 +1,205 @@ +/* ----------------------------------------------------------------------------- + * + * (c) The GHC Team, 1998-2018 + * + * Non-moving garbage collector and allocator: Mark phase + * + * ---------------------------------------------------------------------------*/ + +#pragma once + +#include "Hash.h" +#include "Task.h" +#include "NonMoving.h" + +#include "BeginPrivate.h" + +#include "Hash.h" + +enum EntryType { + NULL_ENTRY = 0, + MARK_CLOSURE, + MARK_ARRAY +}; + +/* Note [Origin references in the nonmoving collector] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * To implement indirection short-cutting and the selector optimisation the + * collector needs to know where it found references, so it can update the + * reference if it later turns out that points to an indirection. 
For this + * reason, each mark queue entry contains two things: + * + * - a pointer to the object to be marked (p), and + * + * - a pointer to the field where we found the reference (origin) + * + * Note that the origin pointer is an interior pointer: it points not to a + * valid closure (with info table pointer) but rather to a field inside a closure. + * Since such references can't be safely scavenged we establish the invariant + * that the origin pointer may only point to a field of an object living in the + * nonmoving heap, where no scavenging is needed. + * + */ + +typedef struct { + // Which kind of mark queue entry we have is determined by the low bits of + // the second word: they must be zero in the case of a mark_closure entry + // (since the second word of a mark_closure entry points to a pointer and + // pointers must be word-aligned). In the case of a mark_array we set them + // to 0x3 (the value of start_index is shifted to the left to accomodate + // this). null_entry where p==NULL is used to indicate the end of the queue. + union { + struct { + void *p; // must be NULL + } null_entry; + struct { + StgClosure *p; // the object to be marked + StgClosure **origin; // field where this reference was found. + // See Note [Origin references in the nonmoving collector] + } mark_closure; + struct { + const StgMutArrPtrs *array; + StgWord start_index; // start index is shifted to the left by 16 bits + } mark_array; + }; +} MarkQueueEnt; + +INLINE_HEADER enum EntryType nonmovingMarkQueueEntryType(MarkQueueEnt *ent) +{ + if (ent->null_entry.p == NULL) { + return NULL_ENTRY; + } else if (((uintptr_t) ent->mark_closure.origin & TAG_BITS) == 0) { + return MARK_CLOSURE; + } else { + ASSERT((ent->mark_array.start_index & TAG_BITS) == 0x3); + return MARK_ARRAY; + } +} + +typedef struct { + // index of first *unused* queue entry + uint32_t head; + + MarkQueueEnt entries[]; +} MarkQueueBlock; + +// How far ahead in mark queue to prefetch? +#define MARK_PREFETCH_QUEUE_DEPTH 5 + +/* The mark queue is not capable of concurrent read or write. + * + * invariants: + * + * a. top == blocks->start; + * b. there is always a valid MarkQueueChunk, although it may be empty + * (e.g. top->head == 0). + */ +typedef struct MarkQueue_ { + // A singly link-list of blocks, each containing a MarkQueueChunk. + bdescr *blocks; + + // Cached value of blocks->start. + MarkQueueBlock *top; + + // Is this a mark queue or a capability-local update remembered set? + bool is_upd_rem_set; + +#if MARK_PREFETCH_QUEUE_DEPTH > 0 + // A ring-buffer of entries which we will mark next + MarkQueueEnt prefetch_queue[MARK_PREFETCH_QUEUE_DEPTH]; + // The first free slot in prefetch_queue. + uint8_t prefetch_head; +#endif +} MarkQueue; + +/* While it shares its representation with MarkQueue, UpdRemSet differs in + * behavior when pushing; namely full chunks are immediately pushed to the + * global update remembered set, not accumulated into a chain. We make this + * distinction apparent in the types. 
+ */ +typedef struct { + MarkQueue queue; +} UpdRemSet; + +// Number of blocks to allocate for a mark queue +#define MARK_QUEUE_BLOCKS 16 + +// The length of MarkQueueBlock.entries +#define MARK_QUEUE_BLOCK_ENTRIES ((MARK_QUEUE_BLOCKS * BLOCK_SIZE - sizeof(MarkQueueBlock)) / sizeof(MarkQueueEnt)) + +extern bdescr *nonmoving_large_objects, *nonmoving_marked_large_objects, + *nonmoving_compact_objects, *nonmoving_marked_compact_objects; +extern memcount n_nonmoving_large_blocks, n_nonmoving_marked_large_blocks, + n_nonmoving_compact_blocks, n_nonmoving_marked_compact_blocks; + +extern StgTSO *nonmoving_old_threads; +extern StgWeak *nonmoving_old_weak_ptr_list; +extern StgTSO *nonmoving_threads; +extern StgWeak *nonmoving_weak_ptr_list; + +#if defined(DEBUG) +extern StgIndStatic *debug_caf_list_snapshot; +#endif + +extern MarkQueue *current_mark_queue; +extern bdescr *upd_rem_set_block_list; + + +void nonmovingMarkInitUpdRemSet(void); + +void init_upd_rem_set(UpdRemSet *rset); +void reset_upd_rem_set(UpdRemSet *rset); +void updateRemembSetPushClosure(Capability *cap, StgClosure *p); +void updateRemembSetPushThunk(Capability *cap, StgThunk *p); +void updateRemembSetPushTSO(Capability *cap, StgTSO *tso); +void updateRemembSetPushStack(Capability *cap, StgStack *stack); + +#if defined(THREADED_RTS) +void nonmovingFlushCapUpdRemSetBlocks(Capability *cap); +void nonmovingBeginFlush(Task *task); +bool nonmovingWaitForFlush(void); +void nonmovingFinishFlush(Task *task); +#endif + +void markQueueAddRoot(MarkQueue* q, StgClosure** root); + +void initMarkQueue(MarkQueue *queue); +void freeMarkQueue(MarkQueue *queue); +void nonmovingMark(struct MarkQueue_ *restrict queue); + +bool nonmovingTidyWeaks(struct MarkQueue_ *queue); +void nonmovingTidyThreads(void); +void nonmovingMarkDeadWeaks(struct MarkQueue_ *queue, StgWeak **dead_weak_ptr_list); +void nonmovingResurrectThreads(struct MarkQueue_ *queue, StgTSO **resurrected_threads); +bool nonmovingIsAlive(StgClosure *p); +void nonmovingMarkDeadWeak(struct MarkQueue_ *queue, StgWeak *w); +void nonmovingMarkLiveWeak(struct MarkQueue_ *queue, StgWeak *w); +void nonmovingAddUpdRemSetBlocks(struct MarkQueue_ *rset); + +void markQueuePush(MarkQueue *q, const MarkQueueEnt *ent); +void markQueuePushClosureGC(MarkQueue *q, StgClosure *p); +void markQueuePushClosure(MarkQueue *q, + StgClosure *p, + StgClosure **origin); +void markQueuePushClosure_(MarkQueue *q, StgClosure *p); +void markQueuePushThunkSrt(MarkQueue *q, const StgInfoTable *info); +void markQueuePushFunSrt(MarkQueue *q, const StgInfoTable *info); +void markQueuePushArray(MarkQueue *q, const StgMutArrPtrs *array, StgWord start_index); +void updateRemembSetPushThunkEager(Capability *cap, + const StgThunkInfoTable *orig_info, + StgThunk *thunk); + +INLINE_HEADER bool markQueueIsEmpty(MarkQueue *q) +{ + return (q->blocks == NULL) || (q->top->head == 0 && q->blocks->link == NULL); +} + +#if defined(DEBUG) + +void printMarkQueueEntry(MarkQueueEnt *ent); +void printMarkQueue(MarkQueue *q); + +#endif + +#include "EndPrivate.h" diff --git a/rts/sm/NonMovingScav.c b/rts/sm/NonMovingScav.c new file mode 100644 index 0000000000000000000000000000000000000000..9583c7baf9ec7140aafd95043aceedf23c936539 --- /dev/null +++ b/rts/sm/NonMovingScav.c @@ -0,0 +1,389 @@ +#include "Rts.h" +#include "RtsUtils.h" +#include "NonMoving.h" +#include "NonMovingScav.h" +#include "Capability.h" +#include "Scav.h" +#include "Evac.h" +#include "GCThread.h" // for GCUtils.h +#include "GCUtils.h" +#include "Printer.h" +#include 
"MarkWeak.h" // scavengeLiveWeak + +void +nonmovingScavengeOne (StgClosure *q) +{ + ASSERT(LOOKS_LIKE_CLOSURE_PTR(q)); + StgPtr p = (StgPtr)q; + const StgInfoTable *info = get_itbl(q); + const bool saved_eager_promotion = gct->eager_promotion; + + switch (info->type) { + + case MVAR_CLEAN: + case MVAR_DIRTY: + { + StgMVar *mvar = ((StgMVar *)p); + gct->eager_promotion = false; + evacuate((StgClosure **)&mvar->head); + evacuate((StgClosure **)&mvar->tail); + evacuate((StgClosure **)&mvar->value); + gct->eager_promotion = saved_eager_promotion; + if (gct->failed_to_evac) { + mvar->header.info = &stg_MVAR_DIRTY_info; + } else { + mvar->header.info = &stg_MVAR_CLEAN_info; + } + break; + } + + case TVAR: + { + StgTVar *tvar = ((StgTVar *)p); + gct->eager_promotion = false; + evacuate((StgClosure **)&tvar->current_value); + evacuate((StgClosure **)&tvar->first_watch_queue_entry); + gct->eager_promotion = saved_eager_promotion; + if (gct->failed_to_evac) { + tvar->header.info = &stg_TVAR_DIRTY_info; + } else { + tvar->header.info = &stg_TVAR_CLEAN_info; + } + break; + } + + case FUN_2_0: + scavenge_fun_srt(info); + evacuate(&((StgClosure *)p)->payload[1]); + evacuate(&((StgClosure *)p)->payload[0]); + break; + + case THUNK_2_0: + scavenge_thunk_srt(info); + evacuate(&((StgThunk *)p)->payload[1]); + evacuate(&((StgThunk *)p)->payload[0]); + break; + + case CONSTR_2_0: + evacuate(&((StgClosure *)p)->payload[1]); + evacuate(&((StgClosure *)p)->payload[0]); + break; + + case THUNK_1_0: + scavenge_thunk_srt(info); + evacuate(&((StgThunk *)p)->payload[0]); + break; + + case FUN_1_0: + scavenge_fun_srt(info); + FALLTHROUGH; + case CONSTR_1_0: + evacuate(&((StgClosure *)p)->payload[0]); + break; + + case THUNK_0_1: + scavenge_thunk_srt(info); + break; + + case FUN_0_1: + scavenge_fun_srt(info); + FALLTHROUGH; + case CONSTR_0_1: + break; + + case THUNK_0_2: + scavenge_thunk_srt(info); + break; + + case FUN_0_2: + scavenge_fun_srt(info); + FALLTHROUGH; + case CONSTR_0_2: + break; + + case THUNK_1_1: + scavenge_thunk_srt(info); + evacuate(&((StgThunk *)p)->payload[0]); + break; + + case FUN_1_1: + scavenge_fun_srt(info); + FALLTHROUGH; + case CONSTR_1_1: + evacuate(&q->payload[0]); + break; + + case FUN: + scavenge_fun_srt(info); + goto gen_obj; + + case THUNK: + { + scavenge_thunk_srt(info); + StgPtr end = (P_)((StgThunk *)p)->payload + info->layout.payload.ptrs; + for (p = (P_)((StgThunk *)p)->payload; p < end; p++) { + evacuate((StgClosure **)p); + } + break; + } + + case WEAK: + { + // We must evacuate the key since it may refer to an object in the + // moving heap which may be long gone by the time we call + // nonmovingTidyWeaks. 
+ StgWeak *weak = (StgWeak *) p; + gct->eager_promotion = true; + evacuate(&weak->key); + gct->eager_promotion = saved_eager_promotion; + goto gen_obj; + } + + gen_obj: + case CONSTR: + case CONSTR_NOCAF: + case PRIM: + { + StgPtr end = (P_)((StgClosure *)p)->payload + info->layout.payload.ptrs; + for (p = (P_)((StgClosure *)p)->payload; p < end; p++) { + evacuate((StgClosure **)p); + } + break; + } + + case BCO: { + StgBCO *bco = (StgBCO *)p; + evacuate((StgClosure **)&bco->instrs); + evacuate((StgClosure **)&bco->literals); + evacuate((StgClosure **)&bco->ptrs); + break; + } + + case MUT_VAR_CLEAN: + case MUT_VAR_DIRTY: + gct->eager_promotion = false; + evacuate(&((StgMutVar *)p)->var); + gct->eager_promotion = saved_eager_promotion; + if (gct->failed_to_evac) { + ((StgClosure *)q)->header.info = &stg_MUT_VAR_DIRTY_info; + } else { + ((StgClosure *)q)->header.info = &stg_MUT_VAR_CLEAN_info; + } + break; + + case BLOCKING_QUEUE: + { + StgBlockingQueue *bq = (StgBlockingQueue *)p; + + gct->eager_promotion = false; + evacuate(&bq->bh); + evacuate((StgClosure**)&bq->owner); + evacuate((StgClosure**)&bq->queue); + evacuate((StgClosure**)&bq->link); + gct->eager_promotion = saved_eager_promotion; + + if (gct->failed_to_evac) { + bq->header.info = &stg_BLOCKING_QUEUE_DIRTY_info; + } else { + bq->header.info = &stg_BLOCKING_QUEUE_CLEAN_info; + } + break; + } + + case THUNK_SELECTOR: + { + StgSelector *s = (StgSelector *)p; + evacuate(&s->selectee); + break; + } + + // A chunk of stack saved in a heap object + case AP_STACK: + { + StgAP_STACK *ap = (StgAP_STACK *)p; + + evacuate(&ap->fun); + scavenge_stack((StgPtr)ap->payload, (StgPtr)ap->payload + ap->size); + break; + } + + case PAP: + p = scavenge_PAP((StgPAP *)p); + break; + + case AP: + scavenge_AP((StgAP *)p); + break; + + case ARR_WORDS: + // nothing to follow + break; + + case MUT_ARR_PTRS_CLEAN: + case MUT_ARR_PTRS_DIRTY: + { + gct->eager_promotion = false; + scavenge_mut_arr_ptrs((StgMutArrPtrs*)p); + gct->eager_promotion = saved_eager_promotion; + if (gct->failed_to_evac) { + ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info; + } else { + ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_CLEAN_info; + } + gct->failed_to_evac = true; // always put it on the mutable list. + break; + } + + case MUT_ARR_PTRS_FROZEN_CLEAN: + case MUT_ARR_PTRS_FROZEN_DIRTY: + // follow everything + { + scavenge_mut_arr_ptrs((StgMutArrPtrs*)p); + + if (gct->failed_to_evac) { + ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_DIRTY_info; + } else { + ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_CLEAN_info; + } + break; + } + + case SMALL_MUT_ARR_PTRS_CLEAN: + case SMALL_MUT_ARR_PTRS_DIRTY: + // follow everything + { + StgPtr next = p + small_mut_arr_ptrs_sizeW((StgSmallMutArrPtrs*)p); + gct->eager_promotion = false; + for (p = (P_)((StgSmallMutArrPtrs *)p)->payload; p < next; p++) { + evacuate((StgClosure **)p); + } + gct->eager_promotion = saved_eager_promotion; + + if (gct->failed_to_evac) { + ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_DIRTY_info; + } else { + ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_CLEAN_info; + } + gct->failed_to_evac = true; // always put it on the mutable list. 
+ break; + } + + case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN: + case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY: + // follow everything + { + StgPtr next = p + small_mut_arr_ptrs_sizeW((StgSmallMutArrPtrs*)p); + for (p = (P_)((StgSmallMutArrPtrs *)p)->payload; p < next; p++) { + evacuate((StgClosure **)p); + } + + if (gct->failed_to_evac) { + ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_DIRTY_info; + } else { + ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_FROZEN_CLEAN_info; + } + break; + } + + case TSO: + { + scavengeTSO((StgTSO *)p); + break; + } + + case STACK: + { + StgStack *stack = (StgStack*)p; + + gct->eager_promotion = false; + scavenge_stack(stack->sp, stack->stack + stack->stack_size); + gct->eager_promotion = saved_eager_promotion; + stack->dirty = gct->failed_to_evac; + break; + } + + case MUT_PRIM: + { + StgPtr end = (P_)((StgClosure *)p)->payload + info->layout.payload.ptrs; + gct->eager_promotion = false; + for (p = (P_)((StgClosure *)p)->payload; p < end; p++) { + evacuate((StgClosure **)p); + } + gct->eager_promotion = saved_eager_promotion; + gct->failed_to_evac = true; // mutable + break; + } + + case TREC_CHUNK: + { + StgWord i; + StgTRecChunk *tc = ((StgTRecChunk *) p); + TRecEntry *e = &(tc -> entries[0]); + gct->eager_promotion = false; + evacuate((StgClosure **)&tc->prev_chunk); + for (i = 0; i < tc -> next_entry_idx; i ++, e++ ) { + evacuate((StgClosure **)&e->tvar); + evacuate((StgClosure **)&e->expected_value); + evacuate((StgClosure **)&e->new_value); + } + gct->eager_promotion = saved_eager_promotion; + gct->failed_to_evac = true; // mutable + break; + } + + case IND: + case BLACKHOLE: + case IND_STATIC: + evacuate(&((StgInd *)p)->indirectee); + break; + + case COMPACT_NFDATA: + scavenge_compact((StgCompactNFData*)p); + break; + + default: + barf("nonmoving scavenge: unimplemented/strange closure type %d @ %p", + info->type, p); + } + + if (gct->failed_to_evac) { + // Mutable object or points to a younger object, add to the mut_list + gct->failed_to_evac = false; + if (oldest_gen->no > 0) { + recordMutableGen_GC(q, oldest_gen->no); + } + } +} + +/* Scavenge objects evacuated into a nonmoving segment by a minor GC */ +void +scavengeNonmovingSegment (struct NonmovingSegment *seg) +{ + const StgWord blk_size = nonmovingSegmentBlockSize(seg); + gct->evac_gen_no = oldest_gen->no; + gct->failed_to_evac = false; + + // scavenge objects between scan and free_ptr whose bitmap bits are 0 + bdescr *seg_block = Bdescr((P_)seg); + + ASSERT(seg_block->u.scan >= (P_)nonmovingSegmentGetBlock(seg, 0)); + ASSERT(seg_block->u.scan <= (P_)nonmovingSegmentGetBlock(seg, seg->next_free)); + + StgPtr scan_end = (P_)nonmovingSegmentGetBlock(seg, seg->next_free); + if (seg_block->u.scan == scan_end) + return; + + nonmoving_block_idx p_idx = nonmovingGetBlockIdx(seg_block->u.scan); + while (seg_block->u.scan < scan_end) { + StgClosure *p = (StgClosure*)seg_block->u.scan; + + // bit set = was allocated in a previous GC, no need to scavenge + // bit not set = new allocation, so scavenge + if (nonmovingGetMark(seg, p_idx) == 0) { + nonmovingScavengeOne(p); + } + + p_idx++; + seg_block->u.scan = (P_)(((uint8_t*)seg_block->u.scan) + blk_size); + } +} diff --git a/rts/sm/NonMovingScav.h b/rts/sm/NonMovingScav.h new file mode 100644 index 0000000000000000000000000000000000000000..021385e1e99e35d2fb0e9c70559719e9d022423e --- /dev/null +++ b/rts/sm/NonMovingScav.h @@ -0,0 +1,10 @@ +#pragma once + +#include "NonMoving.h" + +#include "BeginPrivate.h" + +void 
nonmovingScavengeOne(StgClosure *p); +void scavengeNonmovingSegment(struct NonmovingSegment *seg); + +#include "EndPrivate.h" diff --git a/rts/sm/NonMovingShortcut.c b/rts/sm/NonMovingShortcut.c new file mode 100644 index 0000000000000000000000000000000000000000..83c4857677b4fd0327b130d844ad5a6c94a41f95 --- /dev/null +++ b/rts/sm/NonMovingShortcut.c @@ -0,0 +1,326 @@ +/* ----------------------------------------------------------------------------- + * + * (c) The GHC Team, 1998-2019 + * + * Non-moving garbage collector and allocator: + * Indirection short-cutting and the selector optimisation + * + * ---------------------------------------------------------------------------*/ + +#include "Rts.h" +#include "GC.h" +#include "SMPClosureOps.h" +#include "NonMovingMark.h" +#include "NonMovingShortcut.h" +#include "Printer.h" + +#define MAX_THUNK_SELECTOR_DEPTH 16 + +//#define SELECTOR_OPT_DEBUG + +#if defined(SELECTOR_OPT_DEBUG) +static void +print_selector_chain(StgClosure *p) +{ + debugBelch("Selector chain: %p", (void*)p); + StgClosure *next = p->payload[0]; + while (next != NULL) { + debugBelch(", %p", next); + next = next->payload[0]; + } + debugBelch("\n"); +} +#endif + +static void +update_selector_chain( + StgClosure *chain, + StgClosure **origin, + StgSelector * const p0, + StgClosure * const val +) { + ASSERT(val != NULL); + + // Make sure we don't introduce non-moving-to-moving pointers here. + ASSERT(isNonmovingClosure(val)); + + // This case we can't handle because we don't know info ptr of the closure + // before we locked it. + ASSERT(chain != val); + +#if defined(SELECTOR_OPT_DEBUG) + if (chain != NULL) { + print_selector_chain(chain); + debugBelch("Value: "); + printClosure(val); + } +#endif + + while (chain) { + // debugBelch("chain: %p\n", (void*)chain); + + StgClosure *next = chain->payload[0]; + + // We only update closures in the non-moving heap + ASSERT(isNonmovingClosure(chain)); + + ((StgInd*)chain)->indirectee = val; + unlockClosure((StgClosure*)chain, &stg_IND_info); + + chain = next; + } + + if (origin != NULL && (StgClosure*)p0 != val) { + cas((StgVolatilePtr)origin, (StgWord)p0, (StgWord)val); + } +} + +// Returns value of the selector thunk. The value is a non-moving closure. If +// it's not possible to evaluate the selector thunk the return value will be the +// selector itself. +static StgClosure* +nonmoving_eval_thunk_selector_( + MarkQueue *queue, + StgSelector * const p0, + StgClosure ** const origin, + int depth +) { + // This function should only be called on non-moving objects. + ASSERT(HEAP_ALLOCED_GC((P_)p0) && isNonmovingClosure((StgClosure*)p0)); + + markQueuePushClosure(queue, (StgClosure*)p0, NULL); + + // INVARIANT: A non-moving object. Locked (below). + StgClosure *p = (StgClosure*)p0; + + // Chain of non-moving selectors to update. These will be INDs to `p` when + // we reach to a value. INVARIANT: All objects in the chain are locked, and + // in the non-moving heap. + StgClosure *chain = NULL; + + // Variables to update: p. +selector_changed: + ; + + // debugBelch("Selector changed: %p\n", (void*)p); + + // Lock the selector to avoid concurrent modification in mutators + const StgInfoTable *selector_info_ptr = lockClosure((StgClosure*)p); + StgInfoTable *selector_info_tbl = INFO_PTR_TO_STRUCT(selector_info_ptr); + + if (selector_info_tbl->type != THUNK_SELECTOR) { + // Selector updated in the meantime, or we reached to a value. Update + // the chain. 
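        // In this bail-out path `p` itself is the result: update_selector_chain
        // below overwrites every selector accumulated in `chain` with an
        // indirection to `p` and, when an origin was supplied and `p` differs
        // from `p0`, tries to CAS the origin field over to `p` as well.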
+ unlockClosure(p, selector_info_ptr); + update_selector_chain(chain, origin, p0, p); + return p; + } + + // The closure is locked and it's a selector thunk. If the selectee is a + // CONSTR we do the selection here and the In the selected value will be the + // value of this selector thunk. + // + // Two cases: + // + // - If the selected value is also a selector thunk, then we loop and + // evaluate it. The final value will be the value of both the current + // selector and the selected value (which is also a selector thunk). + // + // - If the selectee is a selector thunk, we recursively evaluate it (up to + // a certain depth, specified with MAX_THUNK_SELECTOR_DEPTH), then do the + // selection on the value of it. + + // + // Do the selection + // + + uint32_t field = selector_info_tbl->layout.selector_offset; + StgClosure *selectee = UNTAG_CLOSURE(((StgSelector*)p)->selectee); + + // Variables to update: selectee +selectee_changed: + // debugBelch("Selectee changed: %p\n", (void*)selectee); + + if (!isNonmovingClosure(selectee)) { + // The selectee is a moving object, and it may be moved by a concurrent + // minor GC while we read the info table and fields, so don't try to + // read the fields, just update the chain. + unlockClosure(p, selector_info_ptr); + update_selector_chain(chain, origin, p0, p); + return p; + } + + // Selectee is a non-moving object, mark it. + markQueuePushClosure(queue, selectee, NULL); + + const StgInfoTable *selectee_info_tbl = get_itbl(selectee); + switch (selectee_info_tbl->type) { + case WHITEHOLE: { + // Probably a loop. Abort. + unlockClosure(p, selector_info_ptr); + update_selector_chain(chain, origin, p0, p); + return p; + } + + case CONSTR: + case CONSTR_1_0: + case CONSTR_0_1: + case CONSTR_2_0: + case CONSTR_1_1: + case CONSTR_0_2: + case CONSTR_NOCAF: { + // Selectee is a constructor in the non-moving heap. + // Select the field. + + // Check that the size is in range. + ASSERT(field < (StgWord32)(selectee_info_tbl->layout.payload.ptrs + + selectee_info_tbl->layout.payload.nptrs)); + + StgClosure *val = UNTAG_CLOSURE(selectee->payload[field]); + + // `val` is the value of this selector thunk. We need to check a + // few cases: + // + // - If `val` is in the moving heap, we stop here and update the + // chain. All updated objects should be added to the mut_list. + // (TODO (osa): What happens if the value is evacuated as we do + // this?) + // + // - If `val` is in the non-moving heap, we check if it's also a + // selector. If it is we add it to the chain and loop. + + // Follow indirections. Variables to update: `val`. + val_changed: + if (!isNonmovingClosure(val)) { + // The selected value is a moving object, so we won't be + // updating the chain to this object. + unlockClosure(p, selector_info_ptr); + update_selector_chain(chain, origin, p0, p); + return p; + } + + switch (get_itbl(val)->type) { + case IND: + case IND_STATIC: + ; + // Follow the indirection + StgClosure *indirectee = UNTAG_CLOSURE(((StgInd*)val)->indirectee); + if (isNonmovingClosure(indirectee)) { + val = UNTAG_CLOSURE(((StgInd*)val)->indirectee); + goto val_changed; + } else { + unlockClosure(p, selector_info_ptr); + update_selector_chain(chain, origin, p0, p); + return p; + } + case THUNK_SELECTOR: + // Value of the selector thunk is again a selector thunk in the + // non-moving heap. Add the current selector to the chain and + // loop. 
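                // The chain is threaded through payload[0] of each locked
                // selector: while the closure is locked nobody else inspects
                // its selectee field, so that word can double as the link
                // until update_selector_chain turns the closure into an IND.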
+ p->payload[0] = chain; + chain = p; + p = val; + goto selector_changed; + default: + // Found a value, add the current selector to the chain and + // update it. + p->payload[0] = chain; + chain = p; + update_selector_chain(chain, origin, p0, val); + return val; + } + } + + case IND: + case IND_STATIC: { + StgClosure *indirectee = UNTAG_CLOSURE(((StgInd *)selectee)->indirectee); + if (isNonmovingClosure(indirectee)) { + selectee = indirectee; + goto selectee_changed; + } else { + unlockClosure(p, selector_info_ptr); + update_selector_chain(chain, origin, p0, p); + return p; + } + } + + case BLACKHOLE: { + StgClosure *indirectee = ((StgInd*)selectee)->indirectee; + + if (!isNonmovingClosure(UNTAG_CLOSURE(indirectee))) { + unlockClosure(p, selector_info_ptr); + update_selector_chain(chain, origin, p0, p); + return p; + } + + // Establish whether this BH has been updated, and is now an + // indirection, as in evacuate(). + if (GET_CLOSURE_TAG(indirectee) == 0) { + const StgInfoTable *i = indirectee->header.info; + if (i == &stg_TSO_info + || i == &stg_WHITEHOLE_info + || i == &stg_BLOCKING_QUEUE_CLEAN_info + || i == &stg_BLOCKING_QUEUE_DIRTY_info) { + unlockClosure(p, selector_info_ptr); + update_selector_chain(chain, origin, p0, p); + return (StgClosure*)p; + } + ASSERT(i != &stg_IND_info); // TODO not sure about this part + } + + // It's an indirection, follow it. + selectee = UNTAG_CLOSURE(indirectee); + goto selectee_changed; + } + + case AP: + case AP_STACK: + case THUNK: + case THUNK_1_0: + case THUNK_0_1: + case THUNK_2_0: + case THUNK_1_1: + case THUNK_0_2: + case THUNK_STATIC: { + // Not evaluated yet + unlockClosure(p, selector_info_ptr); + update_selector_chain(chain, origin, p0, p); + return (StgClosure*)p; + } + + case THUNK_SELECTOR: { + // Selectee is a selector thunk. Evaluate it if we haven't reached + // the recursion limit yet. 
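            // Note that the recursive call passes a NULL origin: we don't want
            // the callee to update our (still locked) selectee field; instead
            // we compare its result against the old selectee ourselves below.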
+ if (depth < MAX_THUNK_SELECTOR_DEPTH) { + StgClosure *new_selectee = + UNTAG_CLOSURE(nonmoving_eval_thunk_selector_( + queue, (StgSelector*)selectee, NULL, depth+1)); + ASSERT(isNonmovingClosure(new_selectee)); + if (selectee == new_selectee) { + unlockClosure(p, selector_info_ptr); + update_selector_chain(chain, origin, p0, p); + return (StgClosure*)p; + } else { + selectee = new_selectee; + goto selectee_changed; + } + } else { + // Recursion limit reached + unlockClosure(p, selector_info_ptr); + update_selector_chain(chain, origin, p0, p); + return (StgClosure*)p; + } + } + + default: { + barf("nonmoving_eval_thunk_selector: strange selectee %d", + (int)(selectee_info_tbl->type)); + } + } +} + +void +nonmoving_eval_thunk_selector(MarkQueue *queue, StgSelector *p, StgClosure **origin) +{ + nonmoving_eval_thunk_selector_(queue, p, origin, 0); +} diff --git a/rts/sm/NonMovingShortcut.h b/rts/sm/NonMovingShortcut.h new file mode 100644 index 0000000000000000000000000000000000000000..72297884aacdc58eccb7734e38b0ae38123cc329 --- /dev/null +++ b/rts/sm/NonMovingShortcut.h @@ -0,0 +1,17 @@ +/* ----------------------------------------------------------------------------- + * + * (c) The GHC Team, 1998-2019 + * + * Non-moving garbage collector and allocator: + * Indirection short-cutting and the selector optimisation + * + * ---------------------------------------------------------------------------*/ + +#pragma once + +#include "BeginPrivate.h" + +void +nonmoving_eval_thunk_selector(MarkQueue *queue, StgSelector *p, StgClosure **origin); + +#include "EndPrivate.h" diff --git a/rts/sm/NonMovingSweep.c b/rts/sm/NonMovingSweep.c new file mode 100644 index 0000000000000000000000000000000000000000..cf5fcd70d7650225dfc3ee86a81d8425c1b0b699 --- /dev/null +++ b/rts/sm/NonMovingSweep.c @@ -0,0 +1,402 @@ +/* ----------------------------------------------------------------------------- + * + * (c) The GHC Team, 1998-2018 + * + * Non-moving garbage collector and allocator: Sweep phase + * + * ---------------------------------------------------------------------------*/ + +#include "Rts.h" +#include "NonMovingSweep.h" +#include "NonMoving.h" +#include "NonMovingMark.h" // for nonmovingIsAlive +#include "Capability.h" +#include "GCThread.h" // for GCUtils.h +#include "GCUtils.h" +#include "Storage.h" +#include "Trace.h" +#include "StableName.h" +#include "CNF.h" // compactFree + +// On which list should a particular segment be placed? +enum SweepResult { + SEGMENT_FREE, // segment is empty: place on free list + SEGMENT_PARTIAL, // segment is partially filled: place on active list + SEGMENT_FILLED // segment is full: place on filled list +}; + +// Determine which list a marked segment should be placed on and initialize +// next_free indices as appropriate. 
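// For example, writing the current epoch's mark value as M, a bitmap of
//   [ M, 0, M, 0, 0 ]
// is classified as SEGMENT_PARTIAL with next_free = next_free_snap = 1 and all
// dead marks cleared, while an all-zero bitmap yields SEGMENT_FREE and an
// all-M bitmap yields SEGMENT_FILLED.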
+GNUC_ATTR_HOT static enum SweepResult +nonmovingSweepSegment(struct NonmovingSegment *seg) +{ + bool found_free = false; + bool found_live = false; + + for (nonmoving_block_idx i = 0; + i < nonmovingSegmentBlockCount(seg); + ++i) + { + if (seg->bitmap[i] == nonmovingMarkEpoch) { + found_live = true; + } else if (!found_free) { + found_free = true; + seg->next_free = i; + nonmovingSegmentInfo(seg)->next_free_snap = i; + Bdescr((P_)seg)->u.scan = (P_)nonmovingSegmentGetBlock(seg, i); + seg->bitmap[i] = 0; + } else { + seg->bitmap[i] = 0; + } + + if (found_free && found_live) { + // zero the remaining dead object's mark bits + for (; i < nonmovingSegmentBlockCount(seg); ++i) { + if (seg->bitmap[i] != nonmovingMarkEpoch) { + seg->bitmap[i] = 0; + } + } + return SEGMENT_PARTIAL; + } + } + + if (found_live) { + return SEGMENT_FILLED; + } else { + ASSERT(seg->next_free == 0); + ASSERT(nonmovingSegmentInfo(seg)->next_free_snap == 0); + return SEGMENT_FREE; + } +} + +#if defined(DEBUG) + +void nonmovingGcCafs() +{ + uint32_t i = 0; + StgIndStatic *next; + + for (StgIndStatic *caf = debug_caf_list_snapshot; + caf != (StgIndStatic*) END_OF_CAF_LIST; + caf = next) + { + next = (StgIndStatic*)caf->saved_info; + + const StgInfoTable *info = get_itbl((StgClosure*)caf); + ASSERT(info->type == IND_STATIC); + + StgWord flag = ((StgWord) caf->static_link) & STATIC_BITS; + if (flag != 0 && flag != static_flag) { + debugTrace(DEBUG_gccafs, "CAF gc'd at 0x%p", caf); + SET_INFO((StgClosure*)caf, &stg_GCD_CAF_info); // stub it + } else { + // CAF is alive, move it back to the debug_caf_list + ++i; + debugTrace(DEBUG_gccafs, "CAF alive at 0x%p", caf); + ACQUIRE_SM_LOCK; // debug_caf_list is global, locked by sm_mutex + caf->saved_info = (const StgInfoTable*)debug_caf_list; + debug_caf_list = caf; + RELEASE_SM_LOCK; + } + } + + debugTrace(DEBUG_gccafs, "%d CAFs live", i); + debug_caf_list_snapshot = (StgIndStatic*)END_OF_CAF_LIST; +} + +static void +clear_segment(struct NonmovingSegment* seg) +{ + size_t end = ((size_t)seg) + NONMOVING_SEGMENT_SIZE; + memset(&seg->bitmap, 0, end - (size_t)&seg->bitmap); +} + +static void +clear_segment_free_blocks(struct NonmovingSegment* seg) +{ + unsigned int block_size = nonmovingSegmentBlockSize(seg); + for (unsigned int p_idx = 0; p_idx < nonmovingSegmentBlockCount(seg); ++p_idx) { + // after mark, so bit not set == dead + if (nonmovingGetMark(seg, p_idx) == 0) { + memset(nonmovingSegmentGetBlock(seg, p_idx), 0, block_size); + } + } +} + +#endif + +GNUC_ATTR_HOT void nonmovingSweep(void) +{ + while (nonmovingHeap.sweep_list) { + struct NonmovingSegment *seg = nonmovingHeap.sweep_list; + + // Pushing the segment to one of the free/active/filled segments + // updates the link field, so update sweep_list here + nonmovingHeap.sweep_list = seg->link; + + enum SweepResult ret = nonmovingSweepSegment(seg); + + switch (ret) { + case SEGMENT_FREE: + IF_DEBUG(sanity, clear_segment(seg)); + nonmovingPushFreeSegment(seg); + break; + case SEGMENT_PARTIAL: + IF_DEBUG(sanity, clear_segment_free_blocks(seg)); + nonmovingPushActiveSegment(seg); + break; + case SEGMENT_FILLED: + nonmovingPushFilledSegment(seg); + break; + default: + barf("nonmovingSweep: weird sweep return: %d\n", ret); + } + } +} + +/* Must a closure remain on the mutable list? + * + * A closure must remain if any of the following applies: + * + * 1. it contains references to a younger generation + * 2. it's a mutable closure (e.g. 
mutable array or MUT_PRIM) + */ +static bool is_closure_clean(StgClosure *p) +{ + const StgInfoTable *info = get_itbl(p); + +#define CLEAN(ptr) (!HEAP_ALLOCED((StgClosure*) ptr) || Bdescr((StgPtr) ptr)->gen == oldest_gen) + + switch (info->type) { + case MVAR_CLEAN: + case MVAR_DIRTY: + { + StgMVar *mvar = ((StgMVar *)p); + if (!CLEAN(mvar->head)) goto dirty_MVAR; + if (!CLEAN(mvar->tail)) goto dirty_MVAR; + if (!CLEAN(mvar->value)) goto dirty_MVAR; + mvar->header.info = &stg_MVAR_CLEAN_info; + return true; + +dirty_MVAR: + mvar->header.info = &stg_MVAR_DIRTY_info; + return false; + } + + case TVAR: + { + StgTVar *tvar = ((StgTVar *)p); + if (!CLEAN(tvar->current_value)) goto dirty_TVAR; + if (!CLEAN(tvar->first_watch_queue_entry)) goto dirty_TVAR; + tvar->header.info = &stg_TVAR_CLEAN_info; + return true; + +dirty_TVAR: + tvar->header.info = &stg_TVAR_DIRTY_info; + return false; + } + + case THUNK: + case THUNK_1_0: + case THUNK_0_1: + case THUNK_1_1: + case THUNK_0_2: + case THUNK_2_0: + { + StgPtr end = (StgPtr)((StgThunk *)p)->payload + info->layout.payload.ptrs; + for (StgPtr q = (StgPtr)((StgThunk *)p)->payload; q < end; q++) { + if (!CLEAN(*q)) return false; + } + return true; + } + + case FUN: + case FUN_1_0: // hardly worth specialising these guys + case FUN_0_1: + case FUN_1_1: + case FUN_0_2: + case FUN_2_0: + case CONSTR: + case CONSTR_NOCAF: + case CONSTR_1_0: + case CONSTR_0_1: + case CONSTR_1_1: + case CONSTR_0_2: + case CONSTR_2_0: + case PRIM: + { + StgPtr end = (StgPtr)((StgClosure *)p)->payload + info->layout.payload.ptrs; + for (StgPtr q = (StgPtr)((StgClosure *)p)->payload; q < end; q++) { + if (!CLEAN(*q)) return false; + } + return true; + } + + case WEAK: + return false; // TODO + + case MUT_VAR_CLEAN: + case MUT_VAR_DIRTY: + if (!CLEAN(((StgMutVar *)p)->var)) { + p->header.info = &stg_MUT_VAR_DIRTY_info; + return false; + } else { + p->header.info = &stg_MUT_VAR_CLEAN_info; + return true; + } + + case BLOCKING_QUEUE: + { + StgBlockingQueue *bq = (StgBlockingQueue *)p; + + if (!CLEAN(bq->bh)) goto dirty_BLOCKING_QUEUE; + if (!CLEAN(bq->owner)) goto dirty_BLOCKING_QUEUE; + if (!CLEAN(bq->queue)) goto dirty_BLOCKING_QUEUE; + if (!CLEAN(bq->link)) goto dirty_BLOCKING_QUEUE; + bq->header.info = &stg_BLOCKING_QUEUE_CLEAN_info; + return true; + +dirty_BLOCKING_QUEUE: + bq->header.info = &stg_BLOCKING_QUEUE_DIRTY_info; + return false; + } + + case THUNK_SELECTOR: + return CLEAN(((StgSelector *) p)->selectee); + + case ARR_WORDS: + return true; + + default: + // TODO: the rest + return false; + } +#undef CLEAN +} + +/* N.B. This happens during the pause so we own all capabilities. */ +void nonmovingSweepMutLists() +{ + for (uint32_t n = 0; n < n_capabilities; n++) { + Capability *cap = capabilities[n]; + bdescr *old_mut_list = cap->mut_lists[oldest_gen->no]; + cap->mut_lists[oldest_gen->no] = allocBlockOnNode_sync(cap->node); + for (bdescr *bd = old_mut_list; bd; bd = bd->link) { + for (StgPtr p = bd->start; p < bd->free; p++) { + StgClosure **q = (StgClosure**)p; + if (nonmovingIsAlive(*q) && !is_closure_clean(*q)) { + recordMutableCap(*q, cap, oldest_gen->no); + } + } + } + freeChain_lock(old_mut_list); + } +} + +/* A variant of freeChain_lock that will only hold the lock for at most max_dur + * freed blocks to ensure that we don't starve other lock users (e.g. the + * mutator). 
+ */ +static void freeChain_lock_max(bdescr *bd, int max_dur) +{ + ACQUIRE_SM_LOCK; + bdescr *next_bd; + int i = 0; + while (bd != NULL) { + next_bd = bd->link; + freeGroup(bd); + bd = next_bd; + if (i == max_dur) { + RELEASE_SM_LOCK; + yieldThread(); + ACQUIRE_SM_LOCK; + i = 0; + } + i++; + } + RELEASE_SM_LOCK; +} + +void nonmovingSweepLargeObjects() +{ + freeChain_lock_max(nonmoving_large_objects, 10000); + nonmoving_large_objects = nonmoving_marked_large_objects; + n_nonmoving_large_blocks = n_nonmoving_marked_large_blocks; + nonmoving_marked_large_objects = NULL; + n_nonmoving_marked_large_blocks = 0; +} + +void nonmovingSweepCompactObjects() +{ + bdescr *next; + ACQUIRE_SM_LOCK; + for (bdescr *bd = nonmoving_compact_objects; bd; bd = next) { + next = bd->link; + compactFree(((StgCompactNFDataBlock*)bd->start)->owner); + } + RELEASE_SM_LOCK; + nonmoving_compact_objects = nonmoving_marked_compact_objects; + n_nonmoving_compact_blocks = n_nonmoving_marked_compact_blocks; + nonmoving_marked_compact_objects = NULL; + n_nonmoving_marked_compact_blocks = 0; +} + +// Helper for nonmovingSweepStableNameTable. Essentially nonmovingIsAlive, +// but works when the object died in moving heap, see +// nonmovingSweepStableNameTable +static bool is_alive(StgClosure *p) +{ + if (!HEAP_ALLOCED_GC(p)) { + return true; + } + + if (nonmovingClosureBeingSwept(p)) { + return nonmovingIsAlive(p); + } else { + // We don't want to sweep any stable names which weren't in the + // set of segments that we swept. + // See Note [Sweeping stable names in the concurrent collector] + return true; + } +} + +void nonmovingSweepStableNameTable() +{ + // See comments in gcStableTables + + /* Note [Sweeping stable names in the concurrent collector] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * When collecting concurrently we need to take care to avoid freeing + * stable names the we didn't sweep this collection cycle. For instance, + * consider the following situation: + * + * 1. We take a snapshot and start collection + * 2. A mutator allocates a new object, then makes a stable name for it + * 3. The mutator performs a minor GC and promotes the new object to the nonmoving heap + * 4. The GC thread gets to the sweep phase and, when traversing the stable + * name table, finds the new object unmarked. It then assumes that the + * object is dead and removes the stable name from the stable name table. + * + */ + + // FIXME: We can't use nonmovingIsAlive here without first using isAlive: + // a stable name can die during moving heap collection and we can't use + // nonmovingIsAlive on those objects. Inefficient. 
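    // The loop below distinguishes two cases: if the StableName object itself
    // (sn_obj) is dead the whole table entry is freed, whereas if only the
    // referenced heap object (addr) is dead we merely clear addr so that the
    // entry no longer hands out a dangling pointer.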
+ + stableNameLock(); + FOR_EACH_STABLE_NAME( + p, { + if (p->sn_obj != NULL) { + if (!is_alive((StgClosure*)p->sn_obj)) { + p->sn_obj = NULL; // Just to make an assertion happy + freeSnEntry(p); + } else if (p->addr != NULL) { + if (!is_alive((StgClosure*)p->addr)) { + p->addr = NULL; + } + } + } + }); + stableNameUnlock(); +} diff --git a/rts/sm/NonMovingSweep.h b/rts/sm/NonMovingSweep.h new file mode 100644 index 0000000000000000000000000000000000000000..24e9eccd5e77b6482995b09841aabea6e5b25b03 --- /dev/null +++ b/rts/sm/NonMovingSweep.h @@ -0,0 +1,31 @@ +/* ----------------------------------------------------------------------------- + * + * (c) The GHC Team, 1998-2018 + * + * Non-moving garbage collector and allocator: Sweep phase + * + * ---------------------------------------------------------------------------*/ + +#pragma once + +#include "NonMoving.h" +#include "Hash.h" + +GNUC_ATTR_HOT void nonmovingSweep(void); + +// Remove unmarked entries in oldest generation mut_lists +void nonmovingSweepMutLists(void); + +// Remove unmarked entries in oldest generation scavenged_large_objects list +void nonmovingSweepLargeObjects(void); + +// Remove unmarked entries in oldest generation compact_objects list +void nonmovingSweepCompactObjects(void); + +// Remove dead entries in the stable name table +void nonmovingSweepStableNameTable(void); + +#if defined(DEBUG) +// The non-moving equivalent of the moving collector's gcCAFs. +void nonmovingGcCafs(void); +#endif diff --git a/rts/sm/Sanity.c b/rts/sm/Sanity.c index 3585bd93b4fabb9768900505b95ec146a43528f8..23f0fc57b4ad99b427ce0f68951653d1a0f38422 100644 --- a/rts/sm/Sanity.c +++ b/rts/sm/Sanity.c @@ -29,6 +29,8 @@ #include "Arena.h" #include "RetainerProfile.h" #include "CNF.h" +#include "sm/NonMoving.h" +#include "sm/NonMovingMark.h" #include "Profiling.h" // prof_arena /* ----------------------------------------------------------------------------- @@ -40,6 +42,9 @@ static void checkLargeBitmap ( StgPtr payload, StgLargeBitmap*, uint32_t ); static void checkClosureShallow ( const StgClosure * ); static void checkSTACK (StgStack *stack); +static W_ countNonMovingSegments ( struct NonmovingSegment *segs ); +static W_ countNonMovingHeap ( struct NonmovingHeap *heap ); + /* ----------------------------------------------------------------------------- Check stack sanity -------------------------------------------------------------------------- */ @@ -478,6 +483,41 @@ void checkHeapChain (bdescr *bd) } } +/* ----------------------------------------------------------------------------- + * Check nonmoving heap sanity + * + * After a concurrent sweep the nonmoving heap can be checked for validity. 
+ * -------------------------------------------------------------------------- */ + +static void checkNonmovingSegments (struct NonmovingSegment *seg) +{ + while (seg != NULL) { + const nonmoving_block_idx count = nonmovingSegmentBlockCount(seg); + for (nonmoving_block_idx i=0; i < count; i++) { + if (seg->bitmap[i] == nonmovingMarkEpoch) { + StgPtr p = nonmovingSegmentGetBlock(seg, i); + checkClosure((StgClosure *) p); + } else if (i < nonmovingSegmentInfo(seg)->next_free_snap){ + seg->bitmap[i] = 0; + } + } + seg = seg->link; + } +} + +void checkNonmovingHeap (const struct NonmovingHeap *heap) +{ + for (unsigned int i=0; i < NONMOVING_ALLOCA_CNT; i++) { + const struct NonmovingAllocator *alloc = heap->allocators[i]; + checkNonmovingSegments(alloc->filled); + checkNonmovingSegments(alloc->active); + for (unsigned int cap=0; cap < n_capabilities; cap++) { + checkNonmovingSegments(alloc->current[cap]); + } + } +} + + void checkHeapChunk(StgPtr start, StgPtr end) { @@ -632,9 +672,9 @@ checkGlobalTSOList (bool checkTSOs) stack = tso->stackobj; while (1) { - if (stack->dirty & 1) { - ASSERT(Bdescr((P_)stack)->gen_no == 0 || (stack->dirty & TSO_MARKED)); - stack->dirty &= ~TSO_MARKED; + if (stack->dirty & STACK_DIRTY) { + ASSERT(Bdescr((P_)stack)->gen_no == 0 || (stack->dirty & STACK_SANE)); + stack->dirty &= ~STACK_SANE; } frame = (StgUnderflowFrame*) (stack->stack + stack->stack_size - sizeofW(StgUnderflowFrame)); @@ -669,7 +709,7 @@ checkMutableList( bdescr *mut_bd, uint32_t gen ) ((StgTSO *)p)->flags |= TSO_MARKED; break; case STACK: - ((StgStack *)p)->dirty |= TSO_MARKED; + ((StgStack *)p)->dirty |= STACK_SANE; break; } } @@ -766,16 +806,42 @@ static void checkGeneration (generation *gen, uint32_t n; gen_workspace *ws; - ASSERT(countBlocks(gen->blocks) == gen->n_blocks); + //ASSERT(countBlocks(gen->blocks) == gen->n_blocks); ASSERT(countBlocks(gen->large_objects) == gen->n_large_blocks); #if defined(THREADED_RTS) - // heap sanity checking doesn't work with SMP, because we can't - // zero the slop (see Updates.h). However, we can sanity-check - // the heap after a major gc, because there is no slop. + // heap sanity checking doesn't work with SMP for two reasons: + // * we can't zero the slop (see Updates.h). However, we can sanity-check + // the heap after a major gc, because there is no slop. + // + // * the nonmoving collector may be mutating its large object lists, unless we + // were in fact called by the nonmoving collector. if (!after_major_gc) return; #endif + if (RtsFlags.GcFlags.useNonmoving && gen == oldest_gen) { + ASSERT(countNonMovingSegments(nonmovingHeap.free) == (W_) nonmovingHeap.n_free * NONMOVING_SEGMENT_BLOCKS); + ASSERT(countBlocks(nonmoving_large_objects) == n_nonmoving_large_blocks); + ASSERT(countBlocks(nonmoving_marked_large_objects) == n_nonmoving_marked_large_blocks); + + // Compact regions + // Accounting here is tricky due to the fact that the CNF allocation + // code modifies generation->n_compact_blocks directly. However, most + // objects being swept by the nonmoving GC are tracked in + // nonmoving_*_compact_objects. Consequently we can only maintain a very loose + // sanity invariant here. 
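        // Concretely: the blocks found by walking the three compact-object
        // lists must add up to the three corresponding block counters, even
        // though we cannot say how they are distributed among them.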
+ uint32_t counted_cnf_blocks = 0; + counted_cnf_blocks += countCompactBlocks(nonmoving_marked_compact_objects); + counted_cnf_blocks += countCompactBlocks(nonmoving_compact_objects); + counted_cnf_blocks += countCompactBlocks(oldest_gen->compact_objects); + + uint32_t total_cnf_blocks = 0; + total_cnf_blocks += n_nonmoving_compact_blocks + oldest_gen->n_compact_blocks; + total_cnf_blocks += n_nonmoving_marked_compact_blocks; + + ASSERT(counted_cnf_blocks == total_cnf_blocks); + } + checkHeapChain(gen->blocks); for (n = 0; n < n_capabilities; n++) { @@ -824,6 +890,15 @@ markCompactBlocks(bdescr *bd) } } +static void +markNonMovingSegments(struct NonmovingSegment *seg) +{ + while (seg) { + markBlocks(Bdescr((P_)seg)); + seg = seg->link; + } +} + // If memInventory() calculates that we have a memory leak, this // function will try to find the block(s) that are leaking by marking // all the ones that we know about, and search through memory to find @@ -834,7 +909,7 @@ markCompactBlocks(bdescr *bd) static void findMemoryLeak (void) { - uint32_t g, i; + uint32_t g, i, j; for (g = 0; g < RtsFlags.GcFlags.generations; g++) { for (i = 0; i < n_capabilities; i++) { markBlocks(capabilities[i]->mut_lists[g]); @@ -854,6 +929,27 @@ findMemoryLeak (void) for (i = 0; i < n_capabilities; i++) { markBlocks(gc_threads[i]->free_blocks); markBlocks(capabilities[i]->pinned_object_block); + markBlocks(capabilities[i]->upd_rem_set.queue.blocks); + } + + if (RtsFlags.GcFlags.useNonmoving) { + markBlocks(upd_rem_set_block_list); + markBlocks(nonmoving_large_objects); + markBlocks(nonmoving_marked_large_objects); + markBlocks(nonmoving_compact_objects); + markBlocks(nonmoving_marked_compact_objects); + for (i = 0; i < NONMOVING_ALLOCA_CNT; i++) { + struct NonmovingAllocator *alloc = nonmovingHeap.allocators[i]; + markNonMovingSegments(alloc->filled); + markNonMovingSegments(alloc->active); + for (j = 0; j < n_capabilities; j++) { + markNonMovingSegments(alloc->current[j]); + } + } + markNonMovingSegments(nonmovingHeap.sweep_list); + markNonMovingSegments(nonmovingHeap.free); + if (current_mark_queue) + markBlocks(current_mark_queue->blocks); } #if defined(PROFILING) @@ -914,14 +1010,65 @@ void findSlop(bdescr *bd) static W_ genBlocks (generation *gen) { - ASSERT(countBlocks(gen->blocks) == gen->n_blocks); + W_ ret = 0; + if (RtsFlags.GcFlags.useNonmoving && gen == oldest_gen) { + // See Note [Live data accounting in nonmoving collector]. 
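+ // Under this scheme oldest_gen->n_blocks counts only the blocks held in + // nonmoving segments; large objects, compact objects and the mark queue + // are accounted for separately below.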
+ ASSERT(countNonMovingHeap(&nonmovingHeap) == gen->n_blocks); + ret += countAllocdBlocks(nonmoving_large_objects); + ret += countAllocdBlocks(nonmoving_marked_large_objects); + ret += countAllocdCompactBlocks(nonmoving_compact_objects); + ret += countAllocdCompactBlocks(nonmoving_marked_compact_objects); + ret += countNonMovingHeap(&nonmovingHeap); + if (current_mark_queue) + ret += countBlocks(current_mark_queue->blocks); + } else { + ASSERT(countBlocks(gen->blocks) == gen->n_blocks); + ASSERT(countCompactBlocks(gen->compact_objects) == gen->n_compact_blocks); + ASSERT(countCompactBlocks(gen->compact_blocks_in_import) == gen->n_compact_blocks_in_import); + ret += gen->n_blocks; + } + ASSERT(countBlocks(gen->large_objects) == gen->n_large_blocks); - ASSERT(countCompactBlocks(gen->compact_objects) == gen->n_compact_blocks); - ASSERT(countCompactBlocks(gen->compact_blocks_in_import) == gen->n_compact_blocks_in_import); - return gen->n_blocks + gen->n_old_blocks + + + ret += gen->n_old_blocks + countAllocdBlocks(gen->large_objects) + countAllocdCompactBlocks(gen->compact_objects) + countAllocdCompactBlocks(gen->compact_blocks_in_import); + return ret; +} + +static W_ +countNonMovingSegments(struct NonmovingSegment *segs) +{ + W_ ret = 0; + while (segs) { + ret += countBlocks(Bdescr((P_)segs)); + segs = segs->link; + } + return ret; +} + +static W_ +countNonMovingAllocator(struct NonmovingAllocator *alloc) +{ + W_ ret = countNonMovingSegments(alloc->filled) + + countNonMovingSegments(alloc->active); + for (uint32_t i = 0; i < n_capabilities; ++i) { + ret += countNonMovingSegments(alloc->current[i]); + } + return ret; +} + +static W_ +countNonMovingHeap(struct NonmovingHeap *heap) +{ + W_ ret = 0; + for (int alloc_idx = 0; alloc_idx < NONMOVING_ALLOCA_CNT; alloc_idx++) { + ret += countNonMovingAllocator(heap->allocators[alloc_idx]); + } + ret += countNonMovingSegments(heap->sweep_list); + ret += countNonMovingSegments(heap->free); + return ret; } void @@ -929,11 +1076,20 @@ memInventory (bool show) { uint32_t g, i; W_ gen_blocks[RtsFlags.GcFlags.generations]; - W_ nursery_blocks, retainer_blocks, - arena_blocks, exec_blocks, gc_free_blocks = 0; + W_ nursery_blocks = 0, retainer_blocks = 0, + arena_blocks = 0, exec_blocks = 0, gc_free_blocks = 0, + upd_rem_set_blocks = 0; W_ live_blocks = 0, free_blocks = 0; bool leak; +#if defined(THREADED_RTS) + // Can't easily do a memory inventory: We might race with the nonmoving + // collector. In principle we could try to take nonmoving_collection_mutex + // and do an inventory if we have it but we don't currently implement this. 
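+ // One possible shape for such a guarded inventory (a sketch only; it assumes
+ // a trylock-style primitive for Mutex, which is not used here today):
+ //
+ //     if (RtsFlags.GcFlags.useNonmoving) {
+ //         if (TRY_ACQUIRE_LOCK(&nonmoving_collection_mutex) != 0)
+ //             return;               // collector is running; skip the inventory
+ //     }
+ //     // ... count blocks as below, then ...
+ //     if (RtsFlags.GcFlags.useNonmoving)
+ //         RELEASE_LOCK(&nonmoving_collection_mutex);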
+ if (RtsFlags.GcFlags.useNonmoving) + return; +#endif + // count the blocks we current have for (g = 0; g < RtsFlags.GcFlags.generations; g++) { @@ -947,20 +1103,19 @@ memInventory (bool show) gen_blocks[g] += genBlocks(&generations[g]); } - nursery_blocks = 0; for (i = 0; i < n_nurseries; i++) { ASSERT(countBlocks(nurseries[i].blocks) == nurseries[i].n_blocks); nursery_blocks += nurseries[i].n_blocks; } for (i = 0; i < n_capabilities; i++) { - gc_free_blocks += countBlocks(gc_threads[i]->free_blocks); + W_ n = countBlocks(gc_threads[i]->free_blocks); + gc_free_blocks += n; if (capabilities[i]->pinned_object_block != NULL) { nursery_blocks += capabilities[i]->pinned_object_block->blocks; } nursery_blocks += countBlocks(capabilities[i]->pinned_object_blocks); } - retainer_blocks = 0; #if defined(PROFILING) if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_RETAINER) { retainer_blocks = retainerStackBlocks(); @@ -976,12 +1131,19 @@ memInventory (bool show) /* count the blocks on the free list */ free_blocks = countFreeList(); + // count UpdRemSet blocks + for (i = 0; i < n_capabilities; ++i) { + upd_rem_set_blocks += countBlocks(capabilities[i]->upd_rem_set.queue.blocks); + } + upd_rem_set_blocks += countBlocks(upd_rem_set_block_list); + live_blocks = 0; for (g = 0; g < RtsFlags.GcFlags.generations; g++) { live_blocks += gen_blocks[g]; } live_blocks += nursery_blocks + - + retainer_blocks + arena_blocks + exec_blocks + gc_free_blocks; + + retainer_blocks + arena_blocks + exec_blocks + gc_free_blocks + + upd_rem_set_blocks; #define MB(n) (((double)(n) * BLOCK_SIZE_W) / ((1024*1024)/sizeof(W_))) @@ -1010,6 +1172,8 @@ memInventory (bool show) gc_free_blocks, MB(gc_free_blocks)); debugBelch(" free : %5" FMT_Word " blocks (%6.1lf MB)\n", free_blocks, MB(free_blocks)); + debugBelch(" UpdRemSet : %5" FMT_Word " blocks (%6.1lf MB)\n", + upd_rem_set_blocks, MB(upd_rem_set_blocks)); debugBelch(" total : %5" FMT_Word " blocks (%6.1lf MB)\n", live_blocks + free_blocks, MB(live_blocks+free_blocks)); if (leak) { diff --git a/rts/sm/Sanity.h b/rts/sm/Sanity.h index 9227e6fd181b19c9d913394fc8c09a1034b85287..b6f2054383177985a3f08a0342f980f183e5dc34 100644 --- a/rts/sm/Sanity.h +++ b/rts/sm/Sanity.h @@ -31,6 +31,7 @@ void checkStaticObjects ( StgClosure* static_objects ); void checkStackChunk ( StgPtr sp, StgPtr stack_end ); StgOffset checkStackFrame ( StgPtr sp ); StgOffset checkClosure ( const StgClosure* p ); +void checkNonmovingHeap ( const struct NonmovingHeap *heap ); void checkRunQueue (Capability *cap); diff --git a/rts/sm/Scav.c b/rts/sm/Scav.c index c486cd96c5867e79926dfbb2702732cdd8918b44..501d958aaef6d54c4c850ead0902060e8adffcc8 100644 --- a/rts/sm/Scav.c +++ b/rts/sm/Scav.c @@ -62,8 +62,8 @@ #include "Hash.h" #include "sm/MarkWeak.h" - -static void scavenge_stack (StgPtr p, StgPtr stack_end); +#include "sm/NonMoving.h" // for nonmoving_set_closure_mark_bit +#include "sm/NonMovingScav.h" static void scavenge_large_bitmap (StgPtr p, StgLargeBitmap *large_bitmap, @@ -76,6 +76,15 @@ static void scavenge_large_bitmap (StgPtr p, # define scavenge_block(a) scavenge_block1(a) # define scavenge_mutable_list(bd,g) scavenge_mutable_list1(bd,g) # define scavenge_capability_mut_lists(cap) scavenge_capability_mut_Lists1(cap) +# define scavengeTSO(tso) scavengeTSO1(tso) +# define scavenge_stack(p, stack_end) scavenge_stack1(p, stack_end) +# define scavenge_fun_srt(info) scavenge_fun_srt1(info) +# define scavenge_fun_srt(info) scavenge_fun_srt1(info) +# define scavenge_thunk_srt(info) scavenge_thunk_srt1(info) +# 
define scavenge_mut_arr_ptrs(info) scavenge_mut_arr_ptrs1(info) +# define scavenge_PAP(pap) scavenge_PAP1(pap) +# define scavenge_AP(ap) scavenge_AP1(ap) +# define scavenge_compact(str) scavenge_compact1(str) #endif static void do_evacuate(StgClosure **p, void *user STG_UNUSED) @@ -87,7 +96,7 @@ static void do_evacuate(StgClosure **p, void *user STG_UNUSED) Scavenge a TSO. -------------------------------------------------------------------------- */ -static void +void scavengeTSO (StgTSO *tso) { bool saved_eager; @@ -165,7 +174,10 @@ evacuate_hash_entry(MapHashData *dat, StgWord key, const void *value) SET_GCT(old_gct); } -static void +/* Here we scavenge the sharing-preservation hash-table, which may contain keys + * living in from-space. + */ +void scavenge_compact(StgCompactNFData *str) { bool saved_eager; @@ -198,7 +210,7 @@ scavenge_compact(StgCompactNFData *str) Mutable arrays of pointers -------------------------------------------------------------------------- */ -static StgPtr scavenge_mut_arr_ptrs (StgMutArrPtrs *a) +StgPtr scavenge_mut_arr_ptrs (StgMutArrPtrs *a) { W_ m; bool any_failed; @@ -348,14 +360,14 @@ scavenge_PAP_payload (StgClosure *fun, StgClosure **payload, StgWord size) return p; } -STATIC_INLINE GNUC_ATTR_HOT StgPtr +GNUC_ATTR_HOT StgPtr scavenge_PAP (StgPAP *pap) { evacuate(&pap->fun); return scavenge_PAP_payload (pap->fun, pap->payload, pap->n_args); } -STATIC_INLINE StgPtr +StgPtr scavenge_AP (StgAP *ap) { evacuate(&ap->fun); @@ -366,7 +378,7 @@ scavenge_AP (StgAP *ap) Scavenge SRTs -------------------------------------------------------------------------- */ -STATIC_INLINE GNUC_ATTR_HOT void +GNUC_ATTR_HOT void scavenge_thunk_srt(const StgInfoTable *info) { StgThunkInfoTable *thunk_info; @@ -380,7 +392,7 @@ scavenge_thunk_srt(const StgInfoTable *info) } } -STATIC_INLINE GNUC_ATTR_HOT void +GNUC_ATTR_HOT void scavenge_fun_srt(const StgInfoTable *info) { StgFunInfoTable *fun_info; @@ -1570,10 +1582,10 @@ static void scavenge_mutable_list(bdescr *bd, generation *gen) { StgPtr p, q; - uint32_t gen_no; - gen_no = gen->no; + uint32_t gen_no = gen->no; gct->evac_gen_no = gen_no; + for (; bd != NULL; bd = bd->link) { for (q = bd->start; q < bd->free; q++) { p = (StgPtr)*q; @@ -1648,7 +1660,10 @@ scavenge_mutable_list(bdescr *bd, generation *gen) ; } - if (scavenge_one(p)) { + if (RtsFlags.GcFlags.useNonmoving && major_gc && gen == oldest_gen) { + // We can't use scavenge_one here as we need to scavenge SRTs + nonmovingScavengeOne((StgClosure *)p); + } else if (scavenge_one(p)) { // didn't manage to promote everything, so put the // object back on the list. recordMutableGen_GC((StgClosure *)p,gen_no); @@ -1660,7 +1675,14 @@ scavenge_mutable_list(bdescr *bd, generation *gen) void scavenge_capability_mut_lists (Capability *cap) { - uint32_t g; + // In a major GC only nonmoving heap's mut list is root + if (RtsFlags.GcFlags.useNonmoving && major_gc) { + uint32_t g = oldest_gen->no; + scavenge_mutable_list(cap->saved_mut_lists[g], oldest_gen); + freeChain_sync(cap->saved_mut_lists[g]); + cap->saved_mut_lists[g] = NULL; + return; + } /* Mutable lists from each generation > N * we want to *scavenge* these roots, not evacuate them: they're not @@ -1668,7 +1690,7 @@ scavenge_capability_mut_lists (Capability *cap) * Also do them in reverse generation order, for the usual reason: * namely to reduce the likelihood of spurious old->new pointers. 
*/ - for (g = RtsFlags.GcFlags.generations-1; g > N; g--) { + for (uint32_t g = RtsFlags.GcFlags.generations-1; g > N; g--) { scavenge_mutable_list(cap->saved_mut_lists[g], &generations[g]); freeChain_sync(cap->saved_mut_lists[g]); cap->saved_mut_lists[g] = NULL; @@ -1795,7 +1817,7 @@ scavenge_large_bitmap( StgPtr p, StgLargeBitmap *large_bitmap, StgWord size ) AP_STACK_UPDs, since these are just sections of copied stack. -------------------------------------------------------------------------- */ -static void +void scavenge_stack(StgPtr p, StgPtr stack_end) { const StgRetInfoTable* info; @@ -2038,6 +2060,16 @@ loop: for (g = RtsFlags.GcFlags.generations-1; g >= 0; g--) { ws = &gct->gens[g]; + if (ws->todo_seg != END_NONMOVING_TODO_LIST) { + struct NonmovingSegment *seg = ws->todo_seg; + ASSERT(seg->todo_link); + ws->todo_seg = seg->todo_link; + seg->todo_link = NULL; + scavengeNonmovingSegment(seg); + did_something = true; + break; + } + gct->scan_bd = NULL; // If we have a scan block with some work to do, diff --git a/rts/sm/Scav.h b/rts/sm/Scav.h index 21ca691bff04d9d38d98aff47b3b7e0888dfe3d3..94250bcf7a1bca1f50d0bff699d34a6269cd1a85 100644 --- a/rts/sm/Scav.h +++ b/rts/sm/Scav.h @@ -17,10 +17,26 @@ void scavenge_loop (void); void scavenge_capability_mut_lists (Capability *cap); +void scavengeTSO (StgTSO *tso); +void scavenge_stack (StgPtr p, StgPtr stack_end); +void scavenge_fun_srt (const StgInfoTable *info); +void scavenge_thunk_srt (const StgInfoTable *info); +StgPtr scavenge_mut_arr_ptrs (StgMutArrPtrs *a); +StgPtr scavenge_PAP (StgPAP *pap); +StgPtr scavenge_AP (StgAP *ap); +void scavenge_compact (StgCompactNFData *str); #if defined(THREADED_RTS) void scavenge_loop1 (void); void scavenge_capability_mut_Lists1 (Capability *cap); +void scavengeTSO1 (StgTSO *tso); +void scavenge_stack1 (StgPtr p, StgPtr stack_end); +void scavenge_fun_srt1 (const StgInfoTable *info); +void scavenge_thunk_srt1 (const StgInfoTable *info); +StgPtr scavenge_mut_arr_ptrs1 (StgMutArrPtrs *a); +StgPtr scavenge_PAP1 (StgPAP *pap); +StgPtr scavenge_AP1 (StgAP *ap); +void scavenge_compact1 (StgCompactNFData *str); #endif #include "EndPrivate.h" diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c index 0130a08f7c4219596f1ed050bd5ffa8a7c07e705..f04b3c5929195a11e352d6d5efb58d351d31b696 100644 --- a/rts/sm/Storage.c +++ b/rts/sm/Storage.c @@ -29,6 +29,7 @@ #include "Trace.h" #include "GC.h" #include "Evac.h" +#include "NonMoving.h" #if defined(ios_HOST_OS) #include "Hash.h" #endif @@ -82,7 +83,7 @@ Mutex sm_mutex; static void allocNurseries (uint32_t from, uint32_t to); static void assignNurseriesToCapabilities (uint32_t from, uint32_t to); -static void +void initGeneration (generation *gen, int g) { gen->no = g; @@ -170,6 +171,18 @@ initStorage (void) } oldest_gen->to = oldest_gen; + // Nonmoving heap uses oldest_gen so initialize it after initializing oldest_gen + nonmovingInit(); + +#if defined(THREADED_RTS) + // nonmovingAddCapabilities allocates segments, which requires taking the gc + // sync lock, so initialize it before nonmovingAddCapabilities + initSpinLock(&gc_alloc_block_sync); +#endif + + if (RtsFlags.GcFlags.useNonmoving) + nonmovingAddCapabilities(n_capabilities); + /* The oldest generation has one step. 
*/ if (RtsFlags.GcFlags.compact || RtsFlags.GcFlags.sweep) { if (RtsFlags.GcFlags.generations == 1) { @@ -195,9 +208,6 @@ initStorage (void) exec_block = NULL; -#if defined(THREADED_RTS) - initSpinLock(&gc_alloc_block_sync); -#endif N = 0; for (n = 0; n < n_numa_nodes; n++) { @@ -271,6 +281,14 @@ void storageAddCapabilities (uint32_t from, uint32_t to) } } + // Initialize NonmovingAllocators and UpdRemSets + if (RtsFlags.GcFlags.useNonmoving) { + nonmovingAddCapabilities(to); + for (i = 0; i < to; ++i) { + init_upd_rem_set(&capabilities[i]->upd_rem_set); + } + } + #if defined(THREADED_RTS) && defined(CC_LLVM_BACKEND) && (CC_SUPPORTS_TLS == 0) newThreadLocalKey(&gctKey); #endif @@ -282,6 +300,7 @@ void storageAddCapabilities (uint32_t from, uint32_t to) void exitStorage (void) { + nonmovingExit(); updateNurseriesStats(); stat_exit(); } @@ -302,7 +321,8 @@ freeStorage (bool free_heap) } /* ----------------------------------------------------------------------------- - Note [CAF management]. + Note [CAF management] + ~~~~~~~~~~~~~~~~~~~~~ The entry code for every CAF does the following: @@ -337,6 +357,7 @@ freeStorage (bool free_heap) ------------------ Note [atomic CAF entry] + ~~~~~~~~~~~~~~~~~~~~~~~ With THREADED_RTS, newCAF() is required to be atomic (see #5558). This is because if two threads happened to enter the same @@ -350,6 +371,7 @@ freeStorage (bool free_heap) ------------------ Note [GHCi CAFs] + ~~~~~~~~~~~~~~~~ For GHCI, we have additional requirements when dealing with CAFs: @@ -369,6 +391,51 @@ freeStorage (bool free_heap) -- SDM 29/1/01 + ------------------ + Note [Static objects under the nonmoving collector] + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + Static object management is a bit tricky under the nonmoving collector as we + need to maintain a bit more state than in the moving collector. In + particular, the moving collector uses the low bits of the STATIC_LINK field + to determine whether the object has been moved to the scavenger's work list + (see Note [STATIC_LINK fields] in Storage.h). + + However, the nonmoving collector also needs a place to keep its mark bit. + This is problematic as we therefore need at least three bits of state + but can assume only two bits are available in STATIC_LINK (due to 32-bit + systems). + + To accommodate this we move handling of static objects entirely to the + oldest generation when the nonmoving collector is in use. To do this safely + and efficiently we allocate the blackhole created by lockCAF() directly in + the non-moving heap. This means that the moving collector can completely + ignore static objects in minor collections since they are guaranteed not to + have any references into the moving heap. Of course, the blackhole itself + likely will contain a reference into the moving heap but this is + significantly easier to handle, being a heap-allocated object (see Note + [Aging under the non-moving collector] in NonMoving.c for details). + + During the moving phase of a major collection we treat static objects + as we do any other reference into the non-moving heap by pushing them + to the non-moving mark queue (see Note [Aging under the non-moving + collector]). + + This allows the non-moving collector to have full control over the flags + in STATIC_LINK, which it uses as described in Note [STATIC_LINK fields]. + This is implemented by NonMovingMark.c:bump_static_flag. + + In short, the plan is: + + - lockCAF allocates its blackhole in the nonmoving heap.
This is important + to ensure that we do not need to place the static object on the mut_list + lest we would need some way to ensure that it is evacuated only once during + a moving collection. + + - evacuate_static_object merely pushes objects to the mark queue + + - the nonmoving collector uses the flags in STATIC_LINK as its mark bit. + -------------------------------------------------------------------------- */ STATIC_INLINE StgInd * @@ -402,11 +469,36 @@ lockCAF (StgRegTable *reg, StgIndStatic *caf) // successfully claimed by us; overwrite with IND_STATIC #endif + // Push stuff that will become unreachable after updating to the UpdRemSet + // to maintain the snapshot invariant + const StgInfoTable *orig_info_tbl = INFO_PTR_TO_STRUCT(orig_info); + // OSA: Assertions to make sure my understanding of static thunks is correct + ASSERT(orig_info_tbl->type == THUNK_STATIC); + // Secondly I think static thunks can't have payload: anything that they + // reference should be in SRTs + ASSERT(orig_info_tbl->layout.payload.ptrs == 0); + // Because the payload is empty we just push the SRT + IF_NONMOVING_WRITE_BARRIER_ENABLED { + StgThunkInfoTable *thunk_info = itbl_to_thunk_itbl(orig_info_tbl); + if (thunk_info->i.srt) { + updateRemembSetPushClosure(cap, GET_SRT(thunk_info)); + } + } + // For the benefit of revertCAFs(), save the original info pointer caf->saved_info = orig_info; // Allocate the blackhole indirection closure - bh = (StgInd *)allocate(cap, sizeofW(*bh)); + if (RtsFlags.GcFlags.useNonmoving) { + // See Note [Static objects under the nonmoving collector]. + ACQUIRE_SM_LOCK; + bh = (StgInd *)nonmovingAllocate(cap, sizeofW(*bh)); + RELEASE_SM_LOCK; + recordMutableCap((StgClosure*)bh, + regTableToCapability(reg), oldest_gen->no); + } else { + bh = (StgInd *)allocate(cap, sizeofW(*bh)); + } bh->indirectee = (StgClosure *)cap->r.rCurrentTSO; SET_HDR(bh, &stg_CAF_BLACKHOLE_info, caf->header.prof.ccs); // Ensure that above writes are visible before we introduce reference as CAF indirectee. @@ -448,7 +540,9 @@ newCAF(StgRegTable *reg, StgIndStatic *caf) else { // Put this CAF on the mutable list for the old generation. - if (oldest_gen->no != 0) { + // N.B. the nonmoving collector works a bit differently: see + // Note [Static objects under the nonmoving collector]. + if (oldest_gen->no != 0 && !RtsFlags.GcFlags.useNonmoving) { recordMutableCap((StgClosure*)caf, regTableToCapability(reg), oldest_gen->no); } @@ -525,7 +619,9 @@ StgInd* newGCdCAF (StgRegTable *reg, StgIndStatic *caf) if (!bh) return NULL; // Put this CAF on the mutable list for the old generation. - if (oldest_gen->no != 0) { + // N.B. the nonmoving collector works a bit differently: + // see Note [Static objects under the nonmoving collector]. + if (oldest_gen->no != 0 && !RtsFlags.GcFlags.useNonmoving) { recordMutableCap((StgClosure*)caf, regTableToCapability(reg), oldest_gen->no); } @@ -1073,6 +1169,27 @@ allocatePinned (Capability *cap, W_ n) Write Barriers -------------------------------------------------------------------------- */ +/* These write barriers on heavily mutated objects serve two purposes: + * + * - Efficient maintenance of the generational invariant: Record whether or not + * we have added a particular mutable object to mut_list as they may contain + * references to younger generations. + * + * - Maintenance of the nonmoving collector's snapshot invariant: Record objects + * which are about to become unreachable due to mutation.
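+ *
+ * For instance, overwriting a pointer field of a mutable object follows
+ * this pattern (a sketch only; obj_is_clean and mark_dirty stand in for the
+ * object-specific dirtiness check and mut_list bookkeeping):
+ *
+ *     if (obj_is_clean(obj)) {                         // first write since the snapshot
+ *         IF_NONMOVING_WRITE_BARRIER_ENABLED {
+ *             updateRemembSetPushClosure(cap, *field); // save the overwritten pointer
+ *         }
+ *         mark_dirty(cap, obj);                        // add to the capability's mut_list
+ *     }
+ *     *field = new_val;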
+ * + * In each case we record whether the object has been added to the mutable list + * by way of either the info pointer or a dedicated "dirty" flag. The GC will + * clear this flag and remove the object from mut_list (or rather, not re-add it) + * if it finds the object contains no references into any younger generation. + * + * Note that all dirty objects will be marked as clean during preparation for a + * concurrent collection. Consequently, we can use the dirtiness flag to determine + * whether or not we need to add overwritten pointers to the update remembered + * set (since we need only record the value prior to the first update to maintain + * the snapshot invariant). + */ + /* This is the write barrier for MUT_VARs, a.k.a. IORefs. A MUT_VAR_CLEAN object is not on the mutable list; a MUT_VAR_DIRTY @@ -1080,25 +1197,39 @@ allocatePinned (Capability *cap, W_ n) and is put on the mutable list. */ void -dirty_MUT_VAR(StgRegTable *reg, StgClosure *p) +dirty_MUT_VAR(StgRegTable *reg, StgMutVar *mvar, StgClosure *old) { Capability *cap = regTableToCapability(reg); // No barrier required here as no other heap object fields are read. See // note [Heap memory barriers] in SMP.h. - if (p->header.info == &stg_MUT_VAR_CLEAN_info) { - p->header.info = &stg_MUT_VAR_DIRTY_info; - recordClosureMutated(cap,p); + if (mvar->header.info == &stg_MUT_VAR_CLEAN_info) { + mvar->header.info = &stg_MUT_VAR_DIRTY_info; + recordClosureMutated(cap, (StgClosure *) mvar); + IF_NONMOVING_WRITE_BARRIER_ENABLED { + updateRemembSetPushClosure_(reg, old); + } } } +/* + * This is the write barrier for TVARs. + * old is the pointer that we overwrote, which is required by the concurrent + * garbage collector. Note that, while StgTVars contain multiple pointers, we + * only overwrite one per dirty_TVAR call, so we only need to take one old + * pointer argument. + */ void -dirty_TVAR(Capability *cap, StgTVar *p) +dirty_TVAR(Capability *cap, StgTVar *p, + StgClosure *old) { // No barrier required here as no other heap object fields are read. See // note [Heap memory barriers] in SMP.h. if (p->header.info == &stg_TVAR_CLEAN_info) { p->header.info = &stg_TVAR_DIRTY_info; recordClosureMutated(cap,(StgClosure*)p); + IF_NONMOVING_WRITE_BARRIER_ENABLED { + updateRemembSetPushClosure(cap, old); + } } } @@ -1113,6 +1244,9 @@ setTSOLink (Capability *cap, StgTSO *tso, StgTSO *target) if (tso->dirty == 0) { tso->dirty = 1; recordClosureMutated(cap,(StgClosure*)tso); + IF_NONMOVING_WRITE_BARRIER_ENABLED { + updateRemembSetPushClosure(cap, (StgClosure *) tso->_link); + } } tso->_link = target; } @@ -1123,6 +1257,9 @@ setTSOPrev (Capability *cap, StgTSO *tso, StgTSO *target) if (tso->dirty == 0) { tso->dirty = 1; recordClosureMutated(cap,(StgClosure*)tso); + IF_NONMOVING_WRITE_BARRIER_ENABLED { + updateRemembSetPushClosure(cap, (StgClosure *) tso->block_info.prev); + } } tso->block_info.prev = target; } @@ -1134,15 +1271,49 @@ dirty_TSO (Capability *cap, StgTSO *tso) tso->dirty = 1; recordClosureMutated(cap,(StgClosure*)tso); } + + IF_NONMOVING_WRITE_BARRIER_ENABLED { + updateRemembSetPushTSO(cap, tso); + } } void dirty_STACK (Capability *cap, StgStack *stack) { - if (stack->dirty == 0) { - stack->dirty = 1; + // First push to upd_rem_set before we set stack->dirty since the + // nonmoving collector may already be marking the stack. + IF_NONMOVING_WRITE_BARRIER_ENABLED { + updateRemembSetPushStack(cap, stack); + } + + if (!
(stack->dirty & STACK_DIRTY)) { + stack->dirty = STACK_DIRTY; recordClosureMutated(cap,(StgClosure*)stack); } + +} + +/* + * This is the concurrent collector's write barrier for MVARs. In the other + * write barriers above this is folded into the dirty_* functions. However, in + * the case of MVars we need to separate the acts of adding the MVar to the + * mutable list and adding its fields to the update remembered set. + * + * Specifically, the wakeup loop in stg_putMVarzh wants to freely mutate the + * pointers of the MVar but needs to keep its lock, meaning we can't yet add it + * to the mutable list lest the assertion checking for clean MVars on the + * mutable list would fail. + */ +void +update_MVAR(StgRegTable *reg, StgClosure *p, StgClosure *old_val) +{ + Capability *cap = regTableToCapability(reg); + IF_NONMOVING_WRITE_BARRIER_ENABLED { + StgMVar *mvar = (StgMVar *) p; + updateRemembSetPushClosure(cap, old_val); + updateRemembSetPushClosure(cap, (StgClosure *) mvar->head); + updateRemembSetPushClosure(cap, (StgClosure *) mvar->tail); + } } /* @@ -1154,9 +1325,11 @@ dirty_STACK (Capability *cap, StgStack *stack) such as Chaneneos and cheap-concurrency. */ void -dirty_MVAR(StgRegTable *reg, StgClosure *p) +dirty_MVAR(StgRegTable *reg, StgClosure *p, StgClosure *old_val) { - recordClosureMutated(regTableToCapability(reg),p); + Capability *cap = regTableToCapability(reg); + update_MVAR(reg, p, old_val); + recordClosureMutated(cap, p); } /* ----------------------------------------------------------------------------- @@ -1232,8 +1405,8 @@ W_ countOccupied (bdescr *bd) W_ genLiveWords (generation *gen) { - return gen->n_words + gen->n_large_words + - gen->n_compact_blocks * BLOCK_SIZE_W; + return (gen->live_estimate ? gen->live_estimate : gen->n_words) + + gen->n_large_words + gen->n_compact_blocks * BLOCK_SIZE_W; } W_ genLiveBlocks (generation *gen) @@ -1289,9 +1462,9 @@ calcNeeded (bool force_major, memcount *blocks_needed) for (uint32_t g = 0; g < RtsFlags.GcFlags.generations; g++) { generation *gen = &generations[g]; - W_ blocks = gen->n_blocks // or: gen->n_words / BLOCK_SIZE_W (?) - + gen->n_large_blocks - + gen->n_compact_blocks; + W_ blocks = gen->live_estimate ? (gen->live_estimate / BLOCK_SIZE_W) : gen->n_blocks; + blocks += gen->n_large_blocks + + gen->n_compact_blocks; // we need at least this much space needed += blocks; @@ -1309,7 +1482,7 @@ calcNeeded (bool force_major, memcount *blocks_needed) // mark stack: needed += gen->n_blocks / 100; } - if (gen->compact) { + if (gen->compact || (RtsFlags.GcFlags.useNonmoving && gen == oldest_gen)) { continue; // no additional space needed for compaction } else { needed += gen->n_blocks; @@ -1408,6 +1581,7 @@ void flushExec (W_ len, AdjustorExecutable exec_addr) __clear_cache((void*)begin, (void*)end); # endif #elif defined(__GNUC__) + /* For all other platforms, fall back to a libgcc builtin. 
*/ unsigned char* begin = (unsigned char*)exec_addr; unsigned char* end = begin + len; # if GCC_HAS_BUILTIN_CLEAR_CACHE diff --git a/rts/sm/Storage.h b/rts/sm/Storage.h index aaa44428b3dfa66f408615f1e2350b00650b6c00..cdb9720650f79ba62ff0387430719076a87f217c 100644 --- a/rts/sm/Storage.h +++ b/rts/sm/Storage.h @@ -17,6 +17,7 @@ -------------------------------------------------------------------------- */ void initStorage(void); +void initGeneration(generation *gen, int g); void exitStorage(void); void freeStorage(bool free_heap); @@ -46,8 +47,9 @@ extern Mutex sm_mutex; The write barrier for MVARs and TVARs -------------------------------------------------------------------------- */ -void dirty_MVAR(StgRegTable *reg, StgClosure *p); -void dirty_TVAR(Capability *cap, StgTVar *p); +void update_MVAR(StgRegTable *reg, StgClosure *p, StgClosure *old_val); +void dirty_MVAR(StgRegTable *reg, StgClosure *p, StgClosure *old); +void dirty_TVAR(Capability *cap, StgTVar *p, StgClosure *old); /* ----------------------------------------------------------------------------- Nursery manipulation diff --git a/testsuite/config/ghc b/testsuite/config/ghc index bc888d166103571ec3d874283213a2f23a2fc7b6..9a3459ea96fa7459f1f42056c36ac15f12b0647b 100644 --- a/testsuite/config/ghc +++ b/testsuite/config/ghc @@ -26,7 +26,10 @@ config.other_ways = ['prof', 'normal_h', 'profllvm', 'profoptllvm', 'profthreadedllvm', 'debug', 'ghci-ext', 'ghci-ext-prof', - 'ext-interp'] + 'ext-interp', + 'nonmoving', + 'nonmoving_thr', + 'nonmoving_thr_ghc'] if ghc_with_native_codegen: config.compile_ways.append('optasm') @@ -96,7 +99,10 @@ config.way_flags = { 'profthreadedllvm' : ['-O', '-prof', '-static', '-fprof-auto', '-threaded', '-fllvm'], 'ghci-ext' : ['--interactive', '-v0', '-ignore-dot-ghci', '-fno-ghci-history', '-fexternal-interpreter', '+RTS', '-I0.1', '-RTS'], 'ghci-ext-prof' : ['--interactive', '-v0', '-ignore-dot-ghci', '-fno-ghci-history', '-fexternal-interpreter', '-prof', '+RTS', '-I0.1', '-RTS'], - 'ext-interp' : ['-fexternal-interpreter'], + 'ext-interp' : ['-fexternal-interpreter'], + 'nonmoving' : [], + 'nonmoving_thr': ['-threaded'], + 'nonmoving_thr_ghc': ['+RTS', '-xn', '-N2', '-RTS', '-threaded'], } config.way_rts_flags = { @@ -135,6 +141,9 @@ config.way_rts_flags = { 'ghci-ext' : [], 'ghci-ext-prof' : [], 'ext-interp' : [], + 'nonmoving' : ['-xn'], + 'nonmoving_thr' : ['-xn', '-N2'], + 'nonmoving_thr_ghc': ['-xn', '-N2'], } # Useful classes of ways that can be used with only_ways(), omit_ways() and diff --git a/testsuite/tests/codeGen/should_run/all.T b/testsuite/tests/codeGen/should_run/all.T index 0882f2b6050cf5395c5f50d10a3a096b8303df64..f96820de81fa4a4c8d87c1c713c67b67d89599ed 100644 --- a/testsuite/tests/codeGen/should_run/all.T +++ b/testsuite/tests/codeGen/should_run/all.T @@ -1,5 +1,6 @@ # Test +RTS -G1 here (it isn't tested anywhere else) -setTestOpts(unless(fast(), extra_ways(['g1']))) +# N.B. 
Nonmoving collector doesn't support -G1 +setTestOpts(unless(fast(), [ extra_ways(['g1']), omit_ways(['nonmoving', 'nonmoving_thr', 'nonmoving_thr_ghc'])])) test('cgrun001', normal, compile_and_run, ['']) test('cgrun002', normal, compile_and_run, ['']) @@ -194,9 +195,11 @@ test('T15696_3', normal, compile_and_run, ['-O']) test('T15892', [ ignore_stdout, - # we want to do lots of major GC to make the bug more likely to - # happen, so -G1 -A32k: - extra_run_opts('+RTS -G1 -A32k -RTS') ], + # -G1 is unsupported by the nonmoving GC + omit_ways(['nonmoving', 'nonmoving_thr', 'nonmoving_thr_ghc']), + # we want to do lots of major GC to make the bug more likely to + # happen, so -G1 -A32k: + extra_run_opts('+RTS -G1 -A32k -RTS') ], compile_and_run, ['-O']) test('T16617', normal, compile_and_run, ['']) test('T16449_2', exit_code(0), compile_and_run, ['']) diff --git a/testsuite/tests/concurrent/should_run/all.T b/testsuite/tests/concurrent/should_run/all.T index 467040223f5611cffd9d7d6abd4f817d0ca7d13d..9297c5890e365607cce9a8e8456f8e322bcbef99 100644 --- a/testsuite/tests/concurrent/should_run/all.T +++ b/testsuite/tests/concurrent/should_run/all.T @@ -7,7 +7,7 @@ test('conc027', normal, compile_and_run, ['']) test('conc051', normal, compile_and_run, ['']) if ('threaded1' in config.run_ways): - only_threaded_ways = only_ways(['ghci','threaded1','threaded2']) + only_threaded_ways = only_ways(['ghci','threaded1','threaded2', 'nonmoving_thr']) else: only_threaded_ways = skip @@ -203,8 +203,8 @@ test('foreignInterruptible', [when(fast(), skip), ], compile_and_run, ['']) -test('conc037', only_ways(['threaded1','threaded2']), compile_and_run, ['']) -test('conc038', only_ways(['threaded1','threaded2']), compile_and_run, ['']) +test('conc037', only_ways(['threaded1', 'threaded2', 'nonmoving_thr']), compile_and_run, ['']) +test('conc038', only_ways(['threaded1', 'threaded2', 'nonmoving_thr']), compile_and_run, ['']) # Omit for GHCi, uses foreign export # Omit for the threaded ways, because in this case the main thread is allowed to @@ -224,7 +224,7 @@ test('conc045', normal, compile_and_run, ['']) test('conc058', normal, compile_and_run, ['']) test('conc059', - [only_ways(['threaded1', 'threaded2']), + [only_ways(['threaded1', 'threaded2', 'nonmoving_thr']), pre_cmd('$MAKE -s --no-print-directory conc059_setup')], compile_and_run, ['conc059_c.c -no-hs-main']) @@ -243,7 +243,7 @@ test('conc067', ignore_stdout, compile_and_run, ['']) test('conc068', [ omit_ways(concurrent_ways), exit_code(1) ], compile_and_run, ['']) test('setnumcapabilities001', - [ only_ways(['threaded1','threaded2']), + [ only_ways(['threaded1','threaded2', 'nonmoving_thr']), extra_run_opts('8 12 2000'), req_smp ], compile_and_run, ['']) @@ -254,7 +254,7 @@ test('compareAndSwap', [omit_ways(['ghci','hpc']), reqlib('primitive')], compile test('hs_try_putmvar001', [ when(opsys('mingw32'),skip), # uses pthread APIs in the C code - only_ways(['threaded1','threaded2']), + only_ways(['threaded1', 'threaded2', 'nonmoving_thr']), extra_clean(['hs_try_putmvar001_c.o'])], compile_and_run, ['hs_try_putmvar001_c.c']) @@ -272,7 +272,7 @@ test('hs_try_putmvar003', [ when(opsys('mingw32'),skip), # uses pthread APIs in the C code pre_cmd('$MAKE -s --no-print-directory hs_try_putmvar003_setup'), - only_ways(['threaded1','threaded2']), + only_ways(['threaded1', 'threaded2', 'nonmoving_thr']), extra_clean(['hs_try_putmvar003_c.o']), extra_run_opts('1 16 32 100'), fragile_for(16361, ['threaded1']) diff --git a/testsuite/tests/perf/compiler/all.T 
b/testsuite/tests/perf/compiler/all.T index afac3752fa2deb765b41bb47956c83540c6e8935..6b80e193d1f2a72181b2dc5a64f29525a793e4fb 100644 --- a/testsuite/tests/perf/compiler/all.T +++ b/testsuite/tests/perf/compiler/all.T @@ -385,7 +385,9 @@ test ('T9630', extra_clean(['T9630a.hi', 'T9630a.o']), # Use `+RTS -G1` for more stable residency measurements. Note [residency]. - extra_hc_opts('+RTS -G1 -RTS') + extra_hc_opts('+RTS -G1 -RTS'), + # The nonmoving collector does not support -G1 + omit_ways(['nonmoving', 'nonmoving_thr', 'nonmoving_thr_ghc']) ], multimod_compile, ['T9630', '-v0 -O']) diff --git a/testsuite/tests/rts/all.T b/testsuite/tests/rts/all.T index 9e20ba0b81e216e032c6d680e6b629cd0c9dc5d1..36f63c571ead24c7092b5837a5bb8f87f0003e3d 100644 --- a/testsuite/tests/rts/all.T +++ b/testsuite/tests/rts/all.T @@ -12,7 +12,10 @@ test('testmblockalloc', # See bug #101, test requires +RTS -c (or equivalently +RTS -M<something>) # only GHCi triggers the bug, but we run the test all ways for completeness. -test('bug1010', normal, compile_and_run, ['+RTS -c -RTS']) +test('bug1010', + # Non-moving GC doesn't support -c + omit_ways(['nonmoving', 'nonmoving_thr', 'nonmoving_thr_ghc']), + compile_and_run, ['+RTS -c -RTS']) def normalise_address(str): return re.sub('Access violation in generated code when reading [0]+', @@ -67,8 +70,12 @@ test('outofmem', when(opsys('darwin'), skip), makefile_test, ['outofmem']) test('outofmem2', normal, makefile_test, ['outofmem2']) -test('T2047', [ignore_stdout, extra_run_opts('+RTS -c -RTS')], - compile_and_run, ['-package containers']) +test('T2047', + [ignore_stdout, + extra_run_opts('+RTS -c -RTS'), + # Non-moving collector doesn't support -c + omit_ways(['nonmoving', 'nonmoving_thr', 'nonmoving_thr_ghc'])], + compile_and_run, ['-package containers']) # Blackhole-detection test. # Skip GHCi due to #2786 @@ -183,7 +190,7 @@ test('T6006', [ omit_ways(prof_ways + ['ghci']), test('T7037', [], makefile_test, ['T7037']) test('T7087', exit_code(1), compile_and_run, ['']) -test('T7160', normal, compile_and_run, ['']) +test('T7160', omit_ways(['nonmoving_thr', 'nonmoving_thr_ghc']), compile_and_run, ['']) test('T7040', [omit_ways(['ghci'])], compile_and_run, ['T7040_c.c']) diff --git a/testsuite/tests/rts/testblockalloc.c b/testsuite/tests/rts/testblockalloc.c index 577245f45e2739698bebb2f0c6a32bee3c26335d..53eed240156922fd8c32a2ec22ef3eb7d3b0d341 100644 --- a/testsuite/tests/rts/testblockalloc.c +++ b/testsuite/tests/rts/testblockalloc.c @@ -3,6 +3,7 @@ #include <stdio.h> extern bdescr *allocGroup_lock(uint32_t n); +extern bdescr *allocAlignedGroupOnNode (uint32_t node, W_ n); extern void freeGroup_lock(bdescr *p); const int ARRSIZE = 256; @@ -13,64 +14,110 @@ const int SEED = 0xf00f00; extern StgWord mblocks_allocated; -int main (int argc, char *argv[]) +static void test_random_alloc(void) { - int i, j, b; - bdescr *a[ARRSIZE]; - srand(SEED); + bdescr *a[ARRSIZE]; + + // repeatedly sweep through the array, allocating new random-sized + // objects and deallocating the old ones.
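+ // (there is nothing to free on the first pass, hence the i > 0 guard below)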
+ for (int i=0; i < LOOPS; i++) + { + for (int j=0; j < ARRSIZE; j++) + { + if (i > 0) + { + IF_DEBUG(block_alloc, debugBelch("A%d: freeing %p, %d blocks @ %p\n", j, a[j], a[j]->blocks, a[j]->start)); + freeGroup_lock(a[j]); + DEBUG_ONLY(checkFreeListSanity()); + } + + int b = (rand() % MAXALLOC) + 1; + a[j] = allocGroup_lock(b); + IF_DEBUG(block_alloc, debugBelch("A%d: allocated %p, %d blocks @ %p\n", j, a[j], b, a[j]->start)); + // allocating zero blocks isn't allowed + DEBUG_ONLY(checkFreeListSanity()); + } + } + for (int j=0; j < ARRSIZE; j++) { - RtsConfig conf = defaultRtsConfig; - conf.rts_opts_enabled = RtsOptsAll; - hs_init_ghc(&argc, &argv, conf); + freeGroup_lock(a[j]); } +} + +static void test_sequential_alloc(void) +{ + bdescr *a[ARRSIZE]; - // repeatedly sweep though the array, allocating new random-sized - // objects and deallocating the old ones. - for (i=0; i < LOOPS; i++) - { - for (j=0; j < ARRSIZE; j++) - { - if (i > 0) - { - IF_DEBUG(block_alloc, debugBelch("A%d: freeing %p, %d blocks @ %p\n", j, a[j], a[j]->blocks, a[j]->start)); - freeGroup_lock(a[j]); - DEBUG_ONLY(checkFreeListSanity()); - } - b = (rand() % MAXALLOC) + 1; - a[j] = allocGroup_lock(b); - IF_DEBUG(block_alloc, debugBelch("A%d: allocated %p, %d blocks @ %p\n", j, a[j], b, a[j]->start)); - // allocating zero blocks isn't allowed - DEBUG_ONLY(checkFreeListSanity()); - } - } - - for (j=0; j < ARRSIZE; j++) - { - freeGroup_lock(a[j]); - } - // this time, sweep forwards allocating new blocks, and then // backwards deallocating them. - for (i=0; i < LOOPS; i++) + for (int i=0; i < LOOPS; i++) { - for (j=0; j < ARRSIZE; j++) + for (int j=0; j < ARRSIZE; j++) { - b = (rand() % MAXALLOC) + 1; + int b = (rand() % MAXALLOC) + 1; a[j] = allocGroup_lock(b); IF_DEBUG(block_alloc, debugBelch("B%d,%d: allocated %p, %d blocks @ %p\n", i, j, a[j], b, a[j]->start)); DEBUG_ONLY(checkFreeListSanity()); } - for (j=ARRSIZE-1; j >= 0; j--) + for (int j=ARRSIZE-1; j >= 0; j--) { IF_DEBUG(block_alloc, debugBelch("B%d,%d: freeing %p, %d blocks @ %p\n", i, j, a[j], a[j]->blocks, a[j]->start)); freeGroup_lock(a[j]); DEBUG_ONLY(checkFreeListSanity()); } } - +} + +static void test_aligned_alloc(void) +{ + bdescr *a[ARRSIZE]; + + // this time, sweep forwards allocating new blocks, and then + // backwards deallocating them. + for (int i=0; i < LOOPS; i++) + { + for (int j=0; j < ARRSIZE; j++) + { + // allocAlignedGroupOnNode does not support allocating more than + // BLOCKS_PER_MBLOCK/2 blocks. 
+ int b = rand() % (BLOCKS_PER_MBLOCK / 2); + if (b == 0) { b = 1; } + a[j] = allocAlignedGroupOnNode(0, b); + if ((((W_)(a[j]->start)) % (b*BLOCK_SIZE)) != 0) + { + barf("%p is not aligned to allocation size %d", a[j], b); + } + IF_DEBUG(block_alloc, debugBelch("B%d,%d: allocated %p, %d blocks @ %p\n", i, j, a[j], b, a[j]->start)); + DEBUG_ONLY(checkFreeListSanity()); + } + for (int j=ARRSIZE-1; j >= 0; j--) + { + IF_DEBUG(block_alloc, debugBelch("B%d,%d: freeing %p, %d blocks @ %p\n", i, j, a[j], a[j]->blocks, a[j]->start)); + freeGroup_lock(a[j]); + DEBUG_ONLY(checkFreeListSanity()); + } + } +} + +int main (int argc, char *argv[]) +{ + int i, j, b; + + bdescr *a[ARRSIZE]; + + srand(SEED); + + { + RtsConfig conf = defaultRtsConfig; + conf.rts_opts_enabled = RtsOptsAll; + hs_init_ghc(&argc, &argv, conf); + } + + test_random_alloc(); + test_sequential_alloc(); + test_aligned_alloc(); + DEBUG_ONLY(checkFreeListSanity()); hs_exit(); // will do a memory leak test diff --git a/utils/deriveConstants/Main.hs b/utils/deriveConstants/Main.hs index 54533254dd1adfe502bd64b07619696c91320ec0..f6f590715ba73c46313744aa597250d6e67e0fe6 100644 --- a/utils/deriveConstants/Main.hs +++ b/utils/deriveConstants/Main.hs @@ -307,6 +307,9 @@ wanteds os = concat "sizeofW(StgHeader) - sizeofW(StgProfHeader)" ,constantWord Both "PROF_HDR_SIZE" "sizeofW(StgProfHeader)" + -- Stack flags for C-- + ,constantWord C "STACK_DIRTY" "STACK_DIRTY" + -- Size of a storage manager block (in bytes). ,constantWord Both "BLOCK_SIZE" "BLOCK_SIZE" ,constantWord C "MBLOCK_SIZE" "MBLOCK_SIZE"
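With the new ways registered above, individual tests can be exercised under the concurrent collector in the usual manner; for example, assuming the standard testsuite driver conventions, running `make TEST=conc059 WAY=nonmoving_thr` from testsuite/tests compiles the test with -threaded and runs it with the `+RTS -xn -N2` flags configured in testsuite/config/ghc.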