Commit 9e5ea67e authored by Simon Marlow

NUMA support

Summary:
The aim here is to reduce the number of remote memory accesses on
systems with a NUMA memory architecture, typically multi-socket servers.

Linux provides a NUMA API for doing two things:
* Allocating memory local to a particular node
* Binding a thread to a particular node

When given the +RTS --numa flag, the runtime will
* Determine the number of NUMA nodes (N) by querying the OS
* Assign capabilities to nodes, so cap C is on node C%N
* Bind worker threads on a capability to the correct node
* Keep separate free lists in the block layer for each node
* Allocate the nursery for a capability from node-local memory
* Allocate blocks in the GC from node-local memory

For example, using nofib/parallel/queens on a 24-core 2-socket machine:

```
$ ./Main 15 +RTS -N24 -s -A64m
  Total   time  173.960s  (  7.467s elapsed)

$ ./Main 15 +RTS -N24 -s -A64m --numa
  Total   time  150.836s  (  6.423s elapsed)
```

The biggest win here is expected to be allocating from node-local
memory, so that means programs using a large -A value (as here).

According to perf, on this program the number of remote memory accesses
was reduced by more than 50% by using `--numa`.

Test Plan:
* validate
* There's a new flag --debug-numa=<n> that pretends to do NUMA without
  actually making the OS calls, which is useful for testing the code
  on non-NUMA systems.
* TODO: I need to add some unit tests

Reviewers: erikd, austin, rwbarton, ezyang, bgamari, hvr, niteria

Subscribers: thomie

Differential Revision: https://phabricator.haskell.org/D2199
parent b9fa72a2
......@@ -1103,6 +1103,13 @@ if test $UseLibdw = "YES" ; then
fi
AC_DEFINE_UNQUOTED([USE_LIBDW], [$USE_LIBDW], [Set to 1 to use libdw])
dnl ** Have libnuma?
dnl --------------------------------------------------------------
dnl Probe for the libnuma headers and library (NUMA support in the RTS).
dnl HAVE_LIBNUMA is defined only when numa_available() can be linked
dnl from -lnuma; nothing is done when the library is missing.
AC_CHECK_HEADERS([numa.h numaif.h])
AC_CHECK_LIB(numa, numa_available,
[AC_DEFINE([HAVE_LIBNUMA], [1], [Define to 1 if you have libnuma.])],
[])
dnl ** Documentation
dnl --------------------------------------------------------------
if test -n "$SPHINXBUILD"; then
......
......@@ -643,6 +643,56 @@ performance.
``-F`` parameter will be reduced in order to avoid exceeding the
maximum heap size.
.. rts-flag:: --numa
--numa=<mask>
.. index::
single: NUMA, enabling in the runtime
Enable NUMA-aware memory allocation in the runtime (only available
with ``-threaded``, and only on Linux currently).
Background: some systems have a Non-Uniform Memory Architecture,
whereby main memory is split into banks which are "local" to
specific CPU cores. Accessing local memory is faster than
accessing remote memory. The OS provides APIs for allocating
local memory and binding threads to particular CPU cores, so that
we can ensure certain memory accesses are using local memory.
The ``--numa`` option tells the RTS to tune its memory usage to
maximize local memory accesses. In particular, the RTS will:
- Determine the number of NUMA nodes (N) by querying the OS.
- Manage separate memory pools for each node.
- Map capabilities to NUMA nodes. Capability C is mapped to
NUMA node C mod N.
- Bind worker threads on a capability to the appropriate node.
- Allocate the nursery from node-local memory.
- Perform other memory allocation, including in the GC, from
node-local memory.
- When load-balancing, prefer to migrate threads to another
Capability on the same node.
The ``--numa`` flag is typically beneficial when a program is
using all cores of a large multi-core NUMA system, with a large
allocation area (``-A``). All memory accesses to the allocation
area will go to local memory, which can save a significant amount
of remote memory access. A runtime speedup on the order of 10%
is typical, but can vary a lot depending on the hardware and the
memory behaviour of the program.
Note that the RTS will not set CPU affinity for bound threads and
threads entering Haskell from C/C++, so if your program uses bound
threads you should ensure that each bound thread calls the RTS API
``rts_setInCallCapability(c,1)`` from C/C++ before calling into
Haskell. Otherwise there could be a mismatch between the CPU that
the thread is running on and the memory it is using while running
Haskell code, which will negate any benefits of ``--numa``.
If given an explicit <mask>, the <mask> is interpreted as a bitmap
that indicates the NUMA nodes on which to run the program. For
example, ``--numa=3`` would run the program on NUMA nodes 0 and 1.
.. _rts-options-statistics:
RTS options to produce runtime statistics
......
......@@ -325,7 +325,6 @@
#include "DerivedConstants.h"
#include "rts/storage/ClosureTypes.h"
#include "rts/storage/FunTypes.h"
#include "rts/storage/SMPClosureOps.h"
#include "rts/OSThreads.h"
/*
......
......@@ -203,7 +203,6 @@ INLINE_HEADER Time fsecondsToTime (double t)
#include "rts/storage/ClosureTypes.h"
#include "rts/storage/TSO.h"
#include "stg/MiscClosures.h" /* InfoTables, closures etc. defined in the RTS */
#include "rts/storage/SMPClosureOps.h"
#include "rts/storage/Block.h"
#include "rts/storage/ClosureMacros.h"
#include "rts/storage/MBlock.h"
......
......@@ -179,7 +179,11 @@ Capability *rts_unsafeGetMyCapability (void);
// Note that the thread may still be migrated by the RTS scheduler, but that
// will only happen if there are multiple threads running on one Capability and
// another Capability is free.
void setInCallCapability (int preferred_capability);
//
// If affinity is non-zero, the current thread will be bound to
// specific CPUs according to the prevailing affinity policy for the
// specified capability, set by either +RTS -qa or +RTS --numa.
void rts_setInCallCapability (int preferred_capability, int affinity);
/* ----------------------------------------------------------------------------
Building Haskell objects from C datatypes.
......
......@@ -295,4 +295,10 @@
#define MAX_SPARE_WORKERS 6
/*
* The maximum number of NUMA nodes we support. This is a fixed limit so that
* we can have static arrays of this size in the RTS for speed.
*
* The +RTS --numa and --debug-numa option parsers report an error when
* asked to use more nodes than this.
*/
#define MAX_NUMA_NODES 16
#endif /* RTS_CONSTANTS_H */
......@@ -73,6 +73,11 @@ typedef struct _GC_FLAGS {
* to handle the exception before we
* raise it again.
*/
rtsBool numa; /* Use NUMA */
uint32_t nNumaNodes; /* Number of nodes */
uint32_t numaMap[MAX_NUMA_NODES]; /* Map our internal node numbers to OS
* node numbers */
} GC_FLAGS;
/* See Note [Synchronization of flags and base APIs] */
......@@ -93,6 +98,7 @@ typedef struct _DEBUG_FLAGS {
rtsBool squeeze; /* 'z' stack squeezing & lazy blackholing */
rtsBool hpc; /* 'c' coverage */
rtsBool sparks; /* 'r' */
rtsBool numa; /* '--debug-numa' */
} DEBUG_FLAGS;
/* See Note [Synchronization of flags and base APIs] */
......@@ -184,7 +190,7 @@ typedef struct _MISC_FLAGS {
#ifdef THREADED_RTS
/* See Note [Synchronization of flags and base APIs] */
typedef struct _PAR_FLAGS {
uint32_t nNodes; /* number of threads to run simultaneously */
uint32_t nCapabilities; /* number of threads to run simultaneously */
rtsBool migrate; /* migrate threads between capabilities */
uint32_t maxLocalSparks;
rtsBool parGcEnabled; /* enable parallel GC */
......
......@@ -200,7 +200,9 @@ void setThreadLocalVar (ThreadLocalKey *key, void *value);
void freeThreadLocalKey (ThreadLocalKey *key);
// Processors and affinity
void setThreadAffinity (uint32_t n, uint32_t m);
void setThreadAffinity (uint32_t n, uint32_t m);
void setThreadNode (uint32_t node);
void releaseThreadNode (void);
#endif // !CMINUSMINUS
#else
......
......@@ -58,7 +58,9 @@ pid_t forkProcess (HsStablePtr *entry)
HsBool rtsSupportsBoundThreads (void);
// The number of Capabilities
// The number of Capabilities.
// ToDo: I would like this to be private to the RTS and instead expose a
// function getNumCapabilities(), but it is used in compiler/cbits/genSym.c
extern unsigned int n_capabilities;
// The number of Capabilities that are not disabled
......
......@@ -111,7 +111,7 @@ typedef struct bdescr_ {
StgWord16 gen_no; // gen->no, cached
StgWord16 dest_no; // number of destination generation
StgWord16 _pad1;
StgWord16 node; // which memory node does this block live on?
StgWord16 flags; // block flags, see below
......@@ -280,12 +280,28 @@ extern void initBlockAllocator(void);
/* Allocation -------------------------------------------------------------- */
bdescr *allocGroup(W_ n);
bdescr *allocBlock(void);
/* Allocate a single block: shorthand for allocGroup(1). */
EXTERN_INLINE bdescr* allocBlock(void);
EXTERN_INLINE bdescr* allocBlock(void)
{
    bdescr *single = allocGroup(1);
    return single;
}
bdescr *allocGroupOnNode(uint32_t node, W_ n);
/* Allocate a single block on the given node: shorthand for
 * allocGroupOnNode(node, 1). */
EXTERN_INLINE bdescr* allocBlockOnNode(uint32_t node);
EXTERN_INLINE bdescr* allocBlockOnNode(uint32_t node)
{
    bdescr *single = allocGroupOnNode(node, 1);
    return single;
}
// versions that take the storage manager lock for you:
bdescr *allocGroup_lock(W_ n);
bdescr *allocBlock_lock(void);
bdescr *allocGroupOnNode_lock(uint32_t node, W_ n);
bdescr *allocBlockOnNode_lock(uint32_t node);
/* De-Allocation ----------------------------------------------------------- */
void freeGroup(bdescr *p);
......
......@@ -18,6 +18,8 @@ extern W_ mblocks_allocated;
extern void initMBlocks(void);
extern void * getMBlock(void);
extern void * getMBlocks(uint32_t n);
extern void * getMBlockOnNode(uint32_t node);
extern void * getMBlocksOnNode(uint32_t node, uint32_t n);
extern void freeMBlocks(void *addr, uint32_t n);
extern void releaseFreeMemory(void);
extern void freeAllMBlocks(void);
......
......@@ -51,7 +51,7 @@ Capability **capabilities = NULL;
// an in-call has a chance of quickly finding a free Capability.
// Maintaining a global free list of Capabilities would require global
// locking, so we don't do that.
static Capability *last_free_capability = NULL;
static Capability *last_free_capability[MAX_NUMA_NODES];
/*
* Indicates that the RTS wants to synchronise all the Capabilities
......@@ -230,11 +230,12 @@ popReturningTask (Capability *cap)
* ------------------------------------------------------------------------- */
static void
initCapability( Capability *cap, uint32_t i )
initCapability (Capability *cap, uint32_t i)
{
uint32_t g;
cap->no = i;
cap->node = capNoToNumaNode(i);
cap->in_haskell = rtsFalse;
cap->idle = 0;
cap->disabled = rtsFalse;
......@@ -316,9 +317,10 @@ initCapability( Capability *cap, uint32_t i )
* controlled by the user via the RTS flag -N.
*
* ------------------------------------------------------------------------- */
void
initCapabilities( void )
void initCapabilities (void)
{
uint32_t i;
/* Declare a couple capability sets representing the process and
clock domain. Each capability will get added to these capsets. */
traceCapsetCreate(CAPSET_OSPROCESS_DEFAULT, CapsetTypeOsProcess);
......@@ -328,21 +330,22 @@ initCapabilities( void )
#ifndef REG_Base
// We can't support multiple CPUs if BaseReg is not a register
if (RtsFlags.ParFlags.nNodes > 1) {
if (RtsFlags.ParFlags.nCapabilities > 1) {
errorBelch("warning: multiple CPUs not supported in this build, reverting to 1");
RtsFlags.ParFlags.nNodes = 1;
RtsFlags.ParFlags.nCapabilities = 1;
}
#endif
n_capabilities = 0;
moreCapabilities(0, RtsFlags.ParFlags.nNodes);
n_capabilities = RtsFlags.ParFlags.nNodes;
moreCapabilities(0, RtsFlags.ParFlags.nCapabilities);
n_capabilities = RtsFlags.ParFlags.nCapabilities;
#else /* !THREADED_RTS */
n_capabilities = 1;
capabilities = stgMallocBytes(sizeof(Capability*), "initCapabilities");
capabilities[0] = &MainCapability;
initCapability(&MainCapability, 0);
#endif
......@@ -352,7 +355,9 @@ initCapabilities( void )
// There are no free capabilities to begin with. We will start
// a worker Task to each Capability, which will quickly put the
// Capability on the free list when it finds nothing to do.
last_free_capability = capabilities[0];
for (i = 0; i < RtsFlags.GcFlags.nNumaNodes; i++) {
last_free_capability[i] = capabilities[0];
}
}
void
......@@ -532,7 +537,7 @@ releaseCapability_ (Capability* cap,
#ifdef PROFILING
cap->r.rCCCS = CCS_IDLE;
#endif
last_free_capability = cap;
last_free_capability[cap->node] = cap;
debugTrace(DEBUG_sched, "freeing capability %d", cap->no);
}
......@@ -711,6 +716,7 @@ void waitForCapability (Capability **pCap, Task *task)
*pCap = &MainCapability;
#else
uint32_t i;
Capability *cap = *pCap;
if (cap == NULL) {
......@@ -719,12 +725,14 @@ void waitForCapability (Capability **pCap, Task *task)
enabled_capabilities];
} else {
// Try last_free_capability first
cap = last_free_capability;
cap = last_free_capability[task->node];
if (cap->running_task) {
uint32_t i;
// otherwise, search for a free capability
// Otherwise, search for a free capability on this node.
cap = NULL;
for (i = 0; i < n_capabilities; i++) {
for (i = task->node; i < enabled_capabilities;
i += RtsFlags.GcFlags.nNumaNodes) {
// visits all the capabilities on this node, because
// cap[i]->node == i % RtsFlags.GcFlags.nNumaNodes
if (!capabilities[i]->running_task) {
cap = capabilities[i];
break;
......@@ -732,7 +740,7 @@ void waitForCapability (Capability **pCap, Task *task)
}
if (cap == NULL) {
// Can't find a free one, use last_free_capability.
cap = last_free_capability;
cap = last_free_capability[task->node];
}
}
}
......
......@@ -36,6 +36,15 @@ struct Capability_ {
uint32_t no; // capability number.
// The NUMA node on which this capability resides. This is used to allocate
// node-local memory in allocate().
//
// Note: this is always equal to cap->no % RtsFlags.GcFlags.nNumaNodes.
// The reason we slice it this way is that if we add or remove capabilities
// via setNumCapabilities(), then we keep the number of capabilities on each
// NUMA node balanced.
uint32_t node;
// The Task currently holding this Capability. This task has
// exclusive access to the contents of this Capability (apart from
// returning_tasks_hd/returning_tasks_tl).
......@@ -151,6 +160,8 @@ struct Capability_ {
;
// Map a capability number to a NUMA node number: capability n is placed on
// node n % RtsFlags.GcFlags.nNumaNodes.  The argument is evaluated exactly
// once.  nNumaNodes must be non-zero, otherwise the modulo is undefined.
#define capNoToNumaNode(n) ((n) % RtsFlags.GcFlags.nNumaNodes)
#if defined(THREADED_RTS)
#define ASSERT_TASK_ID(task) ASSERT(task->id == osThreadId())
#else
......@@ -221,7 +232,6 @@ INLINE_HEADER void releaseCapability_ (Capability* cap STG_UNUSED,
// extern uint32_t enabled_capabilities;
// Array of all the capabilities
//
extern Capability **capabilities;
//
......@@ -364,7 +374,7 @@ recordMutableCap (const StgClosure *p, Capability *cap, uint32_t gen)
bd = cap->mut_lists[gen];
if (bd->free >= bd->start + BLOCK_SIZE_W) {
bdescr *new_bd;
new_bd = allocBlock_lock();
new_bd = allocBlockOnNode_lock(cap->node);
new_bd->link = bd;
bd = new_bd;
cap->mut_lists[gen] = bd;
......
......@@ -12,6 +12,7 @@
#include "Cmm.h"
#include "Updates.h"
#include "SMPClosureOps.h"
#ifdef __PIC__
import pthread_mutex_unlock;
......
......@@ -7,3 +7,4 @@
#include "Schedule.h"
#include "Capability.h"
#include "WSDeque.h"
#include "SMPClosureOps.h"
......@@ -18,6 +18,7 @@ void sendMessage (Capability *from_cap, Capability *to_cap, Message *msg);
#include "Capability.h"
#include "Updates.h" // for DEBUG_FILL_SLOP
#include "SMPClosureOps.h"
INLINE_HEADER void
doneWithMsgThrowTo (MessageThrowTo *m)
......
......@@ -23,6 +23,7 @@
#include "Cmm.h"
#include "MachDeps.h"
#include "SMPClosureOps.h"
#ifdef __PIC__
import pthread_mutex_lock;
......
......@@ -9,6 +9,7 @@
#include "PosixSource.h"
#include "Rts.h"
#include "Capability.h"
#include "RtsFlags.h"
#include "RtsUtils.h"
#include "Profiling.h"
......
......@@ -15,6 +15,7 @@
#include "RtsFlags.h"
#include "sm/OSMem.h"
#include "hooks/Hooks.h"
#include "Capability.h"
#ifdef HAVE_CTYPE_H
#include <ctype.h>
......@@ -122,6 +123,7 @@ static void errorRtsOptsDisabled (const char *s);
void initRtsFlagsDefaults(void)
{
uint32_t i;
StgWord64 maxStkSize = 8 * getPhysicalMemorySize() / 10;
// if getPhysicalMemorySize fails just move along with an 8MB limit
if (maxStkSize == 0)
......@@ -157,8 +159,12 @@ void initRtsFlagsDefaults(void)
#endif
RtsFlags.GcFlags.heapBase = 0; /* means don't care */
RtsFlags.GcFlags.allocLimitGrace = (100*1024) / BLOCK_SIZE;
RtsFlags.GcFlags.numa = rtsFalse;
RtsFlags.GcFlags.nNumaNodes = 1;
for (i = 0; i < MAX_NUMA_NODES; i++) {
RtsFlags.GcFlags.numaMap[i] = 0;
}
#ifdef DEBUG
RtsFlags.DebugFlags.scheduler = rtsFalse;
RtsFlags.DebugFlags.interpreter = rtsFalse;
RtsFlags.DebugFlags.weak = rtsFalse;
......@@ -174,7 +180,7 @@ void initRtsFlagsDefaults(void)
RtsFlags.DebugFlags.squeeze = rtsFalse;
RtsFlags.DebugFlags.hpc = rtsFalse;
RtsFlags.DebugFlags.sparks = rtsFalse;
#endif
RtsFlags.DebugFlags.numa = rtsFalse;
#if defined(PROFILING)
RtsFlags.CcFlags.doCostCentres = 0;
......@@ -220,7 +226,7 @@ void initRtsFlagsDefaults(void)
RtsFlags.MiscFlags.linkerMemBase = 0;
#ifdef THREADED_RTS
RtsFlags.ParFlags.nNodes = 1;
RtsFlags.ParFlags.nCapabilities = 1;
RtsFlags.ParFlags.migrate = rtsTrue;
RtsFlags.ParFlags.parGcEnabled = 1;
RtsFlags.ParFlags.parGcGen = 0;
......@@ -398,6 +404,14 @@ usage_text[] = {
" -qi<n> If a processor has been idle for the last <n> GCs, do not",
" wake it up for a non-load-balancing parallel GC.",
" (0 disables, default: 0)",
" --numa[=<node_mask>]",
" Use NUMA, nodes given by <node_mask> (default: off)",
#if defined(DEBUG)
" --debug-numa[=<num_nodes>]",
" Pretend NUMA: like --numa, but without the system calls.",
" Can be used on non-NUMA systems for debugging.",
"",
#endif
#endif
" --install-signal-handlers=<yes|no>",
" Install signal handlers (default: yes)",
......@@ -745,6 +759,76 @@ error = rtsTrue;
printRtsInfo();
stg_exit(0);
}
#if defined(THREADED_RTS)
// --numa[=<mask>]: turn on NUMA support, optionally restricted to the
// nodes whose bits are set in <mask> (parsed as a decimal number).
else if (!strncmp("numa", &rts_argv[arg][2], 4)) {
OPTION_SAFE;
StgWord mask;
if (rts_argv[arg][6] == '=') {
mask = (StgWord)strtol(rts_argv[arg]+7,
(char **) NULL, 10);
} else {
// No explicit mask: start with every bit set; the mask is narrowed
// by osNumaMask() below.
mask = (StgWord)~0;
}
if (!osNumaAvailable()) {
errorBelch("%s: OS reports NUMA is not available",
rts_argv[arg]);
error = rtsTrue;
break;
}
uint32_t nNodes = osNumaNodes();
if (nNodes > MAX_NUMA_NODES) {
errorBelch("%s: Too many NUMA nodes (max %d)",
rts_argv[arg], MAX_NUMA_NODES);
error = rtsTrue;
} else {
RtsFlags.GcFlags.numa = rtsTrue;
// Intersect the requested nodes with osNumaMask() (presumably the set
// of nodes the OS makes available -- its definition is not shown here).
mask = mask & osNumaMask();
// Build the logical -> physical node map: each set bit of mask, lowest
// bit first, is assigned the next logical node number.
uint32_t logical = 0, physical = 0;
for (; physical < MAX_NUMA_NODES; physical++) {
if (mask & 1) {
RtsFlags.GcFlags.numaMap[logical++] = physical;
}
mask = mask >> 1;
}
RtsFlags.GcFlags.nNumaNodes = logical;
// An empty node set is an error: no node was selected by the mask.
if (logical == 0) {
errorBelch("%s: available node set is empty",
rts_argv[arg]);
error = rtsTrue;
}
}
}
#endif
#if defined(DEBUG) && defined(THREADED_RTS)
// --debug-numa=<num_nodes>: pretend to have <num_nodes> NUMA nodes without
// querying the OS.  Logical node i is mapped to physical node i.
// <num_nodes> must be between 1 and MAX_NUMA_NODES inclusive.
else if (!strncmp("debug-numa", &rts_argv[arg][2], 10)) {
    OPTION_SAFE;
    size_t nNodes;
    // The cast matters: passing a negative char to isdigit() is undefined.
    if (rts_argv[arg][12] == '=' &&
        isdigit((unsigned char)rts_argv[arg][13])) {
        nNodes = (StgWord)strtol(rts_argv[arg]+13,
                                 (char **) NULL, 10);
    } else {
        errorBelch("%s: missing number of nodes",
                   rts_argv[arg]);
        error = rtsTrue;
        break;
    }
    if (nNodes == 0) {
        // capNoToNumaNode() computes n % nNumaNodes, so accepting zero
        // nodes here would cause a division by zero later on.
        errorBelch("%s: number of nodes must be at least 1",
                   rts_argv[arg]);
        error = rtsTrue;
    } else if (nNodes > MAX_NUMA_NODES) {
        errorBelch("%s: Too many NUMA nodes (max %d)",
                   rts_argv[arg], MAX_NUMA_NODES);
        error = rtsTrue;
    } else {
        RtsFlags.GcFlags.numa = rtsTrue;
        RtsFlags.DebugFlags.numa = rtsTrue;
        RtsFlags.GcFlags.nNumaNodes = nNodes;
        // Identity mapping: there are no real OS nodes to translate to.
        uint32_t physical = 0;
        for (; physical < MAX_NUMA_NODES; physical++) {
            RtsFlags.GcFlags.numaMap[physical] = physical;
        }
    }
}
#endif
else {
OPTION_SAFE;
errorBelch("unknown RTS option: %s",rts_argv[arg]);
......@@ -856,20 +940,20 @@ error = rtsTrue;
if (strncmp("maxN", &rts_argv[arg][1], 4) == 0) {
OPTION_SAFE;
THREADED_BUILD_ONLY(
int nNodes;
int nCapabilities;
int proc = (int)getNumberOfProcessors();
nNodes = strtol(rts_argv[arg]+5, (char **) NULL, 10);
if (nNodes > proc) { nNodes = proc; }
nCapabilities = strtol(rts_argv[arg]+5, (char **) NULL, 10);
if (nCapabilities > proc) { nCapabilities = proc; }
if (nNodes <= 0) {
if (nCapabilities <= 0) {
errorBelch("bad value for -maxN");
error = rtsTrue;
}
#if defined(PROFILING)
RtsFlags.ParFlags.nNodes = 1;
RtsFlags.ParFlags.nCapabilities = 1;
#else
RtsFlags.ParFlags.nNodes = (uint32_t)nNodes;
RtsFlags.ParFlags.nCapabilities = (uint32_t)nCapabilities;
#endif
) break;
} else {
......@@ -1071,26 +1155,26 @@ error = rtsTrue;
THREADED_BUILD_ONLY(
if (rts_argv[arg][2] == '\0') {
#if defined(PROFILING)
RtsFlags.ParFlags.nNodes = 1;
RtsFlags.ParFlags.nCapabilities = 1;
#else
RtsFlags.ParFlags.nNodes = getNumberOfProcessors();
RtsFlags.ParFlags.nCapabilities = getNumberOfProcessors();
#endif
} else {
int nNodes;
int nCapabilities;
OPTION_SAFE; /* but see extra checks below... */
nNodes = strtol(rts_argv[arg]+2, (char **) NULL, 10);
nCapabilities = strtol(rts_argv[arg]+2, (char **) NULL, 10);
if (nNodes <= 0) {
if (nCapabilities <= 0) {
errorBelch("bad value for -N");
error = rtsTrue;
}
if (rtsOptsEnabled == RtsOptsSafeOnly &&
nNodes > (int)getNumberOfProcessors()) {
nCapabilities > (int)getNumberOfProcessors()) {
errorRtsOptsDisabled("Using large values for -N is not allowed by default. %s");
stg_exit(EXIT_FAILURE);
}
RtsFlags.ParFlags.nNodes = (uint32_t)nNodes;
RtsFlags.ParFlags.nCapabilities = (uint32_t)nCapabilities;
}
) break;
......@@ -1395,7 +1479,7 @@ static void normaliseRtsOpts (void)
}
#ifdef THREADED_RTS
if (RtsFlags.ParFlags.parGcThreads > RtsFlags.ParFlags.nNodes) {
if (RtsFlags.ParFlags.parGcThreads > RtsFlags.ParFlags.nCapabilities) {
errorBelch("GC threads (-qn) must be between 1 and the value of -N");
errorUsage();
}
......
......@@ -9,6 +9,8 @@
#ifndef RTS_STORAGE_SMPCLOSUREOPS_H
#define RTS_STORAGE_SMPCLOSUREOPS_H
#include "BeginPrivate.h"
#ifdef CMINUSMINUS
/* Lock closure, equivalent to ccall lockClosure but the condition is inlined.
......@@ -122,4 +124,6 @@ INLINE_HEADER void unlockTSO(StgTSO *tso)
#endif /* CMINUSMINUS */
#include "EndPrivate.h"
#endif /* RTS_STORAGE_SMPCLOSUREOPS_H */
......@@ -92,6 +92,7 @@
#include "Trace.h"
#include "Threads.h"
#include "sm/Storage.h"
#include "SMPClosureOps.h"
#include <stdio.h>
......
......@@ -726,7 +726,8 @@ schedulePushWork(Capability *cap USED_IF_THREADS,
} while (n_wanted_caps < n_capabilities-1);
}