Commit 498ed266 authored by Simon Marlow

NUMA cleanups

- Move the numaMap and nNumaNodes out of RtsFlags to Capability.c
- Add a test to tests/rts
parent a7f65b87
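
The effect of the cleanup: the RTS flags now record only a node mask (numaMask), while the logical node count (n_numa_nodes) and the logical-to-OS node map (numa_map) are computed once in initCapabilities() and exported from Capability.h. A minimal sketch of the access pattern this gives the rest of the RTS; osNodeForCapability is a hypothetical helper, not part of the patch:

    #include "Capability.h"   /* declares n_numa_nodes, numa_map[] and capNoToNumaNode() */

    /* Hypothetical helper: the OS NUMA node backing a given capability,
     * mirroring the pattern used in the Task.c hunks below. */
    static uint32_t osNodeForCapability (uint32_t cap_no)
    {
        uint32_t logical = capNoToNumaNode(cap_no);   /* cap_no % n_numa_nodes */
        return numa_map[logical];                     /* logical -> OS node number */
    }
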
@@ -73,9 +73,7 @@ typedef struct _GC_FLAGS {
*/
rtsBool numa; /* Use NUMA */
uint32_t nNumaNodes; /* Number of nodes */
uint32_t numaMap[MAX_NUMA_NODES]; /* Map our internal node numbers to OS
* node numbers */
StgWord numaMask;
} GC_FLAGS;
/* See Note [Synchronization of flags and base APIs] */
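
The single numaMask field replaces the explicit node count and node map: bit i set means OS node i may be used, and the actual map is now derived later in initCapabilities() (see the Capability.c hunk below). A small standalone illustration with a made-up mask value:

    #include <stdio.h>

    typedef unsigned long StgWord;        /* stand-in for the RTS type */

    int main(void)
    {
        StgWord numaMask = 0x5;           /* hypothetical value: bits 0 and 2 set */
        for (unsigned node = 0; node < 8; node++) {
            if (numaMask & ((StgWord)1 << node)) {
                printf("OS node %u may be used\n", node);   /* prints nodes 0 and 2 */
            }
        }
        return 0;
    }
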
@@ -114,7 +114,7 @@ data GCFlags = GCFlags
, heapBase :: Word -- ^ address to ask the OS for memory
, allocLimitGrace :: Word
, numa :: Bool
, nNumaNodes :: Word32
, numaMask :: Word
} deriving (Show)
-- | Parameters concerning context switching
@@ -376,7 +376,7 @@ getGCFlags = do
<*> #{peek GC_FLAGS, heapBase} ptr
<*> #{peek GC_FLAGS, allocLimitGrace} ptr
<*> #{peek GC_FLAGS, numa} ptr
<*> #{peek GC_FLAGS, nNumaNodes} ptr
<*> #{peek GC_FLAGS, numaMask} ptr
getParFlags :: IO ParFlags
getParFlags = do
@@ -26,6 +26,7 @@
#include "sm/GC.h" // for gcWorkerThread()
#include "STM.h"
#include "RtsUtils.h"
#include "sm/OSMem.h"
#if !defined(mingw32_HOST_OS)
#include "rts/IOManager.h" // for setIOManagerControlFd()
@@ -59,6 +60,12 @@ static Capability *last_free_capability[MAX_NUMA_NODES];
*/
PendingSync * volatile pending_sync = 0;
// Number of logical NUMA nodes
uint32_t n_numa_nodes;
// Map logical NUMA node to OS node numbers
uint32_t numa_map[MAX_NUMA_NODES];
/* Let foreign code get the current Capability -- assuming there is one!
* This is useful for unsafe foreign calls because they are called with
* the current Capability held, but they are not passed it. For example,
@@ -326,6 +333,31 @@ void initCapabilities (void)
traceCapsetCreate(CAPSET_OSPROCESS_DEFAULT, CapsetTypeOsProcess);
traceCapsetCreate(CAPSET_CLOCKDOMAIN_DEFAULT, CapsetTypeClockdomain);
// Initialise NUMA
if (!RtsFlags.GcFlags.numa) {
n_numa_nodes = 1;
for (i = 0; i < MAX_NUMA_NODES; i++) {
numa_map[i] = 0;
}
} else {
uint32_t nNodes = osNumaNodes();
if (nNodes > MAX_NUMA_NODES) {
barf("Too many NUMA nodes (max %d)", MAX_NUMA_NODES);
}
StgWord mask = RtsFlags.GcFlags.numaMask & osNumaMask();
uint32_t logical = 0, physical = 0;
for (; physical < MAX_NUMA_NODES; physical++) {
if (mask & 1) {
numa_map[logical++] = physical;
}
mask = mask >> 1;
}
n_numa_nodes = logical;
if (logical == 0) {
barf("%s: available NUMA node set is empty");
}
}
#if defined(THREADED_RTS)
#ifndef REG_Base
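
A standalone sketch of what the conversion loop above computes, with a made-up requested mask and a made-up osNumaMask() result; only nodes present in both masks receive logical numbers:

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_NODES 16                       /* stand-in for MAX_NUMA_NODES */

    int main(void)
    {
        uint64_t requested = 0xA;              /* user asked for OS nodes 1 and 3 */
        uint64_t available = 0x7;              /* pretend osNumaMask() reports nodes 0..2 */
        uint64_t mask = requested & available; /* only node 1 survives */

        uint32_t numa_map[MAX_NODES];
        uint32_t logical = 0;
        for (uint32_t physical = 0; physical < MAX_NODES; physical++) {
            if (mask & 1) {
                numa_map[logical++] = physical;   /* logical node -> OS node */
            }
            mask >>= 1;
        }

        printf("n_numa_nodes = %u\n", logical);   /* 1 */
        for (uint32_t i = 0; i < logical; i++) {
            printf("numa_map[%u] = %u\n", i, numa_map[i]);   /* numa_map[0] = 1 */
        }
        return 0;
    }
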
@@ -355,7 +387,7 @@ void initCapabilities (void)
// There are no free capabilities to begin with. We will start
// a worker Task to each Capability, which will quickly put the
// Capability on the free list when it finds nothing to do.
for (i = 0; i < RtsFlags.GcFlags.nNumaNodes; i++) {
for (i = 0; i < n_numa_nodes; i++) {
last_free_capability[i] = capabilities[0];
}
}
@@ -730,9 +762,9 @@ void waitForCapability (Capability **pCap, Task *task)
// Otherwise, search for a free capability on this node.
cap = NULL;
for (i = task->node; i < enabled_capabilities;
i += RtsFlags.GcFlags.nNumaNodes) {
i += n_numa_nodes) {
// visits all the capabilities on this node, because
// cap[i]->node == i % RtsFlags.GcFlags.nNumaNodes
// cap[i]->node == i % n_numa_nodes
if (!capabilities[i]->running_task) {
cap = capabilities[i];
break;
@@ -39,7 +39,7 @@ struct Capability_ {
// The NUMA node on which this capability resides. This is used to allocate
// node-local memory in allocate().
//
// Note: this is always equal to cap->no % RtsFlags.ParFlags.nNumaNodes.
// Note: this is always equal to cap->no % n_numa_nodes.
// The reason we slice it this way is that if we add or remove capabilities
// via setNumCapabilities(), then we keep the number of capabilities on each
// NUMA node balanced.
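
A small worked example of that slicing, with made-up numbers: given 6 capabilities and 2 NUMA nodes, capabilities 0, 2, 4 sit on logical node 0 and 1, 3, 5 on node 1, so changing the capability count via setNumCapabilities() keeps the per-node counts within one of each other. The sketch below just prints that assignment:

    #include <stdio.h>

    int main(void)
    {
        unsigned n_numa_nodes = 2;        /* illustrative values only */
        unsigned n_capabilities = 6;
        for (unsigned no = 0; no < n_capabilities; no++) {
            /* same formula as capNoToNumaNode(no) */
            printf("capability %u -> logical node %u\n", no, no % n_numa_nodes);
        }
        return 0;
    }
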
@@ -159,9 +159,6 @@
#endif
;
#define capNoToNumaNode(n) ((n) % RtsFlags.GcFlags.nNumaNodes)
#if defined(THREADED_RTS)
#define ASSERT_TASK_ID(task) ASSERT(task->id == osThreadId())
#else
@@ -349,6 +346,18 @@ void markCapabilities (evac_fn evac, void *user);
void traverseSparkQueues (evac_fn evac, void *user);
/* -----------------------------------------------------------------------------
NUMA
-------------------------------------------------------------------------- */
/* Number of logical NUMA nodes */
extern uint32_t n_numa_nodes;
/* Map logical NUMA node to OS node numbers */
extern uint32_t numa_map[MAX_NUMA_NODES];
#define capNoToNumaNode(n) ((n) % n_numa_nodes)
/* -----------------------------------------------------------------------------
Messages
-------------------------------------------------------------------------- */
@@ -123,7 +123,6 @@ static void errorRtsOptsDisabled (const char *s);
void initRtsFlagsDefaults(void)
{
uint32_t i;
StgWord64 maxStkSize = 8 * getPhysicalMemorySize() / 10;
// if getPhysicalMemorySize fails just move along with an 8MB limit
if (maxStkSize == 0)
@@ -160,10 +159,7 @@ void initRtsFlagsDefaults(void)
RtsFlags.GcFlags.heapBase = 0; /* means don't care */
RtsFlags.GcFlags.allocLimitGrace = (100*1024) / BLOCK_SIZE;
RtsFlags.GcFlags.numa = rtsFalse;
RtsFlags.GcFlags.nNumaNodes = 1;
for (i = 0; i < MAX_NUMA_NODES; i++) {
RtsFlags.GcFlags.numaMap[i] = 0;
}
RtsFlags.GcFlags.numaMask = 1;
RtsFlags.DebugFlags.scheduler = rtsFalse;
RtsFlags.DebugFlags.interpreter = rtsFalse;
@@ -776,28 +772,8 @@ error = rtsTrue;
break;
}
uint32_t nNodes = osNumaNodes();
if (nNodes > MAX_NUMA_NODES) {
errorBelch("%s: Too many NUMA nodes (max %d)",
rts_argv[arg], MAX_NUMA_NODES);
error = rtsTrue;
} else {
RtsFlags.GcFlags.numa = rtsTrue;
mask = mask & osNumaMask();
uint32_t logical = 0, physical = 0;
for (; physical < MAX_NUMA_NODES; physical++) {
if (mask & 1) {
RtsFlags.GcFlags.numaMap[logical++] = physical;
}
mask = mask >> 1;
}
RtsFlags.GcFlags.nNumaNodes = logical;
if (logical == 0) {
errorBelch("%s: available node set is empty",
rts_argv[arg]);
error = rtsTrue;
}
}
RtsFlags.GcFlags.numa = rtsTrue;
RtsFlags.GcFlags.numaMask = mask;
}
#endif
#if defined(DEBUG) && defined(THREADED_RTS)
@@ -821,11 +797,7 @@ error = rtsTrue;
} else {
RtsFlags.GcFlags.numa = rtsTrue;
RtsFlags.DebugFlags.numa = rtsTrue;
RtsFlags.GcFlags.nNumaNodes = nNodes;
uint32_t physical = 0;
for (; physical < MAX_NUMA_NODES; physical++) {
RtsFlags.GcFlags.numaMap[physical] = physical;
}
RtsFlags.GcFlags.numaMask = (1<<nNodes) - 1;
}
}
#endif
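
Net effect of the two branches above (presumably the --numa and --debug-numa RTS options, though the option names sit outside these hunks): the parser now records only a mask, and validation plus map construction are deferred to initCapabilities(). A sketch of the flag state each branch leaves behind, using stand-in fields:

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-ins for the two fields the parser now writes. */
    static struct { int numa; uint64_t numaMask; } gcFlags;

    int main(void)
    {
        /* Real NUMA: the user-supplied mask is stored verbatim. */
        gcFlags.numa = 1;
        gcFlags.numaMask = 0x6;                          /* e.g. OS nodes 1 and 2 */
        printf("numa mask = 0x%llx\n", (unsigned long long)gcFlags.numaMask);

        /* DEBUG "fake NUMA": a node count n becomes a mask of the low n bits. */
        uint32_t nNodes = 4;
        gcFlags.numaMask = ((uint64_t)1 << nNodes) - 1;  /* 0xf */
        printf("debug mask = 0x%llx\n", (unsigned long long)gcFlags.numaMask);
        return 0;
    }
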
@@ -429,7 +429,7 @@ workerStart(Task *task)
setThreadAffinity(cap->no, n_capabilities);
}
if (RtsFlags.GcFlags.numa && !RtsFlags.DebugFlags.numa) {
setThreadNode(RtsFlags.GcFlags.numaMap[task->node]);
setThreadNode(numa_map[task->node]);
}
// set the thread-local pointer to the Task:
@@ -510,7 +510,7 @@ void rts_setInCallCapability (
if (RtsFlags.GcFlags.numa) {
task->node = capNoToNumaNode(preferred_capability);
if (!DEBUG_IS_ON || !RtsFlags.DebugFlags.numa) { // faking NUMA
setThreadNode(RtsFlags.GcFlags.numaMap[task->node]);
setThreadNode(numa_map[task->node]);
}
}
}
@@ -321,7 +321,6 @@ setThreadAffinity (uint32_t n STG_UNUSED,
#if HAVE_LIBNUMA
void setThreadNode (uint32_t node)
{
ASSERT(node < RtsFlags.GcFlags.nNumaNodes);
if (numa_run_on_node(node) == -1) {
sysErrorBelch("numa_run_on_node");
stg_exit(1);
@@ -467,7 +467,7 @@ uint32_t nodeWithLeastBlocks (void)
{
uint32_t node = 0, i;
uint32_t min_blocks = n_alloc_blocks_by_node[0];
for (i = 1; i < RtsFlags.GcFlags.nNumaNodes; i++) {
for (i = 1; i < n_numa_nodes; i++) {
if (n_alloc_blocks_by_node[i] < min_blocks) {
min_blocks = n_alloc_blocks_by_node[i];
node = i;
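
For context, a toy run of the selection above with made-up per-node block counts: given n_alloc_blocks_by_node = {10, 3, 7}, the loop settles on node 1.

    #include <stdio.h>

    int main(void)
    {
        unsigned n_alloc_blocks_by_node[] = { 10, 3, 7 };   /* made-up counts */
        unsigned n_numa_nodes = 3;

        unsigned node = 0, min_blocks = n_alloc_blocks_by_node[0];
        for (unsigned i = 1; i < n_numa_nodes; i++) {
            if (n_alloc_blocks_by_node[i] < min_blocks) {
                min_blocks = n_alloc_blocks_by_node[i];
                node = i;
            }
        }
        printf("allocate on node %u\n", node);   /* node 1 */
        return 0;
    }
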
@@ -504,7 +504,7 @@ bdescr* allocLargeChunkOnNode (uint32_t node, W_ min, W_ max)
StgWord ln, lnmax;
if (min >= BLOCKS_PER_MBLOCK) {
return allocGroup(max);
return allocGroupOnNode(node,max);
}
ln = log_2_ceil(min);
@@ -811,7 +811,7 @@ void returnMemoryToOS(uint32_t n /* megablocks */)
StgWord size;
// ToDo: not fair, we free all the memory starting with node 0.
for (node = 0; n > 0 && node < RtsFlags.GcFlags.nNumaNodes; node++) {
for (node = 0; n > 0 && node < n_numa_nodes; node++) {
bd = free_mblock_list[node];
while ((n > 0) && (bd != NULL)) {
size = BLOCKS_TO_MBLOCKS(bd->blocks);
@@ -875,7 +875,7 @@ checkFreeListSanity(void)
StgWord ln, min;
uint32_t node;
for (node = 0; node < RtsFlags.GcFlags.nNumaNodes; node++) {
for (node = 0; node < n_numa_nodes; node++) {
min = 1;
for (ln = 0; ln < NUM_FREE_LISTS; ln++) {
IF_DEBUG(block_alloc,
@@ -950,7 +950,7 @@ countFreeList(void)
StgWord ln;
uint32_t node;
for (node = 0; node < RtsFlags.GcFlags.nNumaNodes; node++) {
for (node = 0; node < n_numa_nodes; node++) {
for (ln=0; ln < NUM_FREE_LISTS; ln++) {
for (bd = free_list[node][ln]; bd != NULL; bd = bd->link) {
total_blocks += bd->blocks;
@@ -594,7 +594,7 @@ getMBlocksOnNode(uint32_t node, uint32_t n)
#ifdef DEBUG
if (RtsFlags.DebugFlags.numa) return addr; // faking NUMA
#endif
osBindMBlocksToNode(addr, n * MBLOCK_SIZE, RtsFlags.GcFlags.numaMap[node]);
osBindMBlocksToNode(addr, n * MBLOCK_SIZE, numa_map[node]);
return addr;
}
@@ -57,7 +57,7 @@ generation *oldest_gen = NULL; /* oldest generation, for convenience */
/*
* Array of nurseries, size == n_capabilities
*
* nursery[i] belongs to NUMA node (i % RtsFlags.GcFlags.nNumaNodes)
* nursery[i] belongs to NUMA node (i % n_numa_nodes)
* This is chosen to be the same convention as capabilities[i], so
* that when not using nursery chunks (+RTS -n), we just map
* capabilities to nurseries 1:1.
@@ -209,7 +209,7 @@ initStorage (void)
N = 0;
for (n = 0; n < RtsFlags.GcFlags.nNumaNodes; n++) {
for (n = 0; n < n_numa_nodes; n++) {
next_nursery[n] = n;
}
storageAddCapabilities(0, n_capabilities);
@@ -615,7 +615,7 @@ assignNurseriesToCapabilities (uint32_t from, uint32_t to)
for (i = from; i < to; i++) {
node = capabilities[i]->node;
assignNurseryToCapability(capabilities[i], next_nursery[node]);
next_nursery[node] += RtsFlags.GcFlags.nNumaNodes;
next_nursery[node] += n_numa_nodes;
}
}
@@ -642,7 +642,7 @@ resetNurseries (void)
{
uint32_t n;
for (n = 0; n < RtsFlags.GcFlags.nNumaNodes; n++) {
for (n = 0; n < n_numa_nodes; n++) {
next_nursery[n] = n;
}
assignNurseriesToCapabilities(0, n_capabilities);
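
A quick sketch of the hand-out order this striding produces, assuming 2 NUMA nodes and 8 nursery chunks: each successful claim advances next_nursery[node] by n_numa_nodes, so node 0 works through nurseries 0, 2, 4, 6 and node 1 through 1, 3, 5, 7 (getNewNursery below only falls back to another node's chunks once its own run out).

    #include <stdio.h>

    int main(void)
    {
        unsigned n_numa_nodes = 2, n_nurseries = 8;    /* illustrative sizes */
        for (unsigned node = 0; node < n_numa_nodes; node++) {
            printf("node %u claims nurseries:", node);
            for (unsigned i = node; i < n_nurseries; i += n_numa_nodes) {
                printf(" %u", i);
            }
            printf("\n");    /* node 0: 0 2 4 6   node 1: 1 3 5 7 */
        }
        return 0;
    }
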
@@ -758,22 +758,20 @@ getNewNursery (Capability *cap)
for(;;) {
i = next_nursery[node];
if (i < n_nurseries) {
if (cas(&next_nursery[node], i,
i+RtsFlags.GcFlags.nNumaNodes) == i) {
if (cas(&next_nursery[node], i, i+n_numa_nodes) == i) {
assignNurseryToCapability(cap, i);
return rtsTrue;
}
} else if (RtsFlags.GcFlags.nNumaNodes > 1) {
} else if (n_numa_nodes > 1) {
// Try to find an unused nursery chunk on other nodes. We'll get
// remote memory, but the rationale is that avoiding GC is better
// than avoiding remote memory access.
rtsBool lost = rtsFalse;
for (n = 0; n < RtsFlags.GcFlags.nNumaNodes; n++) {
for (n = 0; n < n_numa_nodes; n++) {
if (n == node) continue;
i = next_nursery[n];
if (i < n_nurseries) {
if (cas(&next_nursery[n], i,
i+RtsFlags.GcFlags.nNumaNodes) == i) {
if (cas(&next_nursery[n], i, i+n_numa_nodes) == i) {
assignNurseryToCapability(cap, i);
return rtsTrue;
} else {
@@ -350,3 +350,6 @@ test('T10296a', [extra_clean(['T10296a.o','T10296a_c.o','T10296a'])],
['$MAKE -s --no-print-directory T10296a'])
test('T10296b', [only_ways('threaded2')], compile_and_run, [''])
test('numa001', [ extra_run_opts('8'), extra_ways(['debug_numa']) ]
, compile_and_run, [''])
-- The new test program (numa001): fork the requested number of threads,
-- each computing the number of solutions to the 9-queens problem, to give
-- the NUMA-aware allocator some parallel work.
import System.Environment
import Control.Monad
import Control.Concurrent
main = do
[n] <- map read <$> getArgs
mvars <- replicateM n newEmptyMVar
sequence_ [ forkIO $ putMVar m $! nsoln n
| (m,n) <- zip mvars (repeat 9) ]
mapM_ takeMVar mvars
nsoln nq = length (gen nq)
where
safe :: Int -> Int -> [Int] -> Bool
safe x d [] = True
safe x d (q:l) = x /= q && x /= q+d && x /= q-d && safe x (d+1) l
gen :: Int -> [[Int]]
gen 0 = [[]]
gen n = [ (q:b) | b <- gen (n-1), q <- [1..nq], safe q 1 b]