Commit c93813d9 authored by Tamar Christina's avatar Tamar Christina

Add NUMA support for Windows

Summary:
NOTE: I have been able to do simple testing on emulated NUMA nodes.
           Real hardware would be needed for a proper test.

D2199 added NUMA support for Linux. This change fills in the missing Windows
pieces, following the description of the Linux APIs.

Test Plan:
Use `bcdedit.exe /set groupsize 2` to modify the kernel again (Similar to D2533).

This generates some NUMA nodes:

```
Logical Processor to NUMA Node Map:
NUMA Node 0:
**
--
NUMA Node 1:
--
**

Approximate Cross-NUMA Node Access Cost (relative to fastest):
     00  01
00: 1.1 1.1
01: 1.0 1.0
```

run ` ../test-numa.exe +RTS --numa -RTS`

and check PerfMon for NUMA allocations.

Reviewers: simonmar, erikd, bgamari, austin

Reviewed By: simonmar

Subscribers: thomie, #ghc_windows_task_force

Differential Revision: https://phabricator.haskell.org/D2534

GHC Trac Issues: #12602
parent 1e795a00
......@@ -71,6 +71,8 @@ Runtime system
event log, allowing heap profiles to be correlated with other tracing events
(see :ghc-ticket:`11094`).
- Added NUMA support to Windows.
- Added processor group support for Windows. This allows the runtime to allocate
threads to all cores in systems which have multiple processor groups.
(e.g. > 64 cores, see :ghc-ticket:`11054`)
......
......@@ -11,9 +11,7 @@
#include "sm/HeapAlloc.h"
#include "RtsUtils.h"
#if HAVE_WINDOWS_H
#include <windows.h>
#endif
typedef struct alloc_rec_ {
char* base; // non-aligned base address, directly from VirtualAlloc
......@@ -39,11 +37,28 @@ static alloc_rec* allocs = NULL;
/* free_blocks are kept in ascending order, and adjacent blocks are merged */
static block_rec* free_blocks = NULL;
/* Mingw-w64 does not currently have this in their header. So we have to import it.
   The pointer type below mirrors how the function is called in this file:
   (process handle, base address, size, allocation type, page protection,
   preferred NUMA node) -> base address of the allocation, or NULL on failure. */
typedef LPVOID(WINAPI *VirtualAllocExNumaProc)(HANDLE, LPVOID, SIZE_T, DWORD, DWORD, DWORD);
/* Cache NUMA API call. Assigned by osMemInit() via GetProcAddress; it is not
   explicitly initialised here, so as a file-scope object it starts out NULL.
   NOTE(review): this object has external linkage and reuses the name of the
   Win32 API function - confirm that no included header declares that name. */
VirtualAllocExNumaProc VirtualAllocExNuma;
/* Initialise the OS memory layer.
 *
 * Resets the allocation bookkeeping lists and, when NUMA is both available
 * and requested through the RTS flags, resolves VirtualAllocExNuma from
 * kernel32 and caches the pointer for osBindMBlocksToNode.
 *
 * Failing to resolve the entry point is fatal: osBindMBlocksToNode calls
 * through the cached pointer, so carrying on with a NULL pointer would only
 * postpone the failure to an uncontrolled crash.
 */
void
osMemInit(void)
{
    allocs = NULL;
    free_blocks = NULL;

    /* Resolve and cache VirtualAllocExNuma. */
    if (osNumaAvailable() && RtsFlags.GcFlags.numa)
    {
        VirtualAllocExNuma = (VirtualAllocExNumaProc)GetProcAddress(
            GetModuleHandleW(L"kernel32"), "VirtualAllocExNuma");
        if (!VirtualAllocExNuma)
        {
            sysErrorBelch(
                "osMemInit: VirtualAllocExNuma does not exist. How did you get this far?");
            stg_exit(EXIT_FAILURE);
        }
    }
}
static
......@@ -486,22 +501,72 @@ void osReleaseHeapMemory (void)
/* Report whether NUMA-aware allocation is worth attempting: true only when
 * osNumaNodes() reports more than one node.
 *
 * The earlier unconditional `return rtsFalse;` stub has been dropped; it made
 * the real check unreachable.
 */
rtsBool osNumaAvailable(void)
{
    return osNumaNodes() > 1;
}
/* Return the number of NUMA node slots on this machine (always >= 1).
 *
 * The value is queried once and cached. If the query fails, a single node is
 * assumed.
 *
 * GetNumaHighestNodeNumber reports the highest node *number*, which is
 * zero-based, so the number of node indices is that value plus one.
 * NOTE(review): node numbers are not guaranteed to be contiguous, so this is
 * an upper bound on valid node indices rather than an exact count of
 * populated nodes - confirm callers tolerate empty nodes.
 */
uint32_t osNumaNodes(void)
{
    /* Cached result; 0 means "not queried yet" since a real result is >= 1. */
    static ULONG numNumaNodes = 0;

    if (!numNumaNodes)
    {
        ULONG highestNode = 0;
        if (GetNumaHighestNodeNumber(&highestNode))
        {
            numNumaNodes = highestNode + 1;
        }
        else
        {
            /* Query failed: behave as a non-NUMA machine. */
            numNumaNodes = 1;
        }
    }

    return numNumaNodes;
}
StgWord osNumaMask(void)
{
return 1;
StgWord numaMask;
if (!GetNumaNodeProcessorMask(0, &numaMask))
{
return 1;
}
return numaMask;
}
void osBindMBlocksToNode(
void *addr STG_UNUSED,
StgWord size STG_UNUSED,
uint32_t node STG_UNUSED)
void *addr,
StgWord size,
uint32_t node)
{
if (osNumaAvailable())
{
void* temp;
if (RtsFlags.GcFlags.numa) {
/* Note [base memory]
I would like to use addr here to specify the base
memory of allocation. The problem is that the address
we are requesting is too high. I can't figure out if it's
because of my NUMA-emulation or a bug in the code.
On windows also -xb is broken, it does nothing so that can't
be used to tweak it (see #12577). So for now, just let the OS decide.
*/
temp = VirtualAllocExNuma(
GetCurrentProcess(),
NULL, // addr? See base memory
size,
MEM_RESERVE | MEM_COMMIT,
PAGE_READWRITE,
node
);
if (!temp) {
if (GetLastError() == ERROR_NOT_ENOUGH_MEMORY) {
errorBelch("out of memory");
}
else {
sysErrorBelch(
"osBindMBlocksToNode: VirtualAllocExNuma MEM_RESERVE %llu bytes "
"at address %p bytes failed",
size, addr);
}
stg_exit(EXIT_FAILURE);
}
}
}
}
......@@ -9,6 +9,7 @@
#include "Rts.h"
#include <windows.h>
#include "sm/OSMem.h"
#if defined(THREADED_RTS)
#include "RtsUtils.h"
......@@ -572,8 +573,48 @@ interruptOSThread (OSThreadId id)
CloseHandle(hdl);
}
void setThreadNode (uint32_t node STG_UNUSED) { /* nothing */ }
void releaseThreadNode (void) { /* nothing */ }
void setThreadNode (uint32_t node)
{
if (osNumaAvailable())
{
StgWord mask = 0;
mask |= 1 << node;
if (!SetThreadAffinityMask(GetCurrentThread(), mask))
{
sysErrorBelch(
"setThreadNode: Error setting affinity of thread to NUMA node `%u': %lu.",
node, GetLastError());
stg_exit(EXIT_FAILURE);
}
}
}
/* Undo setThreadNode: give the calling thread the affinity mask of the whole
 * process again.
 *
 * Does nothing when NUMA is not available. If either the process mask cannot
 * be read or the thread mask cannot be set, the error is reported and
 * stg_exit(EXIT_FAILURE) is called.
 */
void releaseThreadNode (void)
{
    StgWord procAffinity;
    StgWord sysAffinity;   /* required by the API, otherwise unused */

    if (!osNumaAvailable())
    {
        return;
    }

    if (!GetProcessAffinityMask(GetCurrentProcess(), &procAffinity, &sysAffinity))
    {
        sysErrorBelch(
            "releaseThreadNode: Error resetting affinity of thread: %lu",
            GetLastError());
        stg_exit(EXIT_FAILURE);
    }

    if (!SetThreadAffinityMask(GetCurrentThread(), procAffinity))
    {
        sysErrorBelch(
            "releaseThreadNode: Error reseting NUMA affinity mask of thread: %lu.",
            GetLastError());
        stg_exit(EXIT_FAILURE);
    }
}
#else /* !defined(THREADED_RTS) */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment