Commit 0d1a8d09 authored by gcampax, committed by Simon Marlow

Two step allocator for 64-bit systems

Summary:
The current OS memory allocator conflates the concepts of allocating
address space and allocating memory, which makes the HEAP_ALLOCED()
implementation excessively complicated (as the only thing it cares
about is address space layout) and slow. Instead, what we want
is to reserve a single, insanely large, contiguous block of address
space (to make HEAP_ALLOCED() checks fast), and then commit sub-portions
of it in 1MB blocks as we did before.
This is currently behind a configure flag, USE_LARGE_ADDRESS_SPACE, which is
only enabled for certain OSes.
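
A minimal sketch of the idea on a 64-bit POSIX system (assuming
MAP_NORESERVE is available; the real code below also handles mblock
alignment, errors and per-OS differences):

    #include <stdio.h>
    #include <sys/mman.h>

    int main(void)
    {
        size_t reserve_sz = (size_t)1 << 40;    /* 1TB of address space  */
        size_t commit_sz  = 1024 * 1024;        /* one 1MB "mblock"      */

        /* Step 1: reserve address space only; no memory backs it yet. */
        void *base = mmap(NULL, reserve_sz, PROT_NONE,
                          MAP_ANON | MAP_PRIVATE | MAP_NORESERVE, -1, 0);
        if (base == MAP_FAILED) { perror("reserve"); return 1; }

        /* HEAP_ALLOCED(p) can now be: base <= p && p < base + reserve_sz */

        /* Step 2: commit just the first 1MB block so it can be used. */
        void *blk = mmap(base, commit_sz, PROT_READ | PROT_WRITE,
                         MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0);
        if (blk == MAP_FAILED) { perror("commit"); return 1; }

        ((char *)blk)[0] = 1;                   /* backed by real memory */
        munmap(base, reserve_sz);
        return 0;
    }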

Test Plan: validate

Reviewers: simonmar, ezyang, austin

Subscribers: thomie, carter

Differential Revision: https://phabricator.haskell.org/D524

GHC Trac Issues: #9706
parent b949c96b
@@ -968,6 +968,42 @@ else
fi
AC_SUBST(HavePapi)
dnl large address space support (see includes/rts/storage/MBlock.h)
dnl
dnl Darwin has vm_allocate/vm_protect
dnl Linux has mmap(MAP_NORESERVE)/madvise(MADV_DONTNEED)
dnl FreeBSD, Solaris and maybe others have MAP_NORESERVE/MADV_FREE
dnl (They also have MADV_DONTNEED, but it means something else!)
dnl
dnl Windows has VirtualAlloc MEM_RESERVE/MEM_COMMIT, however
dnl it counts page-table space as committed memory, and so it quickly
dnl runs out of paging file when we have multiple processes each
dnl reserving 1TB of address space; we get the following error:
dnl VirtualAlloc MEM_RESERVE 1099512676352 bytes failed: The paging file is too small for this operation to complete.
dnl
use_large_address_space=no
if test "$ac_cv_sizeof_void_p" -eq 8 ; then
if test "$ghc_host_os" = "darwin" ; then
use_large_address_space=yes
else
AC_CHECK_DECLS([MAP_NORESERVE, MADV_FREE, MADV_DONTNEED],[],[],
[
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <fcntl.h>
])
if test "$ac_cv_have_decl_MAP_NORESERVE" = "yes" &&
test "$ac_cv_have_decl_MADV_FREE" = "yes" ||
test "$ac_cv_have_decl_MADV_DONTNEED" = "yes" ; then
use_large_address_space=yes
fi
fi
fi
if test "$use_large_address_space" = "yes" ; then
AC_DEFINE([USE_LARGE_ADDRESS_SPACE], [1], [Enable single heap address space support])
fi
if test "$HAVE_DOCBOOK_XSL" = "NO" ||
test "$XsltprocCmd" = ""
then
......
@@ -19,203 +19,15 @@ extern void initMBlocks(void);
extern void * getMBlock(void);
extern void * getMBlocks(nat n);
extern void freeMBlocks(void *addr, nat n);
extern void releaseFreeMemory(void);
extern void freeAllMBlocks(void);
extern void *getFirstMBlock(void);
extern void *getNextMBlock(void *mblock);
extern void *getFirstMBlock(void **state);
extern void *getNextMBlock(void **state, void *mblock);
#ifdef THREADED_RTS
// needed for HEAP_ALLOCED below
extern SpinLock gc_alloc_block_sync;
#endif
/* -----------------------------------------------------------------------------
The HEAP_ALLOCED() test.
HEAP_ALLOCED is called FOR EVERY SINGLE CLOSURE during GC.
It needs to be FAST.
See wiki commentary at
http://ghc.haskell.org/trac/ghc/wiki/Commentary/HeapAlloced
Implementation of HEAP_ALLOCED
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Since heap is allocated in chunks of megablocks (MBLOCK_SIZE), we
can just use a table to record which megablocks in the address
space belong to the heap. On a 32-bit machine, with 1MB
megablocks, using 8 bits for each entry in the table, the table
requires 4KB. Lookups during GC will be fast, because the table
will be quickly cached (indeed, performance measurements showed no
measurable difference between doing the table lookup and using a
constant comparison).
On 64-bit machines, we cache one 12-bit block map that describes
4096 megablocks or 4GB of memory. If HEAP_ALLOCED is called for
an address that is not in the cache, it calls slowIsHeapAlloced
(see MBlock.c) which will find the block map for the 4GB block in
question.
-------------------------------------------------------------------------- */
#if SIZEOF_VOID_P == 4
extern StgWord8 mblock_map[];
/* On a 32-bit machine a 4KB table is always sufficient */
# define MBLOCK_MAP_SIZE 4096
# define MBLOCK_MAP_ENTRY(p) ((StgWord)(p) >> MBLOCK_SHIFT)
# define HEAP_ALLOCED(p) mblock_map[MBLOCK_MAP_ENTRY(p)]
# define HEAP_ALLOCED_GC(p) HEAP_ALLOCED(p)
/* -----------------------------------------------------------------------------
HEAP_ALLOCED for 64-bit machines.
Here are some cache layout options:
[1]
16KB cache of 16-bit entries, 1MB lines (capacity 8GB)
mblock size = 20 bits
entries = 8192 13 bits
line size = 0 bits (1 bit of value)
tag size = 15 bits
= 48 bits
[2]
32KB cache of 16-bit entries, 4MB lines (capacity 32GB)
mblock size = 20 bits
entries = 16384 14 bits
line size = 2 bits (4 bits of value)
tag size = 12 bits
= 48 bits
[3]
16KB cache of 16-bit entries, 2MB lines (capacity 16GB)
mblock size = 20 bits
entries = 8192 13 bits
line size = 1 bits (2 bits of value)
tag size = 14 bits
= 48 bits
[4]
4KB cache of 32-bit entries, 16MB lines (capacity 16GB)
mblock size = 20 bits
entries = 1024 10 bits
line size = 4 bits (16 bits of value)
tag size = 14 bits
= 48 bits
[5]
4KB cache of 64-bit entries, 32MB lines (capacity 16GB)
mblock size = 20 bits
entries = 512 9 bits
line size = 5 bits (32 bits of value)
tag size = 14 bits
= 48 bits
We actually use none of the above. After much experimentation it was
found that optimising the lookup is the most important factor,
followed by reducing the number of misses. To that end, we use a
variant of [1] in which each cache entry is ((mblock << 1) + value)
where value is 0 for non-heap and 1 for heap. The cache entries can
be 32 bits, since the mblock number is 48-20 = 28 bits, and we need
1 bit for the value. The cache can be as big as we like, but
currently we use 8k entries, giving us 8GB capacity.
---------------------------------------------------------------------------- */
#elif SIZEOF_VOID_P == 8
#define MBC_LINE_BITS 0
#define MBC_TAG_BITS 15
#if x86_64_HOST_ARCH
// 32 bits are enough for 'entry' as modern amd64 boxes have
// only 48-bit virtual addresses.
typedef StgWord32 MbcCacheLine;
#else
// 32 bits are not enough here as some arches (like ia64) use
// upper address bits to distinguish memory areas.
typedef StgWord64 MbcCacheLine;
#endif
typedef StgWord8 MBlockMapLine;
#define MBLOCK_MAP_LINE(p) (((StgWord)p & 0xffffffff) >> (MBLOCK_SHIFT + MBC_LINE_BITS))
#define MBC_LINE_SIZE (1<<MBC_LINE_BITS)
#define MBC_SHIFT (48 - MBLOCK_SHIFT - MBC_LINE_BITS - MBC_TAG_BITS)
#define MBC_ENTRIES (1<<MBC_SHIFT)
extern MbcCacheLine mblock_cache[];
#define MBC_LINE(p) ((StgWord)p >> (MBLOCK_SHIFT + MBC_LINE_BITS))
#define MBLOCK_MAP_ENTRIES (1 << (32 - MBLOCK_SHIFT - MBC_LINE_BITS))
typedef struct {
StgWord32 addrHigh32;
MBlockMapLine lines[MBLOCK_MAP_ENTRIES];
} MBlockMap;
extern W_ mpc_misses;
StgBool HEAP_ALLOCED_miss(StgWord mblock, void *p);
INLINE_HEADER
StgBool HEAP_ALLOCED(void *p)
{
StgWord mblock;
nat entry_no;
MbcCacheLine entry, value;
mblock = (StgWord)p >> MBLOCK_SHIFT;
entry_no = mblock & (MBC_ENTRIES-1);
entry = mblock_cache[entry_no];
value = entry ^ (mblock << 1);
// this formulation coaxes gcc into prioritising the value==1
// case, which we expect to be the most common.
// __builtin_expect() didn't have any useful effect (gcc-4.3.0).
if (value == 1) {
return 1;
} else if (value == 0) {
return 0;
} else {
// putting the rest out of line turned out to be a slight
// performance improvement:
return HEAP_ALLOCED_miss(mblock,p);
}
}
// In the parallel GC, the cache itself is safe to *read*, and can be
// updated atomically, but we need to place a lock around operations
// that touch the MBlock map.
INLINE_HEADER
StgBool HEAP_ALLOCED_GC(void *p)
{
StgWord mblock;
nat entry_no;
MbcCacheLine entry, value;
StgBool b;
mblock = (StgWord)p >> MBLOCK_SHIFT;
entry_no = mblock & (MBC_ENTRIES-1);
entry = mblock_cache[entry_no];
value = entry ^ (mblock << 1);
if (value == 1) {
return 1;
} else if (value == 0) {
return 0;
} else {
// putting the rest out of line turned out to be a slight
// performance improvement:
ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
b = HEAP_ALLOCED_miss(mblock,p);
RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
return b;
}
}
#else
# error HEAP_ALLOCED not defined
#endif
#endif /* RTS_STORAGE_MBLOCK_H */
@@ -14,6 +14,7 @@
#include "Trace.h"
#include "Prelude.h"
#include "Sparks.h"
#include "sm/HeapAlloc.h"
#if defined(THREADED_RTS)
......
@@ -13,6 +13,7 @@
#include "RtsUtils.h"
#include "sm/OSMem.h"
#include "sm/HeapAlloc.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
@@ -72,23 +73,67 @@ void osMemInit(void)
-------------------------------------------------------------------------- */
// A wrapper around mmap(), to abstract away from OS differences in
// the mmap() interface.
/*
A wrapper around mmap(), to abstract away from OS differences in
the mmap() interface.
It supports the following operations:
- reserve: find a new chunk of available address space, and make it so
that we own it (no other library will get it), but don't actually
allocate memory for it
the addr is a hint for where to place the memory (and most
of the time the OS happily ignores it!)
- commit: given a chunk of address space that we know we own, make sure
there is some memory backing it
the addr is not a hint, it must point into previously reserved
address space, or bad things happen
- reserve&commit: do both at the same time
The naming is borrowed from the Win32 API (VirtualAlloc), which does the
same thing and has done so forever, while support for this in Unix systems
has only been added recently and is hidden in the POSIX portability mess.
It is confusing because to get the reserve behavior we need MAP_NORESERVE
(which tells the kernel not to allocate backing space), but heh...
*/
enum
{
MEM_RESERVE = 1,
MEM_COMMIT = 2,
MEM_RESERVE_AND_COMMIT = MEM_RESERVE | MEM_COMMIT
};
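/* Illustrative use of the wrapper (a hypothetical caller, shown only to make
 * the semantics concrete): reserve a large range once, then commit pieces of
 * it on demand.
 *
 *     void *heap = my_mmap(NULL, (W_)1 << 40, MEM_RESERVE);
 *     // ... later, back the first megablock with real memory:
 *     my_mmap(heap, MBLOCK_SIZE, MEM_COMMIT);
 *     // or reserve and commit in one step, for memory used right away:
 *     void *blk = my_mmap(NULL, MBLOCK_SIZE, MEM_RESERVE_AND_COMMIT);
 *
 * osReserveHeapMemory() and osCommitMemory() below follow this pattern.
 */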
static void *
my_mmap (void *addr, W_ size)
my_mmap (void *addr, W_ size, int operation)
{
void *ret;
int prot, flags;
if (operation & MEM_COMMIT)
prot = PROT_READ | PROT_WRITE;
else
prot = PROT_NONE;
if (operation == MEM_RESERVE)
flags = MAP_NORESERVE;
else if (operation == MEM_COMMIT)
flags = MAP_FIXED;
else
flags = 0;
#if defined(solaris2_HOST_OS) || defined(irix_HOST_OS)
{
int fd = open("/dev/zero",O_RDONLY);
ret = mmap(addr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
close(fd);
if (operation & MEM_RESERVE)
{
int fd = open("/dev/zero",O_RDONLY);
ret = mmap(addr, size, prot, flags | MAP_PRIVATE, fd, 0);
close(fd);
}
else
{
ret = mmap(addr, size, prot, flags | MAP_PRIVATE, -1, 0);
}
}
#elif hpux_HOST_OS
ret = mmap(addr, size, PROT_READ | PROT_WRITE,
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
ret = mmap(addr, size, prot, flags | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
#elif darwin_HOST_OS
// Without MAP_FIXED, Apple's mmap ignores addr.
// With MAP_FIXED, it overwrites already mapped regions, whic
@@ -100,10 +145,16 @@ my_mmap (void *addr, W_ size)
kern_return_t err = 0;
ret = addr;
if(addr) // try to allocate at address
err = vm_allocate(mach_task_self(),(vm_address_t*) &ret, size, FALSE);
if(!addr || err) // try to allocate anywhere
err = vm_allocate(mach_task_self(),(vm_address_t*) &ret, size, TRUE);
if(operation & MEM_RESERVE)
{
if(addr) // try to allocate at address
err = vm_allocate(mach_task_self(),(vm_address_t*) &ret,
size, FALSE);
if(!addr || err) // try to allocate anywhere
err = vm_allocate(mach_task_self(),(vm_address_t*) &ret,
size, TRUE);
}
if(err) {
// don't know what the error codes mean exactly, assume it's
@@ -111,23 +162,24 @@ my_mmap (void *addr, W_ size)
errorBelch("memory allocation failed (requested %" FMT_Word " bytes)",
size);
stg_exit(EXIT_FAILURE);
} else {
}
if(operation & MEM_COMMIT) {
vm_protect(mach_task_self(), (vm_address_t)ret, size, FALSE,
VM_PROT_READ|VM_PROT_WRITE);
}
#elif linux_HOST_OS
ret = mmap(addr, size, PROT_READ | PROT_WRITE,
MAP_ANON | MAP_PRIVATE, -1, 0);
ret = mmap(addr, size, prot, flags | MAP_ANON | MAP_PRIVATE, -1, 0);
if (ret == (void *)-1 && errno == EPERM) {
// Linux may return EPERM if it tried to give us
// a chunk of address space below mmap_min_addr,
// See Trac #7500.
if (addr != 0) {
if (addr != 0 && (operation & MEM_RESERVE)) {
// Try again with no hint address.
// It's not clear that this can ever actually help,
// but since our alternative is to abort, we may as well try.
ret = mmap(0, size, PROT_READ | PROT_WRITE,
MAP_ANON | MAP_PRIVATE, -1, 0);
ret = mmap(0, size, prot, flags | MAP_ANON | MAP_PRIVATE, -1, 0);
}
if (ret == (void *)-1 && errno == EPERM) {
// Linux is not willing to give us any mapping,
@@ -137,8 +189,7 @@ my_mmap (void *addr, W_ size)
}
}
#else
ret = mmap(addr, size, PROT_READ | PROT_WRITE,
MAP_ANON | MAP_PRIVATE, -1, 0);
ret = mmap(addr, size, prot, flags | MAP_ANON | MAP_PRIVATE, -1, 0);
#endif
if (ret == (void *)-1) {
@@ -168,7 +219,7 @@ gen_map_mblocks (W_ size)
// Try to map a larger block, and take the aligned portion from
// it (unmap the rest).
size += MBLOCK_SIZE;
ret = my_mmap(0, size);
ret = my_mmap(0, size, MEM_RESERVE_AND_COMMIT);
// unmap the slop bits around the chunk we allocated
slop = (W_)ret & MBLOCK_MASK;
@@ -207,7 +258,7 @@ osGetMBlocks(nat n)
// use gen_map_mblocks the first time.
ret = gen_map_mblocks(size);
} else {
ret = my_mmap(next_request, size);
ret = my_mmap(next_request, size, MEM_RESERVE_AND_COMMIT);
if (((W_)ret & MBLOCK_MASK) != 0) {
// misaligned block!
@@ -244,10 +295,11 @@ void osReleaseFreeMemory(void) {
void osFreeAllMBlocks(void)
{
void *mblock;
void *state;
for (mblock = getFirstMBlock();
for (mblock = getFirstMBlock(&state);
mblock != NULL;
mblock = getNextMBlock(mblock)) {
mblock = getNextMBlock(&state, mblock)) {
munmap(mblock, MBLOCK_SIZE);
}
}
@@ -318,3 +370,103 @@ void setExecutable (void *p, W_ len, rtsBool exec)
barf("setExecutable: failed to protect 0x%p\n", p);
}
}
#ifdef USE_LARGE_ADDRESS_SPACE
static void *
osTryReserveHeapMemory (void *hint)
{
void *base, *top;
void *start, *end;
/* We try to reserve MBLOCK_SPACE_SIZE + MBLOCK_SIZE,
because we need memory which is MBLOCK_SIZE-aligned,
and then we discard what we don't need */
base = my_mmap(hint, MBLOCK_SPACE_SIZE + MBLOCK_SIZE, MEM_RESERVE);
top = (void*)((W_)base + MBLOCK_SPACE_SIZE + MBLOCK_SIZE);
if (((W_)base & MBLOCK_MASK) != 0) {
start = MBLOCK_ROUND_UP(base);
end = MBLOCK_ROUND_DOWN(top);
ASSERT(((W_)end - (W_)start) == MBLOCK_SPACE_SIZE);
if (munmap(base, (W_)start-(W_)base) < 0) {
sysErrorBelch("unable to release slop before heap");
}
if (munmap(end, (W_)top-(W_)end) < 0) {
sysErrorBelch("unable to release slop after heap");
}
} else {
start = base;
}
return start;
}
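/* A worked example of the alignment trick above (illustrative numbers): with
 * 1MB mblocks, if the OS happens to return base = 0x7f0012345000, then
 * top = base + MBLOCK_SPACE_SIZE + MBLOCK_SIZE and
 *
 *     start = MBLOCK_ROUND_UP(base)  = 0x7f0012400000
 *     end   = MBLOCK_ROUND_DOWN(top) = start + MBLOCK_SPACE_SIZE
 *
 * so after the two munmap() calls exactly MBLOCK_SPACE_SIZE of mblock-aligned,
 * reserved (but still uncommitted) address space remains.
 */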
void *osReserveHeapMemory(void)
{
int attempt;
void *at;
/* We want to ensure the heap starts at least 8 GB inside the address space,
to make sure that any dynamically loaded code will be close enough to the
original code so that short relocations will work. This is particularly
important on Darwin/Mach-O, because object files not compiled as shared
libraries are position independent but cannot be loaded above 4GB.
We do so with a hint to mmap, and we verify that the OS satisfied our
hint. We loop a few times in case there is already something allocated
there, but we bail out if we cannot allocate at all.
*/
attempt = 0;
do {
at = osTryReserveHeapMemory((void*)((W_)8 * (1 << 30) +
attempt * BLOCK_SIZE));
} while ((W_)at < ((W_)8 * (1 << 30)));
return at;
}
void osCommitMemory(void *at, W_ size)
{
my_mmap(at, size, MEM_COMMIT);
}
void osDecommitMemory(void *at, W_ size)
{
int r;
// First make the memory inaccessible (so that we get a segfault
// at the next attempt to touch it)
// We only do this in DEBUG because it forces the OS to remove
// all MMU entries for this page range, and there is no reason
// to do so unless there is memory pressure
#ifdef DEBUG
r = mprotect(at, size, PROT_NONE);
if(r < 0)
sysErrorBelch("unable to make released memory unaccessible");
#endif
#ifdef MADV_FREE
// Try MADV_FREE first; FreeBSD has both, and there MADV_DONTNEED
// just swaps memory out
r = madvise(at, size, MADV_FREE);
#else
r = madvise(at, size, MADV_DONTNEED);
#endif
if(r < 0)
sysErrorBelch("unable to decommit memory");
}
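/* An illustrative decommit/recommit cycle (hypothetical caller, only to show
 * how the pieces fit together):
 *
 *     osDecommitMemory(mblock, MBLOCK_SIZE);  // backing pages given back
 *     // ... some time later, reuse the very same addresses:
 *     osCommitMemory(mblock, MBLOCK_SIZE);    // fresh zeroed pages again
 *
 * The range stays reserved throughout, so HEAP_ALLOCED() keeps returning 1
 * for pointers into it even while no memory backs it.
 */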
void osReleaseHeapMemory(void)
{
int r;
r = munmap((void*)mblock_address_space_begin, MBLOCK_SPACE_SIZE);
if(r < 0)
sysErrorBelch("unable to release address space");
}
#endif
@@ -736,7 +736,14 @@ void returnMemoryToOS(nat n /* megablocks */)
}
free_mblock_list = bd;
osReleaseFreeMemory();
// Ask the MBlock layer to release any address space that was
// associated with the just-released MBlocks.
//
// Historically, we used to ask the OS directly (via
// osReleaseFreeMemory()); now the MBlock layer might have a
// reason to preserve the address space range, so we go through
// releaseFreeMemory() and let it decide.
releaseFreeMemory();
IF_DEBUG(gc,
if (n != 0) {
@@ -869,11 +876,12 @@ void
reportUnmarkedBlocks (void)
{
void *mblock;
void *state;
bdescr *bd;
debugBelch("Unreachable blocks:\n");
for (mblock = getFirstMBlock(); mblock != NULL;
mblock = getNextMBlock(mblock)) {
for (mblock = getFirstMBlock(&state); mblock != NULL;
mblock = getNextMBlock(&state, mblock)) {
for (bd = FIRST_BDESCR(mblock); bd <= LAST_BDESCR(mblock); ) {
if (!(bd->flags & BF_KNOWN) && bd->free != (P_)-1) {
debugBelch(" %p\n",bd);
......
@@ -16,6 +16,8 @@
#include "BeginPrivate.h"
#include "HeapAlloc.h"
void GarbageCollect (rtsBool force_major_gc,
rtsBool do_heap_census,
nat gc_type, Capability *cap);
......
/* -----------------------------------------------------------------------------
*
* (c) The University of Glasgow 2006-2008
*
* The HEAP_ALLOCED() test.
*
* ---------------------------------------------------------------------------*/
#ifndef SM_HEAP_ALLOC_H
#define SM_HEAP_ALLOC_H
#include "BeginPrivate.h"
/* -----------------------------------------------------------------------------
The HEAP_ALLOCED() test.
HEAP_ALLOCED is called FOR EVERY SINGLE CLOSURE during GC.
It needs to be FAST.
See wiki commentary at
http://ghc.haskell.org/trac/ghc/wiki/Commentary/HeapAlloced
Implementation of HEAP_ALLOCED
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Since heap is allocated in chunks of megablocks (MBLOCK_SIZE), we
can just use a table to record which megablocks in the address
space belong to the heap. On a 32-bit machine, with 1MB
megablocks, using 8 bits for each entry in the table, the table
requires 4KB. Lookups during GC will be fast, because the table
will be quickly cached (indeed, performance measurements showed no
measurable difference between doing the table lookup and using a
constant comparison).
On 64-bit machines, we have two possibilities. One is to request
a single chunk of address space that we deem "large enough"
(currently 1TB, which could easily be extended to, say, 16TB or more).
Memory from that chunk is GC memory, everything else is not. This
case is tricky in that it requires support from the OS to allocate
address space without allocating memory (in practice, all modern
OSes do this). It's also tricky in that it is the only case where
a successful HEAP_ALLOCED(p) check can trigger a segfault when
accessing p (and in DEBUG builds it deliberately will, since decommitted
memory is protected with PROT_NONE).
Alternatively, the older implementation caches one 12-bit block map
that describes 4096 megablocks or 4GB of memory. If HEAP_ALLOCED is
called for an address that is not in the cache, it calls
slowIsHeapAlloced (see MBlock.c) which will find the block map for
the 4GB block in question.
-------------------------------------------------------------------------- */
#ifdef USE_LARGE_ADDRESS_SPACE
extern W_ mblock_address_space_begin;
# define MBLOCK_SPACE_SIZE ((StgWord)1 << 40) /* 1 TB */
# define HEAP_ALLOCED(p) ((W_)(p) >= mblock_address_space_begin && \
(W_)(p) < (mblock_address_space_begin + \
MBLOCK_SPACE_SIZE))
# define HEAP_ALLOCED_GC(p) HEAP_ALLOCED(p)
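/* A worked example (hypothetical base address): if the reserved chunk starts
 * at mblock_address_space_begin = 0x4000000000 (256GB into the address
 * space), then with MBLOCK_SPACE_SIZE = 1TB:
 *
 *     HEAP_ALLOCED((void*)0x4000123456)   is 1  (inside [begin, begin+1TB))
 *     HEAP_ALLOCED((void*)0x1000)         is 0  (below the reserved range)
 *     HEAP_ALLOCED((void*)0x14000000000)  is 0  (at or above begin + 1TB)
 *
 * i.e. the test is just two unsigned comparisons, with no table lookup, which
 * is what makes this variant fast.
 */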
#elif SIZEOF_VOID_P == 4
extern StgWord8 mblock_map[];
/* On a 32-bit machine a 4KB table is always sufficient */
# define MBLOCK_MAP_SIZE 4096
# define MBLOCK_MAP_ENTRY(p) ((StgWord)(p) >> MBLOCK_SHIFT)
# define HEAP_ALLOCED(p) mblock_map[MBLOCK_MAP_ENTRY(p)]
# define HEAP_ALLOCED_GC(p) HEAP_ALLOCED(p)
/* -----------------------------------------------------------------------------
HEAP_ALLOCED for 64-bit machines (without LARGE_ADDRESS_SPACE).
Here are some cache layout options:
[1]
16KB cache of 16-bit entries, 1MB lines (capacity 8GB)
mblock size = 20 bits
entries = 8192 13 bits
line size = 0 bits (1 bit of value)
tag size = 15 bits
= 48 bits
[2]
32KB cache of 16-bit entries, 4MB lines (capacity 32GB)
mblock size = 20 bits
entries = 16384 14 bits
line size = 2 bits (4 bits of value)
tag size = 12 bits
= 48 bits
[3]
16KB cache of 16-bit entries, 2MB lines (capacity 16GB)
mblock size = 20 bits
entries = 8192 13 bits
line size = 1 bits (2 bits of value)
tag size = 14 bits
= 48 bits
[4]
4KB cache of 32-bit entries, 16MB lines (capacity 16GB)
mblock size = 20 bits
entries = 1024 10 bits
line size = 4 bits (16 bits of value)
tag size = 14 bits
= 48 bits
[5]
4KB cache of 64-bit entries, 32MB lines (capacity 16GB)
mblock size = 20 bits
entries = 512 9 bits
line size = 5 bits (32 bits of value)
tag size = 14 bits
= 48 bits
We actually use none of the above. After much experimentation it was
found that optimising the lookup is the most important factor,
followed by reducing the number of misses. To that end, we use a
variant of [1] in which each cache entry is ((mblock << 1) + value)
where value is 0 for non-heap and 1 for heap. The cache entries can
be 32 bits, since the mblock number is 48-20 = 28 bits, and we need
1 bit for the value. The cache can be as big as we like, but
currently we use 8k entries, giving us 8GB capacity.
---------------------------------------------------------------------------- */
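/* A worked example of that encoding (illustrative values): for a pointer
 * whose mblock number is 0x123456, a "this is heap" cache entry holds
 * (0x123456 << 1) + 1 = 0x2468AD, and the lookup computes
 * entry ^ (mblock << 1):
 *
 *     0x2468AD ^ 0x2468AC  ==  1    -> cached: heap
 *     0x2468AC ^ 0x2468AC  ==  0    -> cached: not heap
 *     any other result     >=  2    -> cache miss, consult the mblock map
 */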
#elif SIZEOF_VOID_P == 8
#define MBC_LINE_BITS 0
#define MBC_TAG_BITS 15
#if x86_64_HOST_ARCH
// 32 bits are enough for 'entry' as modern amd64 boxes have
// only 48-bit virtual addresses.
typedef StgWord32 MbcCacheLine;
#else
// 32 bits are not enough here as some arches (like ia64) use
// upper address bits to distinguish memory areas.
typedef StgWord64 MbcCacheLine;
#endif
typedef StgWord8 MBlockMapLine;
#define MBLOCK_MAP_LINE(p) (((StgWord)p & 0xffffffff) >> (MBLOCK_SHIFT + MBC_LINE_BITS))
#define MBC_LINE_SIZE (1<<MBC_LINE_BITS)
#define MBC_SHIFT (48 - MBLOCK_SHIFT - MBC_LINE_BITS - MBC_TAG_BITS)
#define MBC_ENTRIES (1<<MBC_SHIFT)
extern MbcCacheLine mblock_cache[];
#define MBC_LINE(p) ((StgWord)p >> (MBLOCK_SHIFT + MBC_LINE_BITS))
#define MBLOCK_MAP_ENTRIES (1 << (32 - MBLOCK_SHIFT - MBC_LINE_BITS))
typedef struct {
StgWord32 addrHigh32;
MBlockMapLine lines[MBLOCK_MAP_ENTRIES];
} MBlockMap;