HeapAlloc.h 7.46 KB
Newer Older
1 2 3 4 5 6 7 8
/* -----------------------------------------------------------------------------
 *
 * (c) The University of Glasgow 2006-2008
 *
 * The HEAP_ALLOCED() test.
 *
 * ---------------------------------------------------------------------------*/

9
#pragma once
10 11 12 13 14 15 16 17 18 19

#include "BeginPrivate.h"

/* -----------------------------------------------------------------------------
   The HEAP_ALLOCED() test.

   HEAP_ALLOCED is called FOR EVERY SINGLE CLOSURE during GC.
   It needs to be FAST.

   See wiki commentary at
20
     https://gitlab.haskell.org/ghc/ghc/wikis/commentary/heap-alloced
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35

   Implementation of HEAP_ALLOCED
   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

   Since heap is allocated in chunks of megablocks (MBLOCK_SIZE), we
   can just use a table to record which megablocks in the address
   space belong to the heap.  On a 32-bit machine, with 1Mb
   megablocks, using 8 bits for each entry in the table, the table
   requires 4k.  Lookups during GC will be fast, because the table
   will be quickly cached (indeed, performance measurements showed no
   measurable difference between doing the table lookup and using a
   constant comparison).

   On 64-bit machines, we have two possibilities. One is to request
   a single chunk of address space that we deem "large enough"
36 37 38 39 40 41
   (currently 1TB or the ulimit size, whichever is smaller, although this could
   easily be extended to, say 16TB or more). Memory from that chunk is GC
   memory, everything else is not. This case is tricky in that it requires
   support from the OS to allocate address space without allocating memory (in
   practice, all modern OSes do this). It's also tricky in that it is the only
   case where a successful HEAP_ALLOCED(p) check can trigger a segfault when
42 43 44 45 46 47 48 49 50
   accessing p (and for debugging purposes, it will).

   Alternatively, the older implementation caches one 12-bit block map
   that describes 4096 megablocks or 4GB of memory. If HEAP_ALLOCED is
   called for an address that is not in the cache, it calls
   slowIsHeapAlloced (see MBlock.c) which will find the block map for
   the 4GB block in question.
   -------------------------------------------------------------------------- */

Ben Gamari's avatar
Ben Gamari committed
51
#if defined(USE_LARGE_ADDRESS_SPACE)
52

53 54 55 56 57
struct mblock_address_range {
    W_ begin, end;
    W_ padding[6];  // ensure nothing else inhabits this cache line
} ATTRIBUTE_ALIGNED(64);
extern struct mblock_address_range mblock_address_space;
58

59 60
# define HEAP_ALLOCED(p)        ((W_)(p) >= mblock_address_space.begin && \
                                 (W_)(p) < (mblock_address_space.end))
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132
# define HEAP_ALLOCED_GC(p)     HEAP_ALLOCED(p)

#elif SIZEOF_VOID_P == 4
extern StgWord8 mblock_map[];

/* On a 32-bit machine a 4KB table is always sufficient */
# define MBLOCK_MAP_SIZE        4096
# define MBLOCK_MAP_ENTRY(p)    ((StgWord)(p) >> MBLOCK_SHIFT)
# define HEAP_ALLOCED(p)        mblock_map[MBLOCK_MAP_ENTRY(p)]
# define HEAP_ALLOCED_GC(p)     HEAP_ALLOCED(p)

/* -----------------------------------------------------------------------------
   HEAP_ALLOCED for 64-bit machines (without LARGE_ADDRESS_SPACE).

 Here are some cache layout options:

 [1]
 16KB cache of 16-bit entries, 1MB lines (capacity 8GB)
  mblock size =          20 bits
  entries   =     8192   13 bits
  line size =             0 bits (1 bit of value)
  tag size  =            15 bits
                       = 48 bits

 [2]
 32KB cache of 16-bit entries, 4MB lines (capacity 32GB)
  mblock size =          20 bits
  entries   =    16384   14 bits
  line size =             2 bits (4 bits of value)
  tag size  =            12 bits
                       = 48 bits

 [3]
 16KB cache of 16-bit entries, 2MB lines (capacity 16GB)
  mblock size =          20 bits
  entries   =    8192    13 bits
  line size =             1 bits (2 bits of value)
  tag size  =            14 bits
                       = 48 bits

 [4]
 4KB cache of 32-bit entries, 16MB lines (capacity 16GB)
  mblock size =          20 bits
  entries   =     1024   10 bits
  line size =             4 bits (16 bits of value)
  tag size  =            14 bits
                       = 48 bits

 [5]
 4KB cache of 64-bit entries, 32MB lines (capacity 16GB)
  mblock size =          20 bits
  entries   =     512     9 bits
  line size =             5 bits (32 bits of value)
  tag size  =            14 bits
                       = 48 bits

 We actually use none of the above.  After much experimentation it was
 found that optimising the lookup is the most important factor,
 followed by reducing the number of misses.  To that end, we use a
 variant of [1] in which each cache entry is ((mblock << 1) + value)
 where value is 0 for non-heap and 1 for heap.  The cache entries can
 be 32 bits, since the mblock number is 48-20 = 28 bits, and we need
 1 bit for the value.  The cache can be as big as we like, but
 currently we use 8k entries, giving us 8GB capacity.

 ---------------------------------------------------------------------------- */

#elif SIZEOF_VOID_P == 8

#define MBC_LINE_BITS 0
#define MBC_TAG_BITS 15

133
#if defined(x86_64_HOST_ARCH)
134
// 32bits are enough for 'entry' as modern amd64 boxes have
Gabor Greif's avatar
Gabor Greif committed
135
// only 48bit sized virtual address.
136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
typedef StgWord32 MbcCacheLine;
#else
// 32bits is not enough here as some arches (like ia64) use
// upper address bits to distinct memory areas.
typedef StgWord64 MbcCacheLine;
#endif

typedef StgWord8  MBlockMapLine;

#define MBLOCK_MAP_LINE(p)  (((StgWord)p & 0xffffffff) >> (MBLOCK_SHIFT + MBC_LINE_BITS))

#define MBC_LINE_SIZE  (1<<MBC_LINE_BITS)
#define MBC_SHIFT      (48 - MBLOCK_SHIFT - MBC_LINE_BITS - MBC_TAG_BITS)
#define MBC_ENTRIES    (1<<MBC_SHIFT)

extern MbcCacheLine mblock_cache[];

#define MBC_LINE(p) ((StgWord)p >> (MBLOCK_SHIFT + MBC_LINE_BITS))

#define MBLOCK_MAP_ENTRIES  (1 << (32 - MBLOCK_SHIFT - MBC_LINE_BITS))

typedef struct {
    StgWord32    addrHigh32;
    MBlockMapLine lines[MBLOCK_MAP_ENTRIES];
} MBlockMap;

extern W_ mpc_misses;

164
StgBool HEAP_ALLOCED_miss(StgWord mblock, const void *p);
165 166

INLINE_HEADER
167
StgBool HEAP_ALLOCED(const void *p)
168 169
{
    StgWord mblock;
170
    uint32_t entry_no;
171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194
    MbcCacheLine entry, value;

    mblock   = (StgWord)p >> MBLOCK_SHIFT;
    entry_no = mblock & (MBC_ENTRIES-1);
    entry    = mblock_cache[entry_no];
    value    = entry ^ (mblock << 1);
    // this formulation coaxes gcc into prioritising the value==1
    // case, which we expect to be the most common.
    // __builtin_expect() didn't have any useful effect (gcc-4.3.0).
    if (value == 1) {
        return 1;
    } else if (value == 0) {
        return 0;
    } else {
        // putting the rest out of line turned out to be a slight
        // performance improvement:
        return HEAP_ALLOCED_miss(mblock,p);
    }
}

// In the parallel GC, the cache itself is safe to *read*, and can be
// updated atomically, but we need to place a lock around operations
// that touch the MBlock map.
INLINE_HEADER
195
StgBool HEAP_ALLOCED_GC(const void *p)
196 197
{
    StgWord mblock;
198
    uint32_t entry_no;
199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224
    MbcCacheLine entry, value;
    StgBool b;

    mblock   = (StgWord)p >> MBLOCK_SHIFT;
    entry_no = mblock & (MBC_ENTRIES-1);
    entry    = mblock_cache[entry_no];
    value    = entry ^ (mblock << 1);
    if (value == 1) {
        return 1;
    } else if (value == 0) {
        return 0;
    } else {
        // putting the rest out of line turned out to be a slight
        // performance improvement:
        ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
        b = HEAP_ALLOCED_miss(mblock,p);
        RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
        return b;
    }
}

#else
# error HEAP_ALLOCED not defined
#endif

#include "EndPrivate.h"