Commit 1b62aece authored by Simon Marlow

Partial fix for #2917

 - add newAlignedPinnedByteArray# for allocating pinned BAs with
   arbitrary alignment

 - the old newPinnedByteArray# now aligns to 16 bytes

Foreign.alloca will use newAlignedPinnedByteArray#, and so might end
up wasting less space than before (we used to align to 8 by default).
Foreign.allocaBytes and Foreign.mallocForeignPtrBytes will get 16-byte
aligned memory, which is enough to avoid problems with SSE
instructions on x86, for example.

There was a bug in the old newPinnedByteArray#: it aligned to 8 bytes,
but would have failed if the header was not a multiple of 8
(fortunately it always was, even with profiling).  Also we
occasionally wasted some space unnecessarily due to alignment in
allocatePinned().

I haven't done anything about Foreign.malloc/mallocBytes, which will
give you the same alignment guarantees as malloc() (8 bytes on
Linux/x86 here).
parent a0b4833f
......@@ -814,6 +814,11 @@ primop NewPinnedByteArrayOp_Char "newPinnedByteArray#" GenPrimOp
{Create a mutable byte array that the GC guarantees not to move.}
with out_of_line = True
primop NewAlignedPinnedByteArrayOp_Char "newAlignedPinnedByteArray#" GenPrimOp
Int# -> Int# -> State# s -> (# State# s, MutableByteArray# s #)
{Create a mutable byte array, aligned by the specified amount, that the GC guarantees not to move.}
with out_of_line = True
primop ByteArrayContents_Char "byteArrayContents#" GenPrimOp
ByteArray# -> Addr#
{Intended for use with pinned arrays; otherwise very unsafe!}
......
......@@ -540,6 +540,7 @@ RTS_FUN(word64ToIntegerzh_fast);
RTS_FUN(unsafeThawArrayzh_fast);
RTS_FUN(newByteArrayzh_fast);
RTS_FUN(newPinnedByteArrayzh_fast);
RTS_FUN(newAlignedPinnedByteArrayzh_fast);
RTS_FUN(newArrayzh_fast);
RTS_FUN(newMutVarzh_fast);
......
......@@ -692,6 +692,7 @@ typedef struct _RtsSymbolVal {
SymI_HasProto(noDuplicatezh_fast) \
SymI_HasProto(atomicModifyMutVarzh_fast) \
SymI_HasProto(newPinnedByteArrayzh_fast) \
SymI_HasProto(newAlignedPinnedByteArrayzh_fast) \
SymI_HasProto(newSpark) \
SymI_HasProto(orIntegerzh_fast) \
SymI_HasProto(performGC) \
......
......@@ -83,6 +83,9 @@ newByteArrayzh_fast
RET_P(p);
}
#define BA_ALIGN 16
#define BA_MASK (BA_ALIGN-1)
// Allocate a pinned (GC-immovable) MutableByteArray# whose payload is
// aligned to BA_ALIGN (16) bytes.  R1 holds the requested size in bytes.
newPinnedByteArrayzh_fast
{
W_ words, payload_words, n, p;
......@@ -91,24 +94,45 @@ newPinnedByteArrayzh_fast
n = R1;
payload_words = ROUNDUP_BYTES_TO_WDS(n);
// We want a 16-byte aligned payload.  allocatePinned() gives us
// 8-byte aligned memory by default, but we want to align the *goods*
// inside the ArrWords object, so over-allocate by enough slack words
// to be able to slide the header forward to an aligned position.
words = payload_words + ((SIZEOF_StgArrWords + BA_MASK) & ~BA_MASK);
("ptr" p) = foreign "C" allocatePinned(words) [];
TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
// This bumps p forwards so that the payload (the data following the
// StgArrWords header) falls on a BA_ALIGN-byte boundary.
p = p + ((-p - SIZEOF_StgArrWords) & BA_MASK);
SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
StgArrWords_words(p) = payload_words;
RET_P(p);
}
// Allocate a pinned (GC-immovable) MutableByteArray# whose payload is
// aligned to the byte alignment given in R2.  R1 holds the requested
// size in bytes.  Alignments no larger than a word come for free.
//
// NOTE(review): the scraped diff had the removed old 8-byte-alignment
// code interleaved here (a second, overwritten assignment to `words`
// and a second bump of `p`), which would have mis-aligned the payload;
// only the mask-based logic is kept.
newAlignedPinnedByteArrayzh_fast
{
    W_ words, payload_words, n, p, mask;

    MAYBE_GC(NO_PTRS,newAlignedPinnedByteArrayzh_fast);
    n = R1;

    // Alignment mask: align-1 for alignments larger than a word,
    // otherwise 0 (word alignment needs no adjustment).
    if (R2 > SIZEOF_W) {
        mask = R2 - 1;
    } else {
        mask = 0;
    }

    payload_words = ROUNDUP_BYTES_TO_WDS(n);

    // We want an <align>-byte aligned array. allocatePinned() gives us
    // 8-byte aligned memory by default, but we want to align the
    // *goods* inside the ArrWords object, so we have to check the
    // size of the ArrWords header and adjust our size accordingly.
    words = payload_words + ((SIZEOF_StgArrWords + mask) & ~mask);

    ("ptr" p) = foreign "C" allocatePinned(words) [];
    TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);

    // This bumps p forwards so that the payload falls on an R2-byte boundary.
    p = p + ((-p - SIZEOF_StgArrWords) & mask);

    SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
    StgArrWords_words(p) = payload_words;
    RET_P(p);
}
......
......@@ -490,6 +490,10 @@ update_fwd_large( bdescr *bd )
for (; bd != NULL; bd = bd->link) {
// Nothing to do in a pinned block: it might not even have an object
// at the beginning (pinned blocks are filled with many pointer-free
// objects, so there is nothing to forward).
if (bd->flags & BF_PINNED) continue;
p = bd->start;
info = get_itbl((StgClosure *)p);
......
......@@ -248,10 +248,6 @@ evacuate_large(StgPtr p)
stp = bd->step;
ACQUIRE_SPIN_LOCK(&stp->sync_large_objects);

// NOTE(review): the old assertion that a large object must sit at the
// beginning of its block (or be a ByteArray) was removed by this
// change: a pinned block may contain many objects, none of which need
// be at the block start (see the matching comment in update_fwd_large).

// already evacuated?
if (bd->flags & BF_EVACUATED) {
/* Don't forget to set the gct->failed_to_evac flag if we didn't get
......@@ -287,11 +283,23 @@ evacuate_large(StgPtr p)
}
ws = &gct->steps[new_stp->abs_no];
bd->flags |= BF_EVACUATED;
bd->step = new_stp;
bd->gen_no = new_stp->gen_no;

// NOTE(review): the scraped diff also showed the removed old code that
// pushed bd onto todo_large_objects unconditionally *before* this
// if/else, which would have double-linked the block; only the
// conditional version is kept.
//
// If this is a block of pinned objects, we don't have to scan
// these objects, because they aren't allowed to contain any
// pointers. For these blocks, we skip the scavenge stage and put
// them straight on the scavenged_large_objects list.
if (bd->flags & BF_PINNED) {
    ASSERT(get_itbl((StgClosure *)p)->type == ARR_WORDS);
    dbl_link_onto(bd, &ws->step->scavenged_large_objects);
    ws->step->n_scavenged_large_blocks += bd->blocks;
} else {
    bd->link = ws->todo_large_objects;
    ws->todo_large_objects = bd;
}
RELEASE_SPIN_LOCK(&stp->sync_large_objects);
}
......
......@@ -808,7 +808,9 @@ allocatePinned( lnat n )
// If the request is for a large object, then allocate()
// will give us a pinned object anyway.
if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) {
p = allocate(n);
// Mark the block pinned so the GC can treat it specially:
// pinned blocks are skipped by update_fwd_large and go straight
// onto the scavenged_large_objects list in evacuate_large, since
// pinned objects are pointer-free ARR_WORDS.
Bdescr(p)->flags |= BF_PINNED;
return p;
}
ACQUIRE_SM_LOCK;
......@@ -816,13 +818,6 @@ allocatePinned( lnat n )
TICK_ALLOC_HEAP_NOCTR(n);
CCS_ALLOC(CCCS,n);

// NOTE(review): allocatePinned() no longer rounds n up to keep bd->free
// 8-byte aligned — the scraped diff showed that removed rounding code
// inline here.  Payload alignment is now handled entirely by the
// callers (newPinnedByteArrayzh_fast / newAlignedPinnedByteArrayzh_fast),
// which avoids occasionally wasting space here (see commit message).

// If we don't have a block of pinned objects yet, or the current
// one isn't large enough to hold the new object, allocate a new one.
if (bd == NULL || (bd->free + n) > (bd->start + BLOCK_SIZE_W)) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.