Skip to content
Snippets Groups Projects
Commit 28b031c5 authored by Austin Seipp's avatar Austin Seipp
Browse files

Refactor GCTDecl.h, and mitigate #7602 a bit


This basically cleans a lot of GCTDecl up - I found it quite hard to
read and a bit confusing. The changes are mostly cosmetic: better
delineation between the alternative cases and light touchups, and tries
to make every branch as consistent as possible.

However, this patch does have one significant effect: it will ensure
that any LLVM-based compilers will use __thread if they support it.
Before, they would simply always use pthread_getspecific and
pthread_setspecific, which are almost surely even *more* inefficient.

The details are a bit too long and boring to go into here; see #7602.
After talking with Simon, we decided to play it safe - __thread can at
least be optimized by future clang releases even further on OS X if they
choose, and it's safer until we can investigate the pthread
implementation further on Mavericks.

For Linux, the story isn't so bleak if you use Clang (for whatever
reason) - Linux directly writes to `%fs` for __thread slots (while OS X
will perform a load followed by an indirect call.) So it should still be
fairly competitive, speed-wise.

Signed-off-by: default avatarAustin Seipp <austin@well-typed.com>
parent f9652e22
No related branches found
No related tags found
No related merge requests found
/* ----------------------------------------------------------------------------- /* -----------------------------------------------------------------------------
* *
* (c) The GHC Team 1998-2009 * (c) The GHC Team 1998-2014
* *
* Documentation on the architecture of the Garbage Collector can be * Documentation on the architecture of the Garbage Collector can be
* found in the online commentary: * found in the online commentary:
* *
* http://ghc.haskell.org/trac/ghc/wiki/Commentary/Rts/Storage/GC * http://ghc.haskell.org/trac/ghc/wiki/Commentary/Rts/Storage/GC
* *
* ---------------------------------------------------------------------------*/ * ---------------------------------------------------------------------------*/
...@@ -14,94 +14,128 @@ ...@@ -14,94 +14,128 @@
#include "BeginPrivate.h" #include "BeginPrivate.h"
/* ----------------------------------------------------------------------------- /* The gct variable is thread-local and points to the current thread's
The gct variable is thread-local and points to the current thread's gc_thread structure. It is heavily accessed, and thus high
gc_thread structure. It is heavily accessed, so we try to put gct performance access is crucial to parallel (-threaded) workloads.
into a global register variable if possible; if we don't have a
register then use gcc's __thread extension to create a thread-local First, we try to use a 'global register variable' which is a GCC
variable. extension. This reserves the register globally.
-------------------------------------------------------------------------- */
If that's not possible, then we need to use __thread, which is a
compiler/OS specific TLS storage mechanism (assumed to be Fast
Enough.)
BUT, some older versions of OS X compilers (llvm-gcc, older Clangs)
do not support __thread at all. Modern clang however, does - but on
OS X it's not as fast as the Linux (which can write directly into a
segment register - see #7602.)
If we don't support __thread then we do the absolute worst thing:
we just use pthread_getspecific and pthread_setspecific (which are
horribly slow.)
*/
#if defined(THREADED_RTS) #define GCT_REG_DECL(type,name,reg) register type name REG(reg);
#define GLOBAL_REG_DECL(type,name,reg) register type name REG(reg);
#ifdef llvm_CC_FLAVOR /* -------------------------------------------------------------------------- */
/* First: if we're not using the threaded RTS, it's easy: just fake it. */
#if !defined(THREADED_RTS)
extern StgWord8 the_gc_thread[];
#define gct ((gc_thread*)&the_gc_thread)
#define SET_GCT(to) /*nothing*/
#define DECLARE_GCT /*nothing*/
#else /* defined(THREADED_RTS) */
/* -------------------------------------------------------------------------- */
/* Now, llvm-gcc and some older Clang compilers do not support
__thread. So we have to fallback to the extremely slow case,
unfortunately. Note: clang_CC_FLAVOR implies llvm_CC_FLAVOR */
#if defined(llvm_CC_FLAVOR) && (CC_SUPPORTS_TLS == 0)
#define gct ((gc_thread *)(pthread_getspecific(gctKey)))
#define SET_GCT(to) (pthread_setspecific(gctKey, to)) #define SET_GCT(to) (pthread_setspecific(gctKey, to))
#else #define DECLARE_GCT ThreadLocalKey gctKey;
#define SET_GCT(to) gct = (to)
#endif
/* -------------------------------------------------------------------------- */
/* However, if we *are* using an LLVM based compiler with __thread
support, then use that (since LLVM doesn't support global register
variables.) */
#elif defined(llvm_CC_FLAVOR) && (CC_SUPPORTS_TLS == 1)
extern __thread gc_thread* gct;
#define SET_GCT(to) gct = (to)
#define DECLARE_GCT __thread gc_thread* gct;
#if (defined(i386_HOST_ARCH) && defined(linux_HOST_OS)) /* -------------------------------------------------------------------------- */
// Using __thread is better than stealing a register on x86/Linux, because
// we have too few registers available. In my tests it was worth
// about 5% in GC performance, but of course that might change as gcc
// improves. -- SDM 2009/04/03
//
// For MacOSX, we can use an llvm-based C compiler which will store the gct
// in a thread local variable using pthreads.
/* Next up: Using __thread is better than stealing a register on
x86/Linux, because we have too few registers available. In my
tests it was worth about 5% in GC performance, but of course that
might change as gcc improves. -- SDM 2009/04/03 */
#elif (defined(i386_HOST_ARCH) && defined(linux_HOST_OS))
extern __thread gc_thread* gct; extern __thread gc_thread* gct;
#define SET_GCT(to) gct = (to)
#define DECLARE_GCT __thread gc_thread* gct; #define DECLARE_GCT __thread gc_thread* gct;
#elif defined(llvm_CC_FLAVOR) /* -------------------------------------------------------------------------- */
// LLVM does not support the __thread extension and will generate
// incorrect code for global register variables. If we are compiling
// with a C compiler that uses an LLVM back end (clang or llvm-gcc) then we
// use pthread_getspecific() to handle the thread local storage for gct.
#define gct ((gc_thread *)(pthread_getspecific(gctKey)))
#define DECLARE_GCT ThreadLocalKey gctKey;
#elif defined(sparc_HOST_ARCH) /* Next up: On SPARC we can't pin gct to a register. Names like %l1
// On SPARC we can't pin gct to a register. Names like %l1 are just offsets are just offsets into the register window, which change on each
// into the register window, which change on each function call. function call.
//
// There are eight global (non-window) registers, but they're used for other purposes.
// %g0 -- always zero
// %g1 -- volatile over function calls, used by the linker
// %g2-%g3 -- used as scratch regs by the C compiler (caller saves)
// %g4 -- volatile over function calls, used by the linker
// %g5-%g7 -- reserved by the OS
There are eight global (non-window) registers, but they're used for other
purposes:
%g0 -- always zero
%g1 -- volatile over function calls, used by the linker
%g2-%g3 -- used as scratch regs by the C compiler (caller saves)
%g4 -- volatile over function calls, used by the linker
%g5-%g7 -- reserved by the OS
*/
#elif defined(sparc_HOST_ARCH)
extern __thread gc_thread* gct; extern __thread gc_thread* gct;
#define SET_GCT(to) gct = (to)
#define DECLARE_GCT __thread gc_thread* gct; #define DECLARE_GCT __thread gc_thread* gct;
/* -------------------------------------------------------------------------- */
/* Next up: generally, if REG_Base is defined and we're *not* using
i386, then actually declare the needed register. The catch for i386
here is that REG_Base is %ebx, but that is also used for -fPIC, so
it can't be stolen */
#elif defined(REG_Base) && !defined(i386_HOST_ARCH) #elif defined(REG_Base) && !defined(i386_HOST_ARCH)
// on i386, REG_Base is %ebx which is also used for PIC, so we don't GCT_REG_DECL(gc_thread*, gct, REG_Base);
// want to steal it #define SET_GCT(to) gct = (to)
GLOBAL_REG_DECL(gc_thread*, gct, REG_Base)
#define DECLARE_GCT /* nothing */ #define DECLARE_GCT /* nothing */
/* -------------------------------------------------------------------------- */
/* Next up: if REG_R1 is available after checking REG_Base, we're
gonna steal it in every case we can. */
#elif defined(REG_R1) #elif defined(REG_R1)
GCT_REG_DECL(gc_thread*, gct, REG_R1);
GLOBAL_REG_DECL(gc_thread*, gct, REG_R1) #define SET_GCT(to) gct = (to)
#define DECLARE_GCT /* nothing */ #define DECLARE_GCT /* nothing */
/* -------------------------------------------------------------------------- */
#elif defined(__GNUC__) /* Finally, as an absolute fallback, if none of the above tests check
out but we *do* have __thread support, then use that. */
#elif CC_SUPPORTS_TLS == 1
extern __thread gc_thread* gct; extern __thread gc_thread* gct;
#define SET_GCT(to) gct = (to)
#define DECLARE_GCT __thread gc_thread* gct; #define DECLARE_GCT __thread gc_thread* gct;
#else /* -------------------------------------------------------------------------- */
#error Cannot find a way to declare the thread-local gct
/* Impossible! */
#else
#error Cannot find a way to declare the thread-local gc variable!
#endif #endif
#else // not the threaded RTS
extern StgWord8 the_gc_thread[];
#define gct ((gc_thread*)&the_gc_thread)
#define SET_GCT(to) /*nothing*/
#define DECLARE_GCT /*nothing*/
#endif // THREADED_RTS #endif // THREADED_RTS
#include "EndPrivate.h" #include "EndPrivate.h"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment