Adjustor.c 48.2 KB
Newer Older
1 2 3 4 5 6
/* -----------------------------------------------------------------------------
 * Foreign export adjustor thunks
 *
 * Copyright (c) 1998.
 *
 * ---------------------------------------------------------------------------*/
sof's avatar
sof committed
7

8
/* A little bit of background...
9
   
10
An adjustor thunk is a dynamically allocated code snippet that allows
sof's avatar
sof committed
11 12 13
Haskell closures to be viewed as C function pointers. 

Stable pointers provide a way for the outside world to get access to,
14 15
and evaluate, Haskell heap objects, with the RTS providing a small
range of ops for doing so. So, assuming we've got a stable pointer in
sof's avatar
sof committed
16 17 18 19 20 21
our hand in C, we can jump into the Haskell world and evaluate a callback
procedure, say. This works OK in some cases where callbacks are used, but
does require the external code to know about stable pointers and how to deal
with them. We'd like to hide the Haskell-nature of a callback and have it
be invoked just like any other C function pointer. 

22 23 24 25 26 27
Enter adjustor thunks. An adjustor thunk is a little piece of code
that's generated on-the-fly (one per Haskell closure being exported)
that, when entered using some 'universal' calling convention (e.g., the
C calling convention on platform X), pushes an implicit stable pointer
(to the Haskell callback) before calling another (static) C function stub
which takes care of entering the Haskell code via its stable pointer.
sof's avatar
sof committed
28 29 30 31 32

An adjustor thunk is allocated on the C heap, and is called from within
Haskell just before handing out the function pointer to the Haskell (IO)
action. User code should never have to invoke it explicitly.

33
An adjustor thunk differs from a C function pointer in one respect: when
sof's avatar
sof committed
34
the code is through with it, it has to be freed in order to release Haskell
35
and C resources. Failure to do so will result in memory leaks on both the C and
sof's avatar
sof committed
36
Haskell side.
37
*/
38

39
#include "PosixSource.h"
40
#include "Rts.h"
Simon Marlow's avatar
Simon Marlow committed
41

42
#include "RtsUtils.h"
Simon Marlow's avatar
Simon Marlow committed
43
#include "Stable.h"
sof's avatar
sof committed
44

45 46
#if defined(USE_LIBFFI_FOR_ADJUSTORS)
#include "ffi.h"
47
#include <string.h>
Ian Lynagh's avatar
Ian Lynagh committed
48 49
#endif

50
#if defined(i386_HOST_ARCH)
Ian Lynagh's avatar
Ian Lynagh committed
51 52 53 54 55 56 57
extern void adjustorCode(void);
#elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
// from AdjustorAsm.s
// not declared as a function so that AIX-style
// fundescs can never get in the way.
extern void *adjustorCode;
#endif
58

Ian Lynagh's avatar
Ian Lynagh committed
59
#if defined(USE_LIBFFI_FOR_ADJUSTORS)
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
/* There are subtle differences between how libffi adjustors work on
 * different platforms, and the situation is a little complex.
 * 
 * HOW ADJUSTORS/CLOSURES WORK ON LIBFFI:
 * libffi's ffi_closure_alloc() function gives you two pointers to a closure,
 * 1. the writable pointer, and 2. the executable pointer. You write the
 * closure into the writable pointer (and ffi_prep_closure_loc() will do this
 * for you) and you execute it at the executable pointer.
 *
 * THE PROBLEM:
 * The RTS deals only with the executable pointer, but when it comes time to
 * free the closure, libffi wants the writable pointer back that it gave you
 * when you allocated it.
 *
 * On Linux we solve this problem by storing the address of the writable
 * mapping into itself, then returning both writable and executable pointers
 * plus 1 machine word for preparing the closure for use by the RTS (see the
 * Linux version of allocateExec() in rts/sm/Storage.c). When we want to
 * recover the writable address, we subtract 1 word from the executable
 * address and fetch. This works because Linux kernel magic gives us two
 * pointers with different addresses that refer to the same memory. Whatever
 * you write into the writeable address can be read back at the executable
 * address. This method is very efficient.
 *
 * On iOS this breaks for two reasons: 1. the two pointers do not refer to
 * the same memory (so we can't retrieve anything stored into the writable
 * pointer if we only have the exec pointer), and 2. libffi's
 * ffi_closure_alloc() assumes the pointer it has returned you is a
 * ffi_closure structure and treats it as such: It uses that memory to
 * communicate with ffi_prep_closure_loc(). On Linux by contrast
 * ffi_closure_alloc() is viewed simply as a memory allocation, and only
 * ffi_prep_closure_loc() deals in ffi_closure structures. Each of these
 * differences is enough to make the efficient way used on Linux not work on iOS.
 * Instead on iOS we use hash tables to recover the writable address from the
 * executable one. This method is conservative and would almost certainly work
 * on any platform, but on Linux it makes sense to use the faster method.
 */
97 98 99 100 101
void
freeHaskellFunctionPtr(void* ptr)
{
    ffi_closure *cl;

102 103 104
#if defined(ios_HOST_OS)
    cl = execToWritable(ptr);
#else
105
    cl = (ffi_closure*)ptr;
106
#endif
107 108 109
    freeStablePtr(cl->user_data);
    stgFree(cl->cif->arg_types);
    stgFree(cl->cif);
110
    freeExec(ptr);
111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
}

static ffi_type * char_to_ffi_type(char c)
{
    switch (c) {
    case 'v':  return &ffi_type_void;
    case 'f':  return &ffi_type_float;
    case 'd':  return &ffi_type_double;
    case 'L':  return &ffi_type_sint64;
    case 'l':  return &ffi_type_uint64;
    case 'W':  return &ffi_type_sint32;
    case 'w':  return &ffi_type_uint32;
    case 'S':  return &ffi_type_sint16;
    case 's':  return &ffi_type_uint16;
    case 'B':  return &ffi_type_sint8;
    case 'b':  return &ffi_type_uint8;
127
    case 'p':  return &ffi_type_pointer;
128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
    default:   barf("char_to_ffi_type: unknown type '%c'", c);
    }
}

void*
createAdjustor (int cconv, 
                StgStablePtr hptr,
                StgFunPtr wptr,
                char *typeString)
{
    /* Build a libffi call interface from the RTS type string
       (result type first, then one character per argument), and wrap
       it in a closure that enters wptr with hptr as user data.
       Returns the executable entry point; freed later via
       freeHaskellFunctionPtr(). */
    nat arg_count, ix;
    ffi_cif *call_iface;
    ffi_type **arg_tys;
    ffi_type *ret_ty;
    ffi_closure *closure;
    int rc, abi;
    void *code;

    arg_count = strlen(typeString) - 1;
    call_iface = stgMallocBytes(sizeof(ffi_cif), "createAdjustor");
    arg_tys = stgMallocBytes(arg_count * sizeof(ffi_type*), "createAdjustor");

    ret_ty = char_to_ffi_type(typeString[0]);
    for (ix = 0; ix < arg_count; ix++) {
        arg_tys[ix] = char_to_ffi_type(typeString[ix+1]);
    }

    /* Map the RTS calling-convention code onto a libffi ABI. */
    switch (cconv) {
#if defined(mingw32_HOST_OS) && defined(i386_HOST_ARCH)
    case 0: /* stdcall */
        abi = FFI_STDCALL;
        break;
#endif
    case 1: /* ccall */
        abi = FFI_DEFAULT_ABI;
        break;
    default:
        barf("createAdjustor: convention %d not supported on this platform", cconv);
    }

    rc = ffi_prep_cif(call_iface, abi, arg_count, ret_ty, arg_tys);
    if (rc != FFI_OK) barf("ffi_prep_cif failed: %d", rc);

    closure = allocateExec(sizeof(ffi_closure), &code);
    if (closure == NULL) {
        barf("createAdjustor: failed to allocate memory");
    }

    rc = ffi_prep_closure_loc(closure, call_iface, (void*)wptr, hptr/*userdata*/, code);
    if (rc != FFI_OK) barf("ffi_prep_closure_loc failed: %d", rc);

    return (void*)code;
}

#else // To end of file...

sof's avatar
sof committed
183 184 185 186
#if defined(_WIN32)
#include <windows.h>
#endif

187
#if defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
188 189 190
#include <string.h>
#endif

191 192 193 194 195
#ifdef LEADING_UNDERSCORE
#define UNDERSCORE "_"
#else 
#define UNDERSCORE ""
#endif
Simon Marlow's avatar
Simon Marlow committed
196 197

#if defined(x86_64_HOST_ARCH)
198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214
/* 
  Now here's something obscure for you:

  When generating an adjustor thunk that uses the C calling
  convention, we have to make sure that the thunk kicks off
  the process of jumping into Haskell with a tail jump. Why?
  Because as a result of jumping in into Haskell we may end
  up freeing the very adjustor thunk we came from using
  freeHaskellFunctionPtr(). Hence, we better not return to
  the adjustor code on our way  out, since it could by then
  point to junk.
  
  The fix is readily at hand, just include the opcodes
  for the C stack fixup code that we need to perform when
  returning in some static piece of memory and arrange
  to return to it before tail jumping from the adjustor thunk.
*/
dons's avatar
dons committed
215 216 217
/* Static continuation code for x86_64 ccall adjustors.  The adjustor
 * arranges for the Haskell call to return here (rather than into the
 * adjustor itself, which may have been freed by then — see the comment
 * above), so the stack fixup happens in this永久 resident snippet.
 * Never called directly; only its label obscure_ccall_ret_code is used. */
static void GNUC3_ATTRIBUTE(used) obscure_ccall_wrapper(void)
{
  __asm__ (
   ".globl " UNDERSCORE "obscure_ccall_ret_code\n"
   UNDERSCORE "obscure_ccall_ret_code:\n\t"
   /* pop the extra 8-byte slot the adjustor pushed */
   "addq $0x8, %rsp\n\t"
#if defined(mingw32_HOST_OS)
   /* On Win64, we had to put the original return address after the
      arg 1-4 spill slots, so now we have to move it back */
   "movq 0x20(%rsp), %rcx\n"
   "movq %rcx, (%rsp)\n"
#endif
   "ret"
  );
}
230 231 232
extern void obscure_ccall_ret_code(void);
#endif

233
#if defined(alpha_HOST_ARCH)
ken's avatar
ken committed
234
/* To get the definition of PAL_imb: */
235
# if defined(linux_HOST_OS)
236 237 238 239
#  include <asm/pal.h>
# else
#  include <machine/pal.h>
# endif
ken's avatar
ken committed
240 241
#endif

242
#if defined(ia64_HOST_ARCH)
243 244 245 246 247 248 249 250 251 252 253 254 255 256

/* Layout of a function descriptor */
typedef struct _IA64FunDesc {
    StgWord64 ip;
    StgWord64 gp;
} IA64FunDesc;

/* Allocate size_in_bytes bytes of storage inside a fresh ARR_WORDS
 * object on the Haskell heap, register a stable pointer to that object
 * (written to *stable) so the GC keeps it alive, and return the address
 * of the payload.  The caller owns the stable pointer and must free it
 * to release the storage. */
static void *
stgAllocStable(size_t size_in_bytes, StgStablePtr *stable)
{
  StgArrWords* arr;
  nat data_size_in_words, total_size_in_words;
  
  /* round up to a whole number of words */
  data_size_in_words  = ROUNDUP_BYTES_TO_WDS(size_in_bytes);
  total_size_in_words = sizeofW(StgArrWords) + data_size_in_words;
  
  /* allocate and fill it in */
  arr = (StgArrWords *)allocate(total_size_in_words);
  SET_ARR_HDR(arr, &stg_ARR_WORDS_info, CCCS, size_in_bytes);
 
  /* obtain a stable ptr */
  *stable = getStablePtr((StgPtr)arr);

  /* and return a ptr to the goods inside the array */
  return(&(arr->payload));
}
#endif

272
#if defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
273 274 275 276 277 278 279 280
/* Return-path fixup for powerpc-linux adjustors: the Haskell call
 * returns here instead of into the (possibly freed) adjustor.
 * NOTE(review): assumes the frame layout set up by the adjustor —
 * follow the back chain to pop the frame, then restore the saved LR
 * from offset 4 before returning to the original caller. */
__asm__("obscure_ccall_ret_code:\n\t"
        "lwz 1,0(1)\n\t"
        "lwz 0,4(1)\n\t"
        "mtlr 0\n\t"
        "blr");
extern void obscure_ccall_ret_code(void);
#endif

281 282
#if defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
#if !(defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS))
283 284 285 286 287 288 289

/* !!! !!! WARNING: !!! !!!
 * This structure is accessed from AdjustorAsm.s
 * Any changes here have to be mirrored in the offsets there.
 */

/* Per-closure adjustor stub for PowerPC targets.  The leading fields
 * vary by ABI; the trailing fields are read by the shared assembly in
 * AdjustorAsm.s (do NOT reorder — offsets are mirrored there). */
typedef struct AdjustorStub {
#if defined(powerpc_HOST_ARCH) && defined(darwin_HOST_OS)
    /* Presumably the encoded machine instructions of the stub
       (names match PPC mnemonics) — confirm against AdjustorAsm.s. */
    unsigned        lis;
    unsigned        ori;
    unsigned        lwz;
    unsigned        mtctr;
    unsigned        bctr;
    StgFunPtr       code;
#elif defined(powerpc64_HOST_ARCH) && defined(darwin_HOST_OS)
        /* powerpc64-darwin: just guessing that it won't use fundescs. */
    unsigned        lis;
    unsigned        ori;
    unsigned        rldimi;
    unsigned        oris;
    unsigned        ori2;
    unsigned        lwz;
    unsigned        mtctr;
    unsigned        bctr;
    StgFunPtr       code;
#else
        /* fundesc-based ABIs */
#define         FUNDESCS
    StgFunPtr       code;
    struct AdjustorStub
                    *toc;
    void            *env;
#endif
    StgStablePtr    hptr;   /* stable pointer to the Haskell callback */
    StgFunPtr       wptr;   /* static C stub that enters Haskell via hptr */
    StgInt          negative_framesize;   /* see AdjustorAsm.s for use */
    StgInt          extrawords_plus_one;  /* see AdjustorAsm.s for use */
} AdjustorStub;
321

322
#endif
323 324
#endif

325
#if defined(i386_HOST_ARCH)
326 327 328 329 330 331 332 333 334 335 336 337 338 339 340

/* !!! !!! WARNING: !!! !!!
 * This structure is accessed from AdjustorAsm.s
 * Any changes here have to be mirrored in the offsets there.
 */

/* Per-closure adjustor stub for i386.  Field offsets are mirrored in
 * AdjustorAsm.s — do not reorder or resize fields. */
typedef struct AdjustorStub {
    unsigned char   call[8];        /* 'call adjustorCode' opcode + rel32, patched in createAdjustor */
    StgStablePtr    hptr;           /* stable pointer to the Haskell callback */
    StgFunPtr       wptr;           /* static C stub that enters Haskell via hptr */
    StgInt          frame_size;     /* bytes for padding+args+dummy, 16-byte-alignment adjusted (see createAdjustor) */
    StgInt          argument_size;  /* argument block size in words, from totalArgumentSize() */
} AdjustorStub;
#endif

341
#if defined(i386_HOST_ARCH) || defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
342 343 344 345 346 347 348 349 350 351 352 353
/* Count the number of machine words the argument list described by
 * typeString occupies when pushed on the stack.  Every character of
 * the string (including the leading result character) is counted. */
static int totalArgumentSize(char *typeString)
{
    int words = 0;
    char t;

    while ((t = *typeString++) != '\0')
    {
        // Double ('d') and Int64 ('l'/'L') occupy two words on
        // 32-bit platforms; every other argument type takes one.
        if ((t == 'd' || t == 'l' || t == 'L') && sizeof(void *) == 4) {
            words += 2;
        } else {
            words += 1;
        }
    }
    return words;
}
#endif

sof's avatar
sof committed
369
void*
370
createAdjustor(int cconv, StgStablePtr hptr,
Ian Lynagh's avatar
Ian Lynagh committed
371 372
               StgFunPtr wptr,
               char *typeString
373
#if !defined(powerpc_HOST_ARCH) && !defined(powerpc64_HOST_ARCH) && !defined(x86_64_HOST_ARCH)
Ian Lynagh's avatar
Ian Lynagh committed
374
                  STG_UNUSED
375 376
#endif
              )
sof's avatar
sof committed
377
{
ken's avatar
ken committed
378
  void *adjustor = NULL;
379
  void *code = NULL;
380

ken's avatar
ken committed
381 382 383
  switch (cconv)
  {
  case 0: /* _stdcall */
384
#if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
sof's avatar
sof committed
385 386 387 388
    /* Magic constant computed by inspecting the code length of
       the following assembly language snippet
       (offset and machine code prefixed):

Ian Lynagh's avatar
Ian Lynagh committed
389 390 391 392 393 394 395
     <0>:       58                popl   %eax              # temp. remove ret addr..
     <1>:       68 fd fc fe fa    pushl  0xfafefcfd        # constant is large enough to
                                                           # hold a StgStablePtr
     <6>:       50                pushl  %eax              # put back ret. addr
     <7>:       b8 fa ef ff 00    movl   $0x00ffeffa, %eax # load up wptr
     <c>:       ff e0             jmp    %eax              # and jump to it.
                # the callee cleans up the stack
sof's avatar
sof committed
396
    */
397
    adjustor = allocateExec(14,&code);
398
    {
Ian Lynagh's avatar
Ian Lynagh committed
399 400
        unsigned char *const adj_code = (unsigned char *)adjustor;
        adj_code[0x00] = (unsigned char)0x58;  /* popl %eax  */
sof's avatar
sof committed
401

Ian Lynagh's avatar
Ian Lynagh committed
402 403
        adj_code[0x01] = (unsigned char)0x68;  /* pushl hptr (which is a dword immediate ) */
        *((StgStablePtr*)(adj_code + 0x02)) = (StgStablePtr)hptr;
sof's avatar
sof committed
404

Ian Lynagh's avatar
Ian Lynagh committed
405
        adj_code[0x06] = (unsigned char)0x50; /* pushl %eax */
sof's avatar
sof committed
406

Ian Lynagh's avatar
Ian Lynagh committed
407 408
        adj_code[0x07] = (unsigned char)0xb8; /* movl  $wptr, %eax */
        *((StgFunPtr*)(adj_code + 0x08)) = (StgFunPtr)wptr;
sof's avatar
sof committed
409

Ian Lynagh's avatar
Ian Lynagh committed
410 411
        adj_code[0x0c] = (unsigned char)0xff; /* jmp %eax */
        adj_code[0x0d] = (unsigned char)0xe0;
412
    }
ken's avatar
ken committed
413 414
#endif
    break;
sof's avatar
sof committed
415

ken's avatar
ken committed
416
  case 1: /* _ccall */
417
#if defined(i386_HOST_ARCH)
418 419
    {
        /*
420 421 422 423 424
          Most of the trickiness here is due to the need to keep the
          stack pointer 16-byte aligned (see #5250).  That means we
          can't just push another argument on the stack and call the
          wrapper, we may have to shuffle the whole argument block.

425 426
          We offload most of the work to AdjustorAsm.S.
        */
427
        AdjustorStub *adjustorStub = allocateExec(sizeof(AdjustorStub),&code);
428 429 430 431 432
        adjustor = adjustorStub;

        int sz = totalArgumentSize(typeString);
        
        adjustorStub->call[0] = 0xe8;
433
        *(long*)&adjustorStub->call[1] = ((char*)&adjustorCode) - ((char*)code + 5);
434 435
        adjustorStub->hptr = hptr;
        adjustorStub->wptr = wptr;
436 437 438 439 440 441 442 443 444 445 446 447 448 449 450
        
            // The adjustor puts the following things on the stack:
            // 1.) %ebp link
            // 2.) padding and (a copy of) the arguments
            // 3.) a dummy argument
            // 4.) hptr
            // 5.) return address (for returning to the adjustor)
            // All these have to add up to a multiple of 16. 

            // first, include everything in frame_size
        adjustorStub->frame_size = sz * 4 + 16;
            // align to 16 bytes
        adjustorStub->frame_size = (adjustorStub->frame_size + 15) & ~15;
            // only count 2.) and 3.) as part of frame_size
        adjustorStub->frame_size -= 12; 
451 452 453
        adjustorStub->argument_size = sz;
    }
    
454
#elif defined(x86_64_HOST_ARCH)
Ian Lynagh's avatar
Ian Lynagh committed
455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498

# if defined(mingw32_HOST_OS)
    /*
      stack at call:
               argn
               ...
               arg5
               return address
               %rcx,%rdx,%r8,%r9 = arg1..arg4

      if there are <4 integer args, then we can just push the
      StablePtr into %rcx and shuffle the other args up.

      If there are >=4 integer args, then we have to flush one arg
      to the stack, and arrange to adjust the stack ptr on return.
      The stack will be rearranged to this:

             argn
             ...
             arg5
             return address  *** <-- dummy arg in stub fn.
             arg4
             obscure_ccall_ret_code

      This unfortunately means that the type of the stub function
      must have a dummy argument for the original return address
      pointer inserted just after the 4th integer argument.

      Code for the simple case:

   0:   4d 89 c1                mov    %r8,%r9
   3:   49 89 d0                mov    %rdx,%r8
   6:   48 89 ca                mov    %rcx,%rdx
   9:   f2 0f 10 da             movsd  %xmm2,%xmm3
   d:   f2 0f 10 d1             movsd  %xmm1,%xmm2
  11:   f2 0f 10 c8             movsd  %xmm0,%xmm1
  15:   48 8b 0d 0c 00 00 00    mov    0xc(%rip),%rcx    # 28 <.text+0x28>
  1c:   ff 25 0e 00 00 00       jmpq   *0xe(%rip)        # 30 <.text+0x30>
  22:   90                      nop
  [...]


  And the version for >=4 integer arguments:

Ian Lynagh's avatar
Ian Lynagh committed
499 500 501 502 503 504 505
[we want to push the 4th argument (either %r9 or %xmm3, depending on
 whether it is a floating arg or not) and the return address onto the
 stack. However, slots 1-4 are reserved for code we call to spill its
 args 1-4 into, so we can't just push them onto the bottom of the stack.
 So first put the 4th argument onto the stack, above what will be the
 spill slots.]
   0:   48 83 ec 08             sub    $0x8,%rsp
506 507 508 509 510 511
[if non-floating arg, then do this:]
   4:   90                      nop
   5:   4c 89 4c 24 20          mov    %r9,0x20(%rsp)
[else if floating arg then do this:]
   4:   f2 0f 11 5c 24 20       movsd  %xmm3,0x20(%rsp)
[end if]
Ian Lynagh's avatar
Ian Lynagh committed
512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530
[Now push the new return address onto the stack]
   a:   ff 35 30 00 00 00       pushq  0x30(%rip)        # 40 <.text+0x40>
[But the old return address has been moved up into a spill slot, so
 we need to move it above them]
  10:   4c 8b 4c 24 10          mov    0x10(%rsp),%r9
  15:   4c 89 4c 24 30          mov    %r9,0x30(%rsp)
[Now we do the normal register shuffle-up etc]
  1a:   4d 89 c1                mov    %r8,%r9
  1d:   49 89 d0                mov    %rdx,%r8
  20:   48 89 ca                mov    %rcx,%rdx
  23:   f2 0f 10 da             movsd  %xmm2,%xmm3
  27:   f2 0f 10 d1             movsd  %xmm1,%xmm2
  2b:   f2 0f 10 c8             movsd  %xmm0,%xmm1
  2f:   48 8b 0d 12 00 00 00    mov    0x12(%rip),%rcx        # 48 <.text+0x48>
  36:   ff 25 14 00 00 00       jmpq   *0x14(%rip)        # 50 <.text+0x50>
  3c:   90                      nop
  3d:   90                      nop
  3e:   90                      nop
  3f:   90                      nop
Ian Lynagh's avatar
Ian Lynagh committed
531 532 533 534 535 536 537 538
  [...]

    */
    {  
        StgWord8 *adj_code;

        // determine whether we have 4 or more integer arguments,
        // and therefore need to flush one to the stack.
ian@well-typed.com's avatar
ian@well-typed.com committed
539 540 541 542
        if ((typeString[0] == '\0') ||
            (typeString[1] == '\0') ||
            (typeString[2] == '\0') ||
            (typeString[3] == '\0')) {
Ian Lynagh's avatar
Ian Lynagh committed
543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561

            adjustor = allocateExec(0x38,&code);
            adj_code = (StgWord8*)adjustor;

            *(StgInt32 *)adj_code        = 0x49c1894d;
            *(StgInt32 *)(adj_code+0x4)  = 0x8948d089;
            *(StgInt32 *)(adj_code+0x8)  = 0x100ff2ca;
            *(StgInt32 *)(adj_code+0xc)  = 0x100ff2da;
            *(StgInt32 *)(adj_code+0x10) = 0x100ff2d1;
            *(StgInt32 *)(adj_code+0x14) = 0x0d8b48c8;
            *(StgInt32 *)(adj_code+0x18) = 0x0000000c;

            *(StgInt32 *)(adj_code+0x1c) = 0x000e25ff;
            *(StgInt32 *)(adj_code+0x20) = 0x00000000;
            *(StgInt64 *)(adj_code+0x28) = (StgInt64)hptr;
            *(StgInt64 *)(adj_code+0x30) = (StgInt64)wptr;
        }
        else
        {
ian@well-typed.com's avatar
ian@well-typed.com committed
562 563 564
            int fourthFloating;

            fourthFloating = (typeString[3] == 'f' || typeString[3] == 'd');
Ian Lynagh's avatar
Ian Lynagh committed
565
            adjustor = allocateExec(0x58,&code);
Ian Lynagh's avatar
Ian Lynagh committed
566
            adj_code = (StgWord8*)adjustor;
567 568 569 570
            *(StgInt32 *)adj_code        = 0x08ec8348;
            *(StgInt32 *)(adj_code+0x4)  = fourthFloating ? 0x5c110ff2
                                                          : 0x4c894c90;
            *(StgInt32 *)(adj_code+0x8)  = 0x35ff2024;
Ian Lynagh's avatar
Ian Lynagh committed
571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586
            *(StgInt32 *)(adj_code+0xc)  = 0x00000030;
            *(StgInt32 *)(adj_code+0x10) = 0x244c8b4c;
            *(StgInt32 *)(adj_code+0x14) = 0x4c894c10;
            *(StgInt32 *)(adj_code+0x18) = 0x894d3024;
            *(StgInt32 *)(adj_code+0x1c) = 0xd08949c1;
            *(StgInt32 *)(adj_code+0x20) = 0xf2ca8948;
            *(StgInt32 *)(adj_code+0x24) = 0xf2da100f;
            *(StgInt32 *)(adj_code+0x28) = 0xf2d1100f;
            *(StgInt32 *)(adj_code+0x2c) = 0x48c8100f;
            *(StgInt32 *)(adj_code+0x30) = 0x00120d8b;
            *(StgInt32 *)(adj_code+0x34) = 0x25ff0000;
            *(StgInt32 *)(adj_code+0x38) = 0x00000014;
            *(StgInt32 *)(adj_code+0x3c) = 0x90909090;
            *(StgInt64 *)(adj_code+0x40) = (StgInt64)obscure_ccall_ret_code;
            *(StgInt64 *)(adj_code+0x48) = (StgInt64)hptr;
            *(StgInt64 *)(adj_code+0x50) = (StgInt64)wptr;
Ian Lynagh's avatar
Ian Lynagh committed
587 588
        }
    }
Ian Lynagh's avatar
Ian Lynagh committed
589

Ian Lynagh's avatar
Ian Lynagh committed
590
# else
591 592 593
    /*
      stack at call:
               argn
Ian Lynagh's avatar
Ian Lynagh committed
594 595
               ...
               arg7
596
               return address
Ian Lynagh's avatar
Ian Lynagh committed
597
               %rdi,%rsi,%rdx,%rcx,%r8,%r9 = arg1..arg6
598 599 600 601 602 603 604 605 606

      if there are <6 integer args, then we can just push the
      StablePtr into %edi and shuffle the other args up.

      If there are >=6 integer args, then we have to flush one arg
      to the stack, and arrange to adjust the stack ptr on return.
      The stack will be rearranged to this:

             argn
Ian Lynagh's avatar
Ian Lynagh committed
607 608 609 610 611
             ...
             arg7
             return address  *** <-- dummy arg in stub fn.
             arg6
             obscure_ccall_ret_code
612 613 614 615 616 617 618 619 620 621 622 623 624

      This unfortunately means that the type of the stub function
      must have a dummy argument for the original return address
      pointer inserted just after the 6th integer argument.

      Code for the simple case:

   0:   4d 89 c1                mov    %r8,%r9
   3:   49 89 c8                mov    %rcx,%r8
   6:   48 89 d1                mov    %rdx,%rcx
   9:   48 89 f2                mov    %rsi,%rdx
   c:   48 89 fe                mov    %rdi,%rsi
   f:   48 8b 3d 0a 00 00 00    mov    10(%rip),%rdi
625
  16:   ff 25 0c 00 00 00       jmpq   *12(%rip)
626 627
  ... 
  20: .quad 0  # aligned on 8-byte boundary
628
  28: .quad 0  # aligned on 8-byte boundary
629 630 631 632 633


  And the version for >=6 integer arguments:

   0:   41 51                   push   %r9
634 635 636 637 638 639 640 641
   2:   ff 35 20 00 00 00       pushq  32(%rip)        # 28 <ccall_adjustor+0x28>
   8:   4d 89 c1                mov    %r8,%r9
   b:   49 89 c8                mov    %rcx,%r8
   e:   48 89 d1                mov    %rdx,%rcx
  11:   48 89 f2                mov    %rsi,%rdx
  14:   48 89 fe                mov    %rdi,%rsi
  17:   48 8b 3d 12 00 00 00    mov    18(%rip),%rdi        # 30 <ccall_adjustor+0x30>
  1e:   ff 25 14 00 00 00       jmpq   *20(%rip)        # 38 <ccall_adjustor+0x38>
642 643
  ...
  28: .quad 0  # aligned on 8-byte boundary
644 645
  30: .quad 0  # aligned on 8-byte boundary
  38: .quad 0  # aligned on 8-byte boundary
646 647 648
    */

    {  
Ian Lynagh's avatar
Ian Lynagh committed
649 650 651 652 653 654 655 656 657 658 659 660 661
        int i = 0;
        char *c;
        StgWord8 *adj_code;

        // determine whether we have 6 or more integer arguments,
        // and therefore need to flush one to the stack.
        for (c = typeString; *c != '\0'; c++) {
            if (*c != 'f' && *c != 'd') i++;
            if (i == 6) break;
        }

        if (i < 6) {
            adjustor = allocateExec(0x30,&code);
Simon Marlow's avatar
Simon Marlow committed
662 663
            adj_code = (StgWord8*)adjustor;

Ian Lynagh's avatar
Ian Lynagh committed
664 665 666 667 668 669 670 671 672 673 674 675 676
            *(StgInt32 *)adj_code        = 0x49c1894d;
            *(StgInt32 *)(adj_code+0x4)  = 0x8948c889;
            *(StgInt32 *)(adj_code+0x8)  = 0xf28948d1;
            *(StgInt32 *)(adj_code+0xc)  = 0x48fe8948;
            *(StgInt32 *)(adj_code+0x10) = 0x000a3d8b;
            *(StgInt32 *)(adj_code+0x14) = 0x25ff0000;
            *(StgInt32 *)(adj_code+0x18) = 0x0000000c;
            *(StgInt64 *)(adj_code+0x20) = (StgInt64)hptr;
            *(StgInt64 *)(adj_code+0x28) = (StgInt64)wptr;
        }
        else
        {
            adjustor = allocateExec(0x40,&code);
Simon Marlow's avatar
Simon Marlow committed
677 678
            adj_code = (StgWord8*)adjustor;

Ian Lynagh's avatar
Ian Lynagh committed
679 680 681 682 683 684 685 686 687 688 689 690 691 692
            *(StgInt32 *)adj_code        = 0x35ff5141;
            *(StgInt32 *)(adj_code+0x4)  = 0x00000020;
            *(StgInt32 *)(adj_code+0x8)  = 0x49c1894d;
            *(StgInt32 *)(adj_code+0xc)  = 0x8948c889;
            *(StgInt32 *)(adj_code+0x10) = 0xf28948d1;
            *(StgInt32 *)(adj_code+0x14) = 0x48fe8948;
            *(StgInt32 *)(adj_code+0x18) = 0x00123d8b;
            *(StgInt32 *)(adj_code+0x1c) = 0x25ff0000;
            *(StgInt32 *)(adj_code+0x20) = 0x00000014;
            
            *(StgInt64 *)(adj_code+0x28) = (StgInt64)obscure_ccall_ret_code;
            *(StgInt64 *)(adj_code+0x30) = (StgInt64)hptr;
            *(StgInt64 *)(adj_code+0x38) = (StgInt64)wptr;
        }
693
    }
Ian Lynagh's avatar
Ian Lynagh committed
694 695 696
# endif


697
#elif defined(sparc_HOST_ARCH)
698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725
  /* Magic constant computed by inspecting the code length of the following
     assembly language snippet (offset and machine code prefixed):

     <00>: 9C23A008   sub   %sp, 8, %sp         ! make room for %o4/%o5 in caller's frame
     <04>: DA23A060   st    %o5, [%sp + 96]     ! shift registers by 2 positions
     <08>: D823A05C   st    %o4, [%sp + 92]
     <0C>: 9A10000B   mov   %o3, %o5
     <10>: 9810000A   mov   %o2, %o4
     <14>: 96100009   mov   %o1, %o3
     <18>: 94100008   mov   %o0, %o2
     <1C>: 13000000   sethi %hi(wptr), %o1      ! load up wptr (1 of 2)
     <20>: 11000000   sethi %hi(hptr), %o0      ! load up hptr (1 of 2)
     <24>: 81C26000   jmp   %o1 + %lo(wptr)     ! jump to wptr (load 2 of 2)
     <28>: 90122000   or    %o0, %lo(hptr), %o0 ! load up hptr (2 of 2, delay slot)
     <2C>  00000000                             ! place for getting hptr back easily

     ccall'ing on SPARC is easy, because we are quite lucky to push a
     multiple of 8 bytes (1 word hptr + 1 word dummy arg) in front of the
     existing arguments (note that %sp must stay double-word aligned at
     all times, see ABI spec at http://www.sparc.org/standards/psABI3rd.pdf).
     To do this, we extend the *caller's* stack frame by 2 words and shift
     the output registers used for argument passing (%o0 - %o5, we are a *leaf*
     procedure because of the tail-jump) by 2 positions. This makes room in
     %o0 and %o1 for the additional arguments, namely hptr and a dummy (used
     for destination addr of jump on SPARC, return address on x86, ...). This
     shouldn't cause any problems for a C-like caller: alloca is implemented
     similarly, and local variables should be accessed via %fp, not %sp. In a
     nutshell: This should work! (Famous last words! :-)
726
  */
727
    adjustor = allocateExec(4*(11+1),&code);
728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761
    {
        unsigned long *const adj_code = (unsigned long *)adjustor;

        adj_code[ 0]  = 0x9C23A008UL;   /* sub   %sp, 8, %sp         */
        adj_code[ 1]  = 0xDA23A060UL;   /* st    %o5, [%sp + 96]     */
        adj_code[ 2]  = 0xD823A05CUL;   /* st    %o4, [%sp + 92]     */
        adj_code[ 3]  = 0x9A10000BUL;   /* mov   %o3, %o5            */
        adj_code[ 4]  = 0x9810000AUL;   /* mov   %o2, %o4            */
        adj_code[ 5]  = 0x96100009UL;   /* mov   %o1, %o3            */
        adj_code[ 6]  = 0x94100008UL;   /* mov   %o0, %o2            */
        adj_code[ 7]  = 0x13000000UL;   /* sethi %hi(wptr), %o1      */
        adj_code[ 7] |= ((unsigned long)wptr) >> 10;
        adj_code[ 8]  = 0x11000000UL;   /* sethi %hi(hptr), %o0      */
        adj_code[ 8] |= ((unsigned long)hptr) >> 10;
        adj_code[ 9]  = 0x81C26000UL;   /* jmp   %o1 + %lo(wptr)     */
        adj_code[ 9] |= ((unsigned long)wptr) & 0x000003FFUL;
        adj_code[10]  = 0x90122000UL;   /* or    %o0, %lo(hptr), %o0 */
        adj_code[10] |= ((unsigned long)hptr) & 0x000003FFUL;

        adj_code[11]  = (unsigned long)hptr;

        /* flush cache */
        asm("flush %0" : : "r" (adj_code     ));
        asm("flush %0" : : "r" (adj_code +  2));
        asm("flush %0" : : "r" (adj_code +  4));
        asm("flush %0" : : "r" (adj_code +  6));
        asm("flush %0" : : "r" (adj_code + 10));

        /* max. 5 instructions latency, and we need at >= 1 for returning */
        asm("nop");
        asm("nop");
        asm("nop");
        asm("nop");
    }
762
#elif defined(alpha_HOST_ARCH)
ken's avatar
ken committed
763 764 765 766 767
  /* Magic constant computed by inspecting the code length of
     the following assembly language snippet
     (offset and machine code prefixed; note that the machine code
     shown is longwords stored in little-endian order):

Ian Lynagh's avatar
Ian Lynagh committed
768 769 770 771 772 773 774 775
  <00>: 46520414        mov     a2, a4
  <04>: 46100412        mov     a0, a2
  <08>: a61b0020        ldq     a0, 0x20(pv)    # load up hptr
  <0c>: 46730415        mov     a3, a5
  <10>: a77b0028        ldq     pv, 0x28(pv)    # load up wptr
  <14>: 46310413        mov     a1, a3
  <18>: 6bfb----        jmp     (pv), <hint>    # jump to wptr (with hint)
  <1c>: 00000000                                # padding for alignment
ken's avatar
ken committed
776 777
  <20>: [8 bytes for hptr quadword]
  <28>: [8 bytes for wptr quadword]
ken's avatar
ken committed
778 779 780 781 782 783

     The "computed" jump at <08> above is really a jump to a fixed
     location.  Accordingly, we place an always-correct hint in the
     jump instruction, namely the address offset from <0c> to wptr,
     divided by 4, taking the lowest 14 bits.

ken's avatar
ken committed
784
     We only support passing 4 or fewer argument words, for the same
785
     reason described under sparc_HOST_ARCH above by JRS, 21 Aug 01.
ken's avatar
ken committed
786 787 788 789 790 791 792 793 794 795 796 797
     On the Alpha the first 6 integer arguments are in a0 through a5,
     and the rest on the stack.  Hence we want to shuffle the original
     caller's arguments by two.

     On the Alpha the calling convention is so complex and dependent
     on the callee's signature -- for example, the stack pointer has
     to be a multiple of 16 -- that it seems impossible to me [ccshan]
     to handle the general case correctly without changing how the
     adjustor is called from C.  For now, our solution of shuffling
     registers only and ignoring the stack only works if the original
     caller passed 4 or fewer argument words.

ken's avatar
ken committed
798 799 800 801 802 803
TODO: Depending on how much allocation overhead stgMallocBytes uses for
      header information (more precisely, if the overhead is no more than
      4 bytes), we should move the first three instructions above down by
      4 bytes (getting rid of the nop), hence saving memory. [ccshan]
  */
    ASSERT(((StgWord64)wptr & 3) == 0);
804
    adjustor = allocateExec(48,&code);
805
    {
Ian Lynagh's avatar
Ian Lynagh committed
806
        StgWord64 *const code = (StgWord64 *)adjustor;
807

Ian Lynagh's avatar
Ian Lynagh committed
808 809 810 811 812
        code[0] = 0x4610041246520414L;
        code[1] = 0x46730415a61b0020L;
        code[2] = 0x46310413a77b0028L;
        code[3] = 0x000000006bfb0000L
                | (((StgWord32*)(wptr) - (StgWord32*)(code) - 3) & 0x3fff);
ken's avatar
ken committed
813

Ian Lynagh's avatar
Ian Lynagh committed
814 815
        code[4] = (StgWord64)hptr;
        code[5] = (StgWord64)wptr;
ken's avatar
ken committed
816

Ian Lynagh's avatar
Ian Lynagh committed
817 818
        /* Ensure that instruction cache is consistent with our new code */
        __asm__ volatile("call_pal %0" : : "i" (PAL_imb));
819
    }
820
#elif defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854

#define OP_LO(op,lo)  ((((unsigned)(op)) << 16) | (((unsigned)(lo)) & 0xFFFF))
#define OP_HI(op,hi)  ((((unsigned)(op)) << 16) | (((unsigned)(hi)) >> 16))
    {
        /* The PowerPC Linux (32-bit) calling convention is annoyingly complex.
           We need to calculate all the details of the stack frame layout,
           taking into account the types of all the arguments, and then
           generate code on the fly. */
    
        int src_gpr = 3, dst_gpr = 5;
        int fpr = 3;
        int src_offset = 0, dst_offset = 0;
        int n = strlen(typeString),i;
        int src_locs[n], dst_locs[n];
        int frameSize;
        unsigned *code;
      
            /* Step 1:
               Calculate where the arguments should go.
               src_locs[] will contain the locations of the arguments in the
               original stack frame passed to the adjustor.
               dst_locs[] will contain the locations of the arguments after the
               adjustor runs, on entry to the wrapper proc pointed to by wptr.

               This algorithm is based on the one described on page 3-19 of the
               System V ABI PowerPC Processor Supplement.
            */
        for(i=0;typeString[i];i++)
        {
            char t = typeString[i];
            if((t == 'f' || t == 'd') && fpr <= 8)
                src_locs[i] = dst_locs[i] = -32-(fpr++);
            else
            {
855
                if((t == 'l' || t == 'L') && src_gpr <= 9)
856 857 858 859 860 861
                {
                    if((src_gpr & 1) == 0)
                        src_gpr++;
                    src_locs[i] = -src_gpr;
                    src_gpr += 2;
                }
862
                else if((t == 'w' || t == 'W') && src_gpr <= 10)
863 864 865 866 867
                {
                    src_locs[i] = -(src_gpr++);
                }
                else
                {
868
                    if(t == 'l' || t == 'L' || t == 'd')
869 870 871 872 873
                    {
                        if(src_offset % 8)
                            src_offset += 4;
                    }
                    src_locs[i] = src_offset;
874
                    src_offset += (t == 'l' || t == 'L' || t == 'd') ? 8 : 4;
875 876
                }

877
                    if((t == 'l' || t == 'L') && dst_gpr <= 9)
878 879 880 881 882 883
                {
                    if((dst_gpr & 1) == 0)
                        dst_gpr++;
                    dst_locs[i] = -dst_gpr;
                    dst_gpr += 2;
                }
884
                else if((t == 'w' || t == 'W') && dst_gpr <= 10)
885 886 887 888 889
                {
                    dst_locs[i] = -(dst_gpr++);
                }
                else
                {
890
                    if(t == 'l' || t == 'L' || t == 'd')
891 892 893 894 895
                    {
                        if(dst_offset % 8)
                            dst_offset += 4;
                    }
                    dst_locs[i] = dst_offset;
896
                    dst_offset += (t == 'l' || t == 'L' || t == 'd') ? 8 : 4;
897 898 899 900 901 902 903 904 905 906 907 908
                }
            }
        }

        frameSize = dst_offset + 8;
        frameSize = (frameSize+15) & ~0xF;

            /* Step 2:
               Build the adjustor.
            */
                    // allocate space for at most 4 insns per parameter
                    // plus 14 more instructions.
909
        adjustor = allocateExec(4 * (4*n + 14),&code);
910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937
        code = (unsigned*)adjustor;
        
        *code++ = 0x48000008; // b *+8
            // * Put the hptr in a place where freeHaskellFunctionPtr
            //   can get at it.
        *code++ = (unsigned) hptr;

            // * save the link register
        *code++ = 0x7c0802a6; // mflr r0;
        *code++ = 0x90010004; // stw r0, 4(r1);
            // * and build a new stack frame
        *code++ = OP_LO(0x9421, -frameSize); // stwu r1, -frameSize(r1)

            // * now generate instructions to copy arguments
            //   from the old stack frame into the new stack frame.
        for(i=n-1;i>=0;i--)
        {
            if(src_locs[i] < -32)
                ASSERT(dst_locs[i] == src_locs[i]);
            else if(src_locs[i] < 0)
            {
                // source in GPR.
                ASSERT(typeString[i] != 'f' && typeString[i] != 'd');
                if(dst_locs[i] < 0)
                {
                    ASSERT(dst_locs[i] > -32);
                        // dst is in GPR, too.

938
                    if(typeString[i] == 'l' || typeString[i] == 'L')
939 940 941 942 943 944 945 946 947 948 949 950 951 952 953
                    {
                            // mr dst+1, src+1
                        *code++ = 0x7c000378
                                | ((-dst_locs[i]+1) << 16)
                                | ((-src_locs[i]+1) << 11)
                                | ((-src_locs[i]+1) << 21);
                    }
                    // mr dst, src
                    *code++ = 0x7c000378
                            | ((-dst_locs[i]) << 16)
                            | ((-src_locs[i]) << 11)
                            | ((-src_locs[i]) << 21);
                }
                else
                {
954
                    if(typeString[i] == 'l' || typeString[i] == 'L')
955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972
                    {
                            // stw src+1, dst_offset+4(r1)
                        *code++ = 0x90010000
                                | ((-src_locs[i]+1) << 21)
                                | (dst_locs[i] + 4);
                    }
                    
                        // stw src, dst_offset(r1)
                    *code++ = 0x90010000
                            | ((-src_locs[i]) << 21)
                            | (dst_locs[i] + 8);
                }
            }
            else
            {
                ASSERT(dst_locs[i] >= 0);
                ASSERT(typeString[i] != 'f' && typeString[i] != 'd');

973
                if(typeString[i] == 'l' || typeString[i] == 'L')
974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027
                {
                    // lwz r0, src_offset(r1)
                        *code++ = 0x80010000
                                | (src_locs[i] + frameSize + 8 + 4);
                    // stw r0, dst_offset(r1)
                        *code++ = 0x90010000
                                | (dst_locs[i] + 8 + 4);
                    }
                // lwz r0, src_offset(r1)
                    *code++ = 0x80010000
                            | (src_locs[i] + frameSize + 8);
                // stw r0, dst_offset(r1)
                    *code++ = 0x90010000
                            | (dst_locs[i] + 8);
           }
        }

            // * hptr will be the new first argument.
            // lis r3, hi(hptr)
        *code++ = OP_HI(0x3c60, hptr);
            // ori r3,r3,lo(hptr)
        *code++ = OP_LO(0x6063, hptr);

            // * we need to return to a piece of code
            //   which will tear down the stack frame.
            // lis r11,hi(obscure_ccall_ret_code)
        *code++ = OP_HI(0x3d60, obscure_ccall_ret_code);
            // ori r11,r11,lo(obscure_ccall_ret_code)
        *code++ = OP_LO(0x616b, obscure_ccall_ret_code);
            // mtlr r11
        *code++ = 0x7d6803a6;

            // * jump to wptr
            // lis r11,hi(wptr)
        *code++ = OP_HI(0x3d60, wptr);
            // ori r11,r11,lo(wptr)
        *code++ = OP_LO(0x616b, wptr);
            // mtctr r11
        *code++ = 0x7d6903a6;
            // bctr
        *code++ = 0x4e800420;

        // Flush the Instruction cache:
        {
            unsigned *p = adjustor;
            while(p < code)
            {
                __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
                                 : : "r" (p));
                p++;
            }
            __asm__ volatile ("sync\n\tisync");
        }
    }
1028

1029
#elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
1030 1031 1032
        
#define OP_LO(op,lo)  ((((unsigned)(op)) << 16) | (((unsigned)(lo)) & 0xFFFF))
#define OP_HI(op,hi)  ((((unsigned)(op)) << 16) | (((unsigned)(hi)) >> 16))
1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059
    {
        /* The following code applies to all PowerPC and PowerPC64 platforms
           whose stack layout is based on the AIX ABI.

           Besides (obviously) AIX, this includes
            Mac OS 9 and BeOS/PPC (may they rest in peace),
                which use the 32-bit AIX ABI
            powerpc64-linux,
                which uses the 64-bit AIX ABI
            and Darwin (Mac OS X),
                which uses the same stack layout as AIX,
                but no function descriptors.

           The actual stack-frame shuffling is implemented out-of-line
           in the function adjustorCode, in AdjustorAsm.S.
           Here, we set up an AdjustorStub structure, which
           is a function descriptor (on platforms that have function
           descriptors) or a short piece of stub code (on Darwin) to call
           adjustorCode with a pointer to the AdjustorStub struct loaded
           into register r2.

           One nice thing about this is that there is _no_ code generated at
           runtime on the platforms that have function descriptors.
        */
        AdjustorStub *adjustorStub;
        int sz = 0, extra_sz, total_sz;

1060
#ifdef FUNDESCS
1061
        adjustorStub = stgMallocBytes(sizeof(AdjustorStub), "createAdjustor");
1062
#else
1063
        adjustorStub = allocateExec(sizeof(AdjustorStub),&code);
1064
#endif
1065 1066 1067
        adjustor = adjustorStub;
            
        adjustorStub->code = (void*) &adjustorCode;
1068 1069

#ifdef FUNDESCS
1070 1071 1072
            // function descriptors are a cool idea.
            // We don't need to generate any code at runtime.
        adjustorStub->toc = adjustorStub;
1073 1074
#else

1075 1076
            // no function descriptors :-(
            // We need to do things "by hand".
1077
#if defined(powerpc_HOST_ARCH)
1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088
            // lis  r2, hi(adjustorStub)
        adjustorStub->lis = OP_HI(0x3c40, adjustorStub);
            // ori  r2, r2, lo(adjustorStub)
        adjustorStub->ori = OP_LO(0x6042, adjustorStub);
            // lwz r0, code(r2)
        adjustorStub->lwz = OP_LO(0x8002, (char*)(&adjustorStub->code)
                                        - (char*)adjustorStub);
            // mtctr r0
        adjustorStub->mtctr = 0x7c0903a6;
            // bctr
        adjustorStub->bctr = 0x4e800420;
1089
#else
1090
        barf("adjustor creation not supported on this platform");
1091 1092
#endif

1093 1094 1095 1096 1097
        // Flush the Instruction cache:
        {
            int n = sizeof(AdjustorStub)/sizeof(unsigned);
            unsigned *p = (unsigned*)adjustor;
            while(n--)
1098
            {
1099 1100 1101
                __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
                                    : : "r" (p));
                p++;
1102
            }
1103 1104
            __asm__ volatile ("sync\n\tisync");
        }
1105 1106
#endif

1107
            // Calculate the size of the stack frame, in words.
1108 1109
        sz = totalArgumentSize(typeString);
        
1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134
            // The first eight words of the parameter area
            // are just "backing store" for the parameters passed in
            // the GPRs. extra_sz is the number of words beyond those first
            // 8 words.
        extra_sz = sz - 8;
        if(extra_sz < 0)
            extra_sz = 0;

            // Calculate the total size of the stack frame.
        total_sz = (6 /* linkage area */
                  + 8 /* minimum parameter area */
                  + 2 /* two extra arguments */
                  + extra_sz)*sizeof(StgWord);
       
            // align to 16 bytes.
            // AIX only requires 8 bytes, but who cares?
        total_sz = (total_sz+15) & ~0xF;
       
            // Fill in the information that adjustorCode in AdjustorAsm.S
            // will use to create a new stack frame with the additional args.
        adjustorStub->hptr = hptr;
        adjustorStub->wptr = wptr;
        adjustorStub->negative_framesize = -total_sz;
        adjustorStub->extrawords_plus_one = extra_sz + 1;
    }
1135

1136
#elif defined(ia64_HOST_ARCH)
1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147
/*
    Up to 8 inputs are passed in registers.  We flush the last two inputs to
    the stack, initially into the 16-byte scratch region left by the caller.
    We then shuffle the others along by 4 (taking 2 registers for ourselves
    to save return address and previous function state - we need to come back
    here on the way out to restore the stack, so this is a real function
    rather than just a trampoline).
    
    The function descriptor we create contains the gp of the target function
    so gp is already loaded correctly.

Ian Lynagh's avatar
Ian Lynagh committed
1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170
        [MLX]       alloc r16=ar.pfs,10,2,0
                    movl r17=wptr
        [MII]       st8.spill [r12]=r38,8               // spill in6 (out4)
                    mov r41=r37                         // out7 = in5 (out3)
                    mov r40=r36;;                       // out6 = in4 (out2)
        [MII]       st8.spill [r12]=r39                 // spill in7 (out5)
                    mov.sptk b6=r17,50
                    mov r38=r34;;                       // out4 = in2 (out0)
        [MII]       mov r39=r35                         // out5 = in3 (out1)
                    mov r37=r33                         // out3 = in1 (loc1)
                    mov r36=r32                         // out2 = in0 (loc0)
        [MLX]       adds r12=-24,r12                    // update sp
                    movl r34=hptr;;                     // out0 = hptr
        [MIB]       mov r33=r16                         // loc1 = ar.pfs
                    mov r32=b0                          // loc0 = retaddr
                    br.call.sptk.many b0=b6;;

        [MII]       adds r12=-16,r12
                    mov b0=r32
                    mov.i ar.pfs=r33
        [MFB]       nop.m 0x0
                    nop.f 0x0
                    br.ret.sptk.many b0;;
1171 1172 1173
*/

/* These macros distribute a long constant into the two words of an MLX bundle */
Ian Lynagh's avatar
Ian Lynagh committed
1174 1175 1176 1177 1178 1179 1180 1181
#define BITS(val,start,count)   (((val) >> (start)) & ((1 << (count))-1))
#define MOVL_LOWORD(val)        (BITS(val,22,18) << 46)
#define MOVL_HIWORD(val)        ( (BITS(val,0,7)    << 36)      \
                                | (BITS(val,7,9)    << 50)      \
                                | (BITS(val,16,5)   << 45)      \
                                | (BITS(val,21,1)   << 44)      \
                                | (BITS(val,40,23))             \
                                | (BITS(val,63,1)    << 59))
1182 1183

    {
Ian Lynagh's avatar
Ian Lynagh committed
1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223
        StgStablePtr stable;
        IA64FunDesc *wdesc = (IA64FunDesc *)wptr;
        StgWord64 wcode = wdesc->ip;
        IA64FunDesc *fdesc;
        StgWord64 *code;

        /* we allocate on the Haskell heap since malloc'd memory isn't
         * executable - argh */
        /* Allocated memory is word-aligned (8 bytes) but functions on ia64
         * must be aligned to 16 bytes.  We allocate an extra 8 bytes of
         * wiggle room so that we can put the code on a 16 byte boundary. */
        adjustor = stgAllocStable(sizeof(IA64FunDesc)+18*8+8, &stable);

        fdesc = (IA64FunDesc *)adjustor;
        code = (StgWord64 *)(fdesc + 1);
        /* add 8 bytes to code if needed to align to a 16-byte boundary */
        if ((StgWord64)code & 15) code++;
        fdesc->ip = (StgWord64)code;
        fdesc->gp = wdesc->gp;

        code[0]  = 0x0000058004288004 | MOVL_LOWORD(wcode);
        code[1]  = 0x6000000220000000 | MOVL_HIWORD(wcode);
        code[2]  = 0x029015d818984001;
        code[3]  = 0x8401200500420094;
        code[4]  = 0x886011d8189c0001;
        code[5]  = 0x84011004c00380c0;
        code[6]  = 0x0250210046013800;
        code[7]  = 0x8401000480420084;
        code[8]  = 0x0000233f19a06005 | MOVL_LOWORD((StgWord64)hptr);
        code[9]  = 0x6000000440000000 | MOVL_HIWORD((StgWord64)hptr);
        code[10] = 0x0200210020010811;
        code[11] = 0x1080006800006200;
        code[12] = 0x0000210018406000;
        code[13] = 0x00aa021000038005;
        code[14] = 0x000000010000001d;
        code[15] = 0x0084000880000200;

        /* save stable pointers in convenient form */
        code[16] = (StgWord64)hptr;
        code[17] = (StgWord64)stable;
1224
    }
ken's avatar
ken committed
1225
#else
1226
    barf("adjustor creation not supported on this platform");
1227
#endif
ken's avatar
ken committed
1228
    break;
sof's avatar
sof committed
1229
  
ken's avatar
ken committed
1230 1231 1232
  default:
    ASSERT(0);
    break;
sof's avatar
sof committed
1233 1234 1235
  }

  /* Have fun! */
1236
  return code;
sof's avatar
sof committed
1237 1238
}

ken's avatar
ken committed
1239

sof's avatar
sof committed
1240
void
1241
freeHaskellFunctionPtr(void* ptr)
sof's avatar
sof committed
1242
{
1243 1244
#if defined(i386_HOST_ARCH)
 if ( *(unsigned char*)ptr != 0xe8 &&
sof's avatar
sof committed
1245
      *(unsigned char*)ptr != 0x58 ) {
1246
   errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
sof's avatar
sof committed
1247 1248
   return;
 }
1249 1250
 if (*(unsigned char*)ptr == 0xe8) { /* Aha, a ccall adjustor! */
     freeStablePtr(((AdjustorStub*)ptr)->hptr);
sof's avatar
sof committed
1251
 } else {
1252
    freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x02)));
1253 1254 1255
 }
#elif defined(x86_64_HOST_ARCH)
 if ( *(StgWord16 *)ptr == 0x894d ) {
1256 1257 1258 1259 1260 1261 1262
     freeStablePtr(*(StgStablePtr*)((StgWord8*)ptr+
#if defined(mingw32_HOST_OS)
                                                   0x28
#else
                                                   0x20
#endif
                                                       ));
1263
#if !defined(mingw32_HOST_OS)
1264
 } else if ( *(StgWord16 *)ptr == 0x5141 ) {