StgCmmExpr.hs 18.9 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
-----------------------------------------------------------------------------
--
-- Stg to C-- code generation: expressions
--
-- (c) The University of Glasgow 2004-2006
--
-----------------------------------------------------------------------------

module StgCmmExpr ( cgExpr ) where

#define FAST_STRING_NOT_NEEDED
#include "HsVersions.h"

import {-# SOURCE #-} StgCmmBind ( cgBind )

import StgCmmMonad
import StgCmmHeap
import StgCmmEnv
import StgCmmCon
import StgCmmProf
import StgCmmLayout
import StgCmmPrim
import StgCmmHpc
import StgCmmTicky
import StgCmmUtils
import StgCmmClosure

import StgSyn

import MkZipCfgCmm
import BlockId
import Cmm()
import CmmExpr
import CoreSyn
import DataCon
36
import ForeignCall
37
import Id
38
import PrimOp
39
40
41
42
43
44
import TyCon
import CostCentre	( CostCentreStack, currentCCS )
import Maybes
import Util
import FastString
import Outputable
45
import UniqSupply
46
47
48
49
50
51
52
53
54
55
56
57

------------------------------------------------------------------------
--		cgExpr: the main function
------------------------------------------------------------------------

cgExpr :: StgExpr -> FCode ()
-- Generate code for an STG expression by dispatching on its
-- constructor.  Applications, primitive/foreign operations,
-- constructor applications and case expressions are handed to their
-- dedicated generators; the remaining forms are handled inline.

cgExpr (StgApp fun args)     = cgIdApp fun args
cgExpr (StgOpApp op args ty) = cgOpApp op args ty
cgExpr (StgConApp con args)  = cgConApp con args

-- Set the cost centre / emit the tick box, then carry on with the body
cgExpr (StgSCC cc expr)   = emitSetCCC cc >> cgExpr expr
cgExpr (StgTick m n expr) = emit (mkTickBox m n) >> cgExpr expr

-- A literal is converted to a Cmm literal and returned
cgExpr (StgLit lit) = do
    cmm_lit <- cgLit lit
    emitReturn [CmmLit cmm_lit]

cgExpr (StgLet binds expr) = cgBind binds >> cgExpr expr

-- A fresh block id serves as the join point: it is passed to the
-- let-no-escape bindings, and its label is emitted after the code
-- for the body
cgExpr (StgLetNoEscape _ _ binds expr) = do
    us <- newUniqSupply
    let join_id = mkBlockId (uniqFromSupply us)
    cgLneBinds join_id binds
    cgExpr expr
    emit (mkLabel join_id)

-- The live-variable and save-variable sets are not used here
cgExpr (StgCase expr _live_vars _save_vars bndr srt alt_type alts)
  = cgCase expr bndr srt alt_type alts

cgExpr (StgLam {}) = panic "cgExpr: StgLam"

------------------------------------------------------------------------
--		Let no escape
------------------------------------------------------------------------

{- Generating code for a let-no-escape binding, aka join point, is
very similar to what we do for a case expression.  The duality is
between
	let-no-escape x = b
	in e
and
	case e of ... -> b

That is, the RHS of 'x' (ie 'b') will execute *later*, just like
the alternative of the case; it needs to be compiled in an environment
in which all volatile bindings are forgotten, and the free vars are
bound only to stable things like stack locations.  The 'e' part will
execute *next*, just like the scrutinee of a case. -}

-------------------------
93
94
95
96
97
98
99
100
101
102
103
104
105
106
cgLneBinds :: BlockId -> StgBinding -> FCode ()
-- Generate code for the bindings of a let-no-escape and add the
-- resulting CgIdInfos to the environment.  The BlockId is the join
-- point that each right-hand side is given to branch to.
cgLneBinds join_id (StgNonRec bndr rhs) = do
    -- See Note [Saving the current cost centre]
    saved_cc <- saveCurrentCostCentre
    info     <- cgLetNoEscapeRhs join_id saved_cc bndr rhs
    addBindC (cg_id info) info

cgLneBinds join_id (StgRec pairs) = do
    saved_cc <- saveCurrentCostCentre
    -- Recursive group: the bindings are fed back through fixC, so they
    -- are added to the environment before the right-hand sides that
    -- produce them are compiled.
    infos <- fixC $ \ knot_infos -> do
        addBindsC knot_infos
        listFCs (map (uncurry (cgLetNoEscapeRhs join_id saved_cc)) pairs)
    addBindsC infos
107

108

109
-------------------------
110
111
112
cgLetNoEscapeRhs
    :: BlockId          -- join point for successor of let-no-escape
    -> Maybe LocalReg   -- Saved cost centre
    -> Id               -- binder
    -> StgRhs           -- right-hand side
    -> FCode CgIdInfo
-- Compile one let-no-escape right-hand side.  Its code is collected
-- rather than emitted inline, then emitted out of line as
--      <label of the binding>; <rhs code>; branch to join_id
cgLetNoEscapeRhs join_id local_cc bndr rhs = do
    (info, rhs_body) <- getCodeR (cgLetNoEscapeRhsBody local_cc bndr rhs)
    let lne_details = maybeLetNoEscape info
        (bid, _)    = expectJust "cgLetNoEscapeRhs" lne_details
    emit (outOfLine (mkLabel bid <*> rhs_body <*> mkBranch join_id))
    return info

124
125
126
127
128
cgLetNoEscapeRhsBody
    :: Maybe LocalReg   -- Saved cost centre
    -> Id               -- binder
    -> StgRhs           -- right-hand side
    -> FCode CgIdInfo
-- Both kinds of right-hand side are compiled by cgLetNoEscapeClosure.
cgLetNoEscapeRhsBody local_cc bndr rhs
  = case rhs of
      StgRhsClosure cc _bi _ _upd _ args body ->
          cgLetNoEscapeClosure bndr local_cc cc (nonVoidIds args) body

      -- For a constructor RHS we want to generate a single chunk of
      -- code which can be jumped to from many places, which will
      -- return the constructor. It's easy; just behave as if it
      -- was an StgRhsClosure with a ConApp inside!
      StgRhsCon cc con args ->
          cgLetNoEscapeClosure bndr local_cc cc [] (StgConApp con args)

-------------------------
cgLetNoEscapeClosure
    :: Id                   -- binder
    -> Maybe LocalReg       -- Slot for saved current cost centre
    -> CostCentreStack      -- XXX: *** NOT USED *** why not?
    -> [NonVoid Id]         -- Args (as in \ args -> body)
    -> StgExpr              -- Body (as in above)
    -> FCode CgIdInfo
-- Compile the body of a let-no-escape inside forkProc, and build the
-- binder's CgIdInfo from the registers its arguments were bound to.
cgLetNoEscapeClosure bndr cc_slot _unused_cc args body = do
    arg_regs <- forkProc $ do
        restoreCurrentCostCentre cc_slot
        regs <- bindArgsToRegs args
        -- Using altHeapCheck just reduces
        -- instructions to save on stack
        altHeapCheck regs (cgExpr body)
        return regs
    return (lneIdInfo bndr arg_regs)
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273


------------------------------------------------------------------------
--		Case expressions
------------------------------------------------------------------------

{- Note [Compiling case expressions]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
It is quite interesting to decide whether to put a heap-check at the
start of each alternative.  Of course we certainly have to do so if
the case forces an evaluation, or if there is a primitive op which can
trigger GC.

A more interesting situation is this (a Plan-B situation)

	!P!;
	...P...
	case x# of
	  0#      -> !Q!; ...Q...
	  default -> !R!; ...R...

where !x! indicates a possible heap-check point. The heap checks
in the alternatives *can* be omitted, in which case the topmost
heapcheck will take their worst case into account.

In favour of omitting !Q!, !R!:

 - *May* save a heap overflow test,
   if ...P... allocates anything.  

 - We can use relative addressing from a single Hp to 
   get at all the closures so allocated.

 - No need to save volatile vars etc across heap checks
   in !Q!, !R!

Against omitting !Q!, !R!

  - May put a heap-check into the inner loop.  Suppose 
	the main loop is P -> R -> P -> R...
	Q is the loop exit, and only it does allocation.
    This only hurts us if P does no allocation.  If P allocates,
    then there is a heap check in the inner loop anyway.

  - May do more allocation than reqd.  This sometimes bites us
    badly.  For example, nfib (ha!) allocates about 30\% more space if the
    worst-casing is done, because many many calls to nfib are leaf calls
    which don't need to allocate anything. 

    We can un-allocate, but that costs an instruction

Neither problem hurts us if there is only one alternative.

Suppose the inner loop is P->R->P->R etc.  Then here is
how many heap checks we get in the *inner loop* under various
conditions

  Alloc	  Heap check in branches (!Q!, !R!)?
  P Q R	     yes     no (absorb to !P!)
--------------------------------------
  n n n	     0		0
  n y n	     0		1
  n . y	     1		1
  y . y	     2		1
  y . n	     1		1

Best choices: absorb heap checks from Q and R into !P! iff
  a) P itself does some allocation
or
  b) P does allocation, or there is exactly one alternative

We adopt (b) because that is more likely to put the heap check at the
entry to a function, when not many things are live.  After a bunch of
single-branch cases, we may have lots of things live

Hence: two basic plans for

	case e of r { alts }

------ Plan A: the general case ---------

	...save current cost centre...

	...code for e, 
	   with sequel (SetLocals r)

        ...restore current cost centre...
	...code for alts...
	...alts do their own heap checks

------ Plan B: special case when ---------
  (i)  e does not allocate or call GC
  (ii) either upstream code performs allocation
       or there is just one alternative

  Then heap allocation in the (single) case branch
  is absorbed by the upstream check.
  Very common example: primops on unboxed values

	...code for e,
	   with sequel (SetLocals r)...

	...code for alts...
	...no heap check...
-}



-------------------------------------
-- Where the heap check for the alternatives of a case expression goes;
-- see Note [Compiling case expressions]
data GcPlan
  = GcInAlts 		-- Put a GC check at the start of the case alternatives,
	[LocalReg] 	-- which binds these registers
	SRT		-- using this SRT
  | NoGcInAlts		-- The scrutinee is a primitive value, or a call to a
			-- primitive op which does no GC.  Absorb the allocation
			-- of the case alternative(s) into the upstream check

-------------------------------------
274
-- See Note [case on Bool]
275
cgCase :: StgExpr -> Id -> SRT -> AltType -> [StgAlt] -> FCode ()
{- Unfinished sketch of a special case for scrutinising a Bool-valued
   operation whose binder is dead:

cgCase (OpApp ) bndr srt AlgAlt [(DataAlt flase, a2]
  | isBoolTy (idType bndr)
  , isDeadBndr bndr
  =
-}

-- Compile   case scrut of bndr { alts }
-- The scrutinee is compiled with a sequel that assigns its result to
-- the registers of the returned binders; those binders are then bound
-- and the alternatives compiled according to the chosen GcPlan.
cgCase scrut bndr srt alt_type alts = do
    up_hp_usg <- getVirtHp        -- Upstream heap usage
    let simple_scrut = isSimpleScrut scrut alt_type
        ret_bndrs    = chooseReturnBndrs bndr alt_type alts
        alt_regs     = map idToReg ret_bndrs

        -- The heap check goes in the alternatives unless the scrutinee
        -- is simple and, in addition, either there is exactly one
        -- alternative or upstream code has already used some heap.
        gcInAlts = not simple_scrut
                     || not (isSingleton alts || up_hp_usg > 0)

        gc_plan | gcInAlts  = GcInAlts alt_regs srt
                | otherwise = NoGcInAlts

    mb_cc <- maybeSaveCostCentre simple_scrut
    withSequel (AssignTo alt_regs gcInAlts) (cgExpr scrut)
    restoreCurrentCostCentre mb_cc

    -- JD: We need Note: [Better Alt Heap Checks]
    bindArgsToRegs ret_bndrs
    cgAlts gc_plan (NonVoid bndr) alt_type alts
301
302
303
304
305
306
307
308
309
310

-----------------
maybeSaveCostCentre :: Bool -> FCode (Maybe LocalReg)
-- Decide whether the current cost centre must be saved before the
-- scrutinee of a case is evaluated.  The argument says whether the
-- scrutinee is "simple" in the sense of isSimpleScrut.
--
--   * Simple scrutinee (Plan B of Note [Compiling case expressions]):
--     nothing is saved, and Nothing is returned.
--
--   * Any other scrutinee (Plan A, the general case): the current cost
--     centre is saved so that it can be restored before the
--     alternatives run; the result of saveCurrentCostCentre is
--     returned.
--
-- The two guards used to be the other way round, which saved the cost
-- centre in the simple case only and never in the general case.
maybeSaveCostCentre simple_scrut
  | simple_scrut = return Nothing
  | otherwise    = saveCurrentCostCentre


-----------------
isSimpleScrut :: StgExpr -> AltType -> Bool
-- Simple scrutinee, does not block or allocate; hence safe to amalgamate
-- heap usage from alternatives into the stuff before the case
-- NB: if you get this wrong, and claim that the expression doesn't allocate
--     when it does, you'll deeply mess up allocation
isSimpleScrut scrut alt_type
  = case (scrut, alt_type) of
      (StgOpApp op _ _, _)         -> isSimpleOp op
      (StgLit _,        _)         -> True   -- case 1# of { 0# -> ..; ... }
      (StgApp _ [],     PrimAlt _) -> True   -- case x# of { 0# -> ..; ... }
      _                            -> False

320
321
322
323
324
325
isSimpleOp :: StgOp -> Bool
-- True iff the op cannot block or allocate
isSimpleOp stg_op
  = case stg_op of
      -- A C call is simple exactly when it does not "play safe"
      StgFCallOp (CCall (CCallSpec _ _ safe)) _ -> not (playSafe safe)
      -- DNCall is never treated as simple (the conservative, safe answer)
      StgFCallOp (DNCall _) _                   -> False
      -- A primop is simple exactly when it is not out of line
      StgPrimOp op                              -> not (primOpOutOfLine op)

326
-----------------
327
chooseReturnBndrs :: Id -> AltType -> [StgAlt] -> [NonVoid Id]
-- These are the binders of a case that are assigned
-- by the evaluation of the scrutinee
-- Only non-void ones come back
chooseReturnBndrs bndr alt_type alts
  = case (alt_type, alts) of
      (PrimAlt _,   _)                -> just_bndr
      (UbxTupAlt _, [(_, ids, _, _)]) -> nonVoidIds ids  -- 'bndr' is not assigned!
      (AlgAlt _,    _)                -> just_bndr
      (PolyAlt,     _)                -> just_bndr
      -- UbxTupAlt has only one alternative
      _                               -> panic "chooseReturnBndrs"
  where
    -- Only 'bndr' is assigned
    just_bndr = nonVoidIds [bndr]

-------------------------------------
347
cgAlts :: GcPlan -> NonVoid Id -> AltType -> [StgAlt] -> FCode ()
-- Generate code for the alternatives of a case expression.
-- At this point the results of the case are in the binders.
--
--   gc_plan   whether each alternative starts with a heap check
--   bndr      the case binder; its register is what gets switched on
--   alt_type  the kind of case, which selects the strategy below
--   alts      the alternatives themselves
cgAlts gc_plan _bndr PolyAlt [(_, _, _, rhs)]
  = maybeAltHeapCheck gc_plan (cgExpr rhs)

cgAlts gc_plan _bndr (UbxTupAlt _) [(_, _, _, rhs)]
  = maybeAltHeapCheck gc_plan (cgExpr rhs)
        -- Here bndrs are *already* in scope, so don't rebind them

cgAlts gc_plan bndr (PrimAlt _) alts = do
    tagged_cmms <- cgAltRhss gc_plan bndr alts

    let bndr_reg = CmmLocal (idToReg bndr)

        -- PrimAlts always have a DEFAULT case and it always comes
        -- first.  Match it explicitly so that a violation of that
        -- invariant is reported by name rather than through a bare
        -- 'head' or pattern-match failure.  The binding is still lazy.
        deflt = case tagged_cmms of
                  ((DEFAULT, rhs) : _) -> rhs
                  _other -> panic "cgAlts: PrimAlt without a leading DEFAULT"

        tagged_cmms' = [ (lit, code) | (LitAlt lit, code) <- tagged_cmms ]

    emit (mkCmmLitSwitch (CmmReg bndr_reg) tagged_cmms' deflt)

cgAlts gc_plan bndr (AlgAlt tycon) alts = do
    tagged_cmms <- cgAltRhss gc_plan bndr alts

    let fam_sz   = tyConFamilySize tycon
        bndr_reg = CmmLocal (idToReg bndr)

        -- DEFAULT is always first, if present
        mb_deflt = case tagged_cmms of
                     ((DEFAULT, rhs) : _) -> Just rhs
                     _other               -> Nothing

        branches = [ (dataConTagZ con, cmm)
                   | (DataAlt con, cmm) <- tagged_cmms ]

    -- Is the constructor tag in the node reg?
    if isSmallFamily fam_sz
      then -- Yes, bndr_reg has constr. tag in ls bits.  Branch tags
           -- are shifted up by one and the switch ranges over
           -- 1..fam_sz.
           let tag_expr  = cmmConstrTag1 (CmmReg bndr_reg)
               branches' = [ (tag + 1, branch) | (tag, branch) <- branches ]
           in emitSwitch tag_expr branches' mb_deflt 1 fam_sz

      else -- No, get tag from info table.
           -- Note that ptr _always_ has tag 1
           -- when the family size is big enough
           let untagged_ptr = cmmRegOffB bndr_reg (-1)
               tag_expr     = getConstrTag (untagged_ptr)
           in emitSwitch tag_expr branches mb_deflt 0 (fam_sz - 1)

cgAlts _ _ _ _ = panic "cgAlts"
        -- UbxTupAlt and PolyAlt have only one alternative

-------------------
401
cgAltRhss :: GcPlan -> NonVoid Id -> [StgAlt] -> FCode [(AltCon, CmmAGraph)]
-- Compile every alternative under forkAlts, pairing each alternative's
-- constructor with the code collected for it.
cgAltRhss gc_plan bndr alts
  = forkAlts [ alt_code alt | alt <- alts ]
  where
    scrut_reg = idToReg bndr

    alt_code :: StgAlt -> FCode (AltCon, CmmAGraph)
    alt_code (con, bndrs, _uses, rhs)
      = getCodeR (maybeAltHeapCheck gc_plan alt_body)
      where
        -- Bind the constructor's fields relative to the scrutinee's
        -- register, then compile the right-hand side
        alt_body = do
            bindConArgs con scrut_reg bndrs
            cgExpr rhs
            return con

maybeAltHeapCheck :: GcPlan -> FCode a -> FCode a
-- Wrap the code of an alternative in a heap check when the plan asks
-- for one; otherwise hand the code back unchanged.
maybeAltHeapCheck gc_plan code
  = case gc_plan of
      NoGcInAlts      -> code
      GcInAlts regs _ -> altHeapCheck regs code
419
420
421
422
423
424
425

-----------------------------------------------------------------------------
-- 	Tail calls
-----------------------------------------------------------------------------

cgConApp :: DataCon -> [StgArg] -> FCode ()
-- Generate code for a saturated constructor application in tail
-- position.
cgConApp con stg_args
  | isUnboxedTupleCon con       -- Unboxed tuple: assign and return
  = do arg_exprs <- getNonVoidArgAmodes stg_args
       tickyUnboxedTupleReturn (length arg_exprs)
       emitReturn arg_exprs

  | otherwise   --  Boxed constructors; allocate and return
  = ASSERT( stg_args `lengthIs` dataConRepArity con )
    do (idinfo, init_code) <- buildDynCon (dataConWorkId con) currentCCS con stg_args
            -- The first "con" says that the name bound to this closure
            -- is "con", which is a bit of a fudge, but it only affects
            -- profiling
       emit init_code
       emitReturn [idInfoToAmode idinfo]

440

441
cgIdApp :: Id -> [StgArg] -> FCode ()
-- Generate code for applying an identifier to arguments.
-- A void identifier with no arguments returns nothing at all.
cgIdApp fun_id [] | isVoidId fun_id = emitReturn []
cgIdApp fun_id args = do
    fun_info <- getCgIdInfo fun_id
    case maybeLetNoEscape fun_info of
        -- A let-no-escape binder: jump to its block
        Just (blk_id, lne_regs) -> cgLneJump blk_id lne_regs args
        -- Anything else: an ordinary tail call
        Nothing                 -> cgTailCall fun_id fun_info args
448
449
450
451
452
453
454
455

cgLneJump :: BlockId -> [LocalReg] -> [StgArg] -> FCode ()
-- Join point; discard sequel.
-- Assign the non-void arguments to the join point's registers and
-- branch to its block.
cgLneJump blk_id lne_regs args = do
    cmm_args <- getNonVoidArgAmodes args
    emit (mkMultiAssign lne_regs cmm_args <*> mkBranch blk_id)
    
cgTailCall :: Id -> CgIdInfo -> [StgArg] -> FCode ()
-- Generate a tail call to an identifier; the strategy is the call
-- method that getCallMethod picks for it.
cgTailCall fun_id fun_info args = do
    dflags <- getDynFlags
    case getCallMethod dflags fun_name (idCafInfo fun_id) lf_info (length args) of

        -- A value in WHNF, so we can just return it.
        ReturnIt -> emitReturn [fun]    -- ToDo: does ReturnIt guarantee tagged?

        -- Test the tag at run time: return the closure if it is
        -- tagged, otherwise call its entry code.
        EnterIt -> ASSERT( null args )  -- Discarding arguments
            do let fun' = CmmLoad fun (cmmExprType fun)
               [ret, call] <- forkAlts
                   [ getCode $ emitReturn [fun]     -- Is tagged; no need to untag
                     -- (a preceding  emit (mkAssign nodeReg fun)  is disabled)
                   , getCode $ emitCall (NativeNodeCall, NativeReturn)
                                        (entryCode fun') [fun]  -- Not tagged
                   ]
               emit (mkCmmIfThenElse (cmmIsTagged fun) ret call)

        -- A slow function call via the RTS apply routines
        SlowCall -> do
            tickySlowCall lf_info args
            emit $ mkComment $ mkFastString "slowCall"
            slowCall fun args

        -- A direct function call (possibly with some left-over arguments)
        DirectEntry lbl arity -> do
            tickyDirectCall arity args
            if node_points
              then do emit $ mkComment $ mkFastString "directEntry"
                      emit (mkAssign nodeReg fun)
                      directCall lbl arity args
              else do emit $ mkComment $ mkFastString "directEntry else"
                      directCall lbl arity args

        JumpToIt {} -> panic "cgTailCall"       -- ???

  where
    fun_name    = idName            fun_id
    fun         = idInfoToAmode     fun_info
    lf_info     = cgIdInfoLF        fun_info
    node_points = nodeMustPointToIt lf_info


496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
{- Note [case on Bool]
   ~~~~~~~~~~~~~~~~~~~
A case on a Boolean value does two things:
  1. It looks up the Boolean in a closure table and assigns the
     result to the binder.
  2. It branches to the True or False case through analysis
     of the closure assigned to the binder.
But the indirection through the closure table is unnecessary
if the assignment to the binder will be dead code (use isDeadBndr).

The following example illustrates how badly the code turns out:
  STG:
    case <=## [ww_s7Hx y_s7HD] of wild2_sbH8 {
      GHC.Bool.False -> <false code> // sbH8 dead
      GHC.Bool.True  -> <true  code> // sbH8 dead
    };
  Cmm:
    _s7HD::F64 = F64[_sbH7::I64 + 7];  // MidAssign
    _ccsW::I64 = %MO_F_Le_W64(_s7Hx::F64, _s7HD::F64);  // MidAssign
    // emitReturn  // MidComment
    _sbH8::I64 = I64[ghczmprim_GHCziBool_Bool_closure_tbl + (_ccsW::I64 << 3)];  // MidAssign
    _ccsX::I64 = _sbH8::I64 & 7;  // MidAssign
    if (_ccsX::I64 >= 2) goto ccsH; else goto ccsI;  // LastCondBranch

The assignments to _sbH8 and _ccsX are completely unnecessary.
Instead, we should branch based on the value of _ccsW.
-}
523

524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
{- Note [Better Alt Heap Checks]
If two function calls can share a return point, then they will also
get the same info table. Therefore, it's worth our effort to make
those opportunities appear as frequently as possible.

Here are a few examples of how it should work:

  STG:
    case f x of
      True  -> <True code -- including allocation>
      False -> <False code>
  Cmm:
      r = call f(x) returns to L;
   L:
      if r & 7 >= 2 goto L1 else goto L2;
   L1:
      if Hp > HpLim then
        r = gc(r);
        goto L;
      <True code -- including allocation>
   L2:
      <False code>
Note that the code following both the call to f(x) and the code to gc(r)
should be the same, which will allow the common blockifier to discover
that they are the same. Therefore, both function calls will return to the same
block, and they will use the same info table.        

Here's an example of the Cmm code we want from a primOp.
The primOp doesn't produce an info table for us to reuse, but that's okay:
we should still generate the same code:
  STG:
    case a +# b of
      0 -> <0-case code -- including allocation>
      _ -> <default-case code>
  Cmm:
      r = a +# b;
   L:
      if r == 0 then goto L1 else goto L2;
   L1:
      if Hp > HpLim then
        r = gc(r);
        goto L;
      <0-case code -- including allocation>
   L2:
      <default-case code>
-}