Commit 9897e8c8 authored by Gabor Greif's avatar Gabor Greif 💬 Committed by Marge Bot

Implement pointer tagging for big families (#14373)

Formerly we punted on these and evaluated constructors always got a tag
of 1.

We now cascade switches because we have to check the tag first and when
it is MAX_PTR_TAG then get the precise tag from the info table and
switch on that. The only technically tricky part is that the default
case needs (logical) duplication. To do this we emit an extra label for
it and branch to that from the second switch. This avoids duplicated
codegen.

Here's a simple example of the new code gen:

    data D = D1 | D2 | D3 | D4 | D5 | D6 | D7 | D8

On a 64-bit system previously all constructors would be tagged 1. With
the new code gen D7 and D8 are tagged 7:

    [Lib.D7_con_entry() {
         ...
         {offset
           c1eu: // global
               R1 = R1 + 7;
               call (P64[Sp])(R1) args: 8, res: 0, upd: 8;
         }
     }]

    [Lib.D8_con_entry() {
         ...
         {offset
           c1ez: // global
               R1 = R1 + 7;
               call (P64[Sp])(R1) args: 8, res: 0, upd: 8;
         }
     }]

When switching we now look at the info table only when the tag is 7. For
example, if we derive Enum for the type above, the Cmm looks like this:

    c2Le:
        _s2Js::P64 = R1;
        _c2Lq::P64 = _s2Js::P64 & 7;
        switch [1 .. 7] _c2Lq::P64 {
            case 1 : goto c2Lk;
            case 2 : goto c2Ll;
            case 3 : goto c2Lm;
            case 4 : goto c2Ln;
            case 5 : goto c2Lo;
            case 6 : goto c2Lp;
            case 7 : goto c2Lj;
        }

    // Read info table for tag
    c2Lj:
        _c2Lv::I64 = %MO_UU_Conv_W32_W64(I32[I64[_s2Js::P64 & (-8)] - 4]);
        if (_c2Lv::I64 != 6) goto c2Lu; else goto c2Lt;

Generated Cmm sizes do not change too much, but binaries are very
slightly larger, due to the fact that the new instructions are longer in
encoded form. E.g. previously entry code for D8 above would be

    00000000000001c0 <Lib_D8_con_info>:
     1c0:	48 ff c3             	inc    %rbx
     1c3:	ff 65 00             	jmpq   *0x0(%rbp)

With this patch

    00000000000001d0 <Lib_D8_con_info>:
     1d0:	48 83 c3 07          	add    $0x7,%rbx
     1d4:	ff 65 00             	jmpq   *0x0(%rbp)

This is one byte longer.

Secondly, reading info table directly and then switching is shorter

    _c1co:
            movq -1(%rbx),%rax
            movl -4(%rax),%eax
            // Switch on info table tag
            jmp *_n1d5(,%rax,8)

than doing the same switch, and then for the tag 7 doing another switch:

    // When tag is 7
    _c1ct:
            andq $-8,%rbx
            movq (%rbx),%rax
            movl -4(%rax),%eax
            // Switch on info table tag
            ...

Some changes of binary sizes in actual programs:

- In NoFib the worst case is 0.1% increase in benchmark "parser" (see
  NoFib results below). All programs get slightly larger.

- Stage 2 compiler size does not change.

- In "containers" (the library) size of all object files increases
  0.0005%. Size of the test program "bitqueue-properties" increases
  0.03%.

nofib benchmarks kindly provided by Ömer (@osa1):

NoFib Results
=============

--------------------------------------------------------------------------------
        Program           Size    Allocs    Instrs     Reads    Writes
--------------------------------------------------------------------------------
             CS          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
            CSD          +0.0%      0.0%      0.0%     +0.0%     +0.0%
             FS          +0.0%      0.0%      0.0%     +0.0%      0.0%
              S          +0.0%      0.0%     -0.0%      0.0%      0.0%
             VS          +0.0%      0.0%     -0.0%     +0.0%     +0.0%
            VSD          +0.0%      0.0%     -0.0%     +0.0%     -0.0%
            VSM          +0.0%      0.0%      0.0%      0.0%      0.0%
           anna          +0.0%      0.0%     +0.1%     -0.9%     -0.0%
           ansi          +0.0%      0.0%     -0.0%     +0.0%     +0.0%
           atom          +0.0%      0.0%      0.0%      0.0%      0.0%
         awards          +0.0%      0.0%     -0.0%     +0.0%      0.0%
         banner          +0.0%      0.0%     -0.0%     +0.0%      0.0%
     bernouilli          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
   binary-trees          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
          boyer          +0.0%      0.0%     +0.0%      0.0%     -0.0%
         boyer2          +0.0%      0.0%     +0.0%      0.0%     -0.0%
           bspt          +0.0%      0.0%     +0.0%     +0.0%      0.0%
      cacheprof          +0.0%      0.0%     +0.1%     -0.8%      0.0%
       calendar          +0.0%      0.0%     -0.0%     +0.0%     -0.0%
       cichelli          +0.0%      0.0%     +0.0%      0.0%      0.0%
        circsim          +0.0%      0.0%     -0.0%     -0.1%     -0.0%
       clausify          +0.0%      0.0%     +0.0%     +0.0%      0.0%
  comp_lab_zift          +0.0%      0.0%     +0.0%      0.0%     -0.0%
       compress          +0.0%      0.0%     +0.0%     +0.0%      0.0%
      compress2          +0.0%      0.0%      0.0%      0.0%      0.0%
    constraints          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
   cryptarithm1          +0.0%      0.0%     +0.0%      0.0%      0.0%
   cryptarithm2          +0.0%      0.0%     +0.0%     -0.0%      0.0%
            cse          +0.0%      0.0%     +0.0%     +0.0%      0.0%
   digits-of-e1          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
   digits-of-e2          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
         dom-lt          +0.0%      0.0%     +0.0%     +0.0%      0.0%
          eliza          +0.0%      0.0%     -0.0%     +0.0%      0.0%
          event          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
    exact-reals          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
         exp3_8          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
         expert          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
 fannkuch-redux          +0.0%      0.0%     +0.0%      0.0%      0.0%
          fasta          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
            fem          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
            fft          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
           fft2          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
       fibheaps          +0.0%      0.0%     +0.0%     +0.0%      0.0%
           fish          +0.0%      0.0%     +0.0%     +0.0%      0.0%
          fluid          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
         fulsom          +0.0%      0.0%     +0.0%     -0.0%     +0.0%
         gamteb          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
            gcd          +0.0%      0.0%     +0.0%     +0.0%      0.0%
    gen_regexps          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
         genfft          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
             gg          +0.0%      0.0%      0.0%     -0.0%      0.0%
           grep          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
         hidden          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
            hpg          +0.0%      0.0%     +0.0%     -0.1%     -0.0%
            ida          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
          infer          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
        integer          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
      integrate          +0.0%      0.0%      0.0%     +0.0%      0.0%
   k-nucleotide          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
          kahan          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
        knights          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
         lambda          +0.0%      0.0%     +1.2%     -6.1%     -0.0%
     last-piece          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
           lcss          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
           life          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
           lift          +0.0%      0.0%     +0.0%     +0.0%      0.0%
         linear          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
      listcompr          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
       listcopy          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
       maillist          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
         mandel          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
        mandel2          +0.0%      0.0%     +0.0%     +0.0%     -0.0%
           mate          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
        minimax          +0.0%      0.0%     -0.0%     +0.0%     -0.0%
        mkhprog          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
     multiplier          +0.0%      0.0%      0.0%     +0.0%     -0.0%
         n-body          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
       nucleic2          +0.0%      0.0%     +0.0%     +0.0%     -0.0%
           para          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
      paraffins          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
         parser          +0.1%      0.0%     +0.4%     -1.7%     -0.0%
        parstof          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
            pic          +0.0%      0.0%     +0.0%      0.0%     -0.0%
       pidigits          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
          power          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
         pretty          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
         primes          +0.0%      0.0%     +0.0%      0.0%      0.0%
      primetest          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
         prolog          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
         puzzle          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
         queens          +0.0%      0.0%      0.0%     +0.0%     +0.0%
        reptile          +0.0%      0.0%     +0.0%     +0.0%      0.0%
reverse-complem          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
        rewrite          +0.0%      0.0%     +0.0%      0.0%     -0.0%
           rfib          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
            rsa          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
            scc          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
          sched          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
            scs          +0.0%      0.0%     +0.0%     +0.0%      0.0%
         simple          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
          solid          +0.0%      0.0%     +0.0%     +0.0%      0.0%
        sorting          +0.0%      0.0%     +0.0%     -0.0%      0.0%
  spectral-norm          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
         sphere          +0.0%      0.0%     +0.0%     -1.0%      0.0%
         symalg          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
            tak          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
      transform          +0.0%      0.0%     +0.4%     -1.3%     +0.0%
       treejoin          +0.0%      0.0%     +0.0%     -0.0%      0.0%
      typecheck          +0.0%      0.0%     -0.0%     +0.0%      0.0%
        veritas          +0.0%      0.0%     +0.0%     -0.1%     +0.0%
           wang          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
      wave4main          +0.0%      0.0%     +0.0%      0.0%     -0.0%
   wheel-sieve1          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
   wheel-sieve2          +0.0%      0.0%     +0.0%     +0.0%      0.0%
           x2n1          +0.0%      0.0%     +0.0%     +0.0%      0.0%
--------------------------------------------------------------------------------
            Min          +0.0%      0.0%     -0.0%     -6.1%     -0.0%
            Max          +0.1%      0.0%     +1.2%     +0.0%     +0.0%
 Geometric Mean          +0.0%     -0.0%     +0.0%     -0.1%     -0.0%

NoFib GC Results
================

--------------------------------------------------------------------------------
        Program           Size    Allocs    Instrs     Reads    Writes
--------------------------------------------------------------------------------
        circsim          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
    constraints          +0.0%      0.0%     -0.0%      0.0%     -0.0%
       fibheaps          +0.0%      0.0%      0.0%     -0.0%     -0.0%
         fulsom          +0.0%      0.0%      0.0%     -0.6%     -0.0%
       gc_bench          +0.0%      0.0%      0.0%      0.0%     -0.0%
           hash          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
           lcss          +0.0%      0.0%      0.0%     -0.0%      0.0%
      mutstore1          +0.0%      0.0%      0.0%     -0.0%     -0.0%
      mutstore2          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
          power          +0.0%      0.0%     -0.0%      0.0%     -0.0%
     spellcheck          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
--------------------------------------------------------------------------------
            Min          +0.0%      0.0%     -0.0%     -0.6%     -0.0%
            Max          +0.0%      0.0%     +0.0%      0.0%      0.0%
 Geometric Mean          +0.0%     +0.0%     +0.0%     -0.1%     +0.0%

Fixes #14373

These performance regressions appear to be a fluke in CI. See the
discussion in !1742 for details.

Metric Increase:
    T6048
    T12234
    T12425
    Naperian
    T12150
    T5837
    T13035
parent f171b358
......@@ -362,20 +362,19 @@ type DynTag = Int -- The tag on a *pointer*
-- * big, otherwise.
--
-- Small families can have the constructor tag in the tag bits.
-- Big families only use the tag value 1 to represent evaluatedness.
-- Big families always use the tag values 1..mAX_PTR_TAG to represent
-- evaluatedness, the last one lumping together all overflowing ones.
-- We don't have very many tag bits: for example, we have 2 bits on
-- x86-32 and 3 bits on x86-64.
--
-- Also see Note [Tagging big families] in GHC.StgToCmm.Expr
isSmallFamily :: DynFlags -> Int -> Bool
isSmallFamily dflags fam_size = fam_size <= mAX_PTR_TAG dflags
tagForCon :: DynFlags -> DataCon -> DynTag
tagForCon dflags con
| isSmallFamily dflags fam_size = con_tag
| otherwise = 1
where
con_tag = dataConTag con -- NB: 1-indexed
fam_size = tyConFamilySize (dataConTyCon con)
tagForCon dflags con = min (dataConTag con) (mAX_PTR_TAG dflags)
-- NB: 1-indexed
tagForArity :: DynFlags -> RepArity -> DynTag
tagForArity dflags arity
......
{-# LANGUAGE CPP #-}
{-# LANGUAGE CPP, BangPatterns #-}
-----------------------------------------------------------------------------
--
......@@ -32,10 +32,11 @@ import StgSyn
import MkGraph
import BlockId
import Cmm
import Cmm hiding ( succ )
import CmmInfo
import CoreSyn
import DataCon
import DynFlags ( mAX_PTR_TAG )
import ForeignCall
import Id
import PrimOp
......@@ -48,8 +49,9 @@ import Util
import FastString
import Outputable
import Control.Monad (unless,void)
import Control.Arrow (first)
import Control.Monad ( unless, void )
import Control.Arrow ( first )
import Data.List ( partition )
------------------------------------------------------------------------
-- cgExpr: the main function
......@@ -631,29 +633,152 @@ cgAlts gc_plan bndr (AlgAlt tycon) alts
; (mb_deflt, branches) <- cgAlgAltRhss gc_plan bndr alts
; let fam_sz = tyConFamilySize tycon
bndr_reg = CmmLocal (idToReg dflags bndr)
-- Is the constructor tag in the node reg?
; if isSmallFamily dflags fam_sz
then do
let -- Yes, bndr_reg has constr. tag in ls bits
tag_expr = cmmConstrTag1 dflags (CmmReg bndr_reg)
branches' = [(tag+1,branch) | (tag,branch) <- branches]
emitSwitch tag_expr branches' mb_deflt 1 fam_sz
else -- No, get tag from info table
let -- Note that ptr _always_ has tag 1
-- when the family size is big enough
untagged_ptr = cmmRegOffB bndr_reg (-1)
tag_expr = getConstrTag dflags (untagged_ptr)
in emitSwitch tag_expr branches mb_deflt 0 (fam_sz - 1)
; let !fam_sz = tyConFamilySize tycon
!bndr_reg = CmmLocal (idToReg dflags bndr)
!ptag_expr = cmmConstrTag1 dflags (CmmReg bndr_reg)
!branches' = first succ <$> branches
!maxpt = mAX_PTR_TAG dflags
(!via_ptr, !via_info) = partition ((< maxpt) . fst) branches'
!small = isSmallFamily dflags fam_sz
-- Is the constructor tag in the node reg?
-- See Note [Tagging big families]
; if small || null via_info
then -- Yes, bndr_reg has constructor tag in ls bits
emitSwitch ptag_expr branches' mb_deflt 1
(if small then fam_sz else maxpt)
else -- No, the get exact tag from info table when mAX_PTR_TAG
-- See Note [Double switching for big families]
do
let !untagged_ptr = cmmUntag dflags (CmmReg bndr_reg)
!itag_expr = getConstrTag dflags untagged_ptr
!info0 = first pred <$> via_info
if null via_ptr then
emitSwitch itag_expr info0 mb_deflt 0 (fam_sz - 1)
else do
infos_lbl <- newBlockId
infos_scp <- getTickScope
let spillover = (maxpt, (mkBranch infos_lbl, infos_scp))
(mb_shared_deflt, mb_shared_branch) <- case mb_deflt of
(Just (stmts, scp)) ->
do lbl <- newBlockId
return ( Just (mkLabel lbl scp <*> stmts, scp)
, Just (mkBranch lbl, scp))
_ -> return (Nothing, Nothing)
-- Switch on pointer tag
emitSwitch ptag_expr (spillover : via_ptr) mb_shared_deflt 1 maxpt
join_lbl <- newBlockId
emit (mkBranch join_lbl)
-- Switch on info table tag
emitLabel infos_lbl
emitSwitch itag_expr info0 mb_shared_branch
(maxpt - 1) (fam_sz - 1)
emitLabel join_lbl
; return AssignedDirectly }
cgAlts _ _ _ _ = panic "cgAlts"
-- UbxTupAlt and PolyAlt have only one alternative
-- Note [Double switching for big families]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- An algebraic data type can have a n >= 0 summands
-- (or alternatives), which are identified (labeled) by
-- constructors. In memory they are kept apart by tags
-- (see Note [Data constructor dynamic tags] in GHC.StgToCmm.Closure).
-- Due to the characteristics of the platform that
-- contribute to the alignment of memory objects, there
-- is a natural limit of information about constructors
-- that can be encoded in the pointer tag. When the mapping
-- of constructors to the pointer tag range 1..mAX_PTR_TAG
-- is not injective, then we have a "big data type", also
-- called a "big (constructor) family" in the literature.
-- Constructor tags residing in the info table are injective,
-- but considerably more expensive to obtain, due to additional
-- memory access(es).
--
-- When doing case analysis on a value of a "big data type"
-- we need two nested switch statements to make up for the lack
-- of injectivity of pointer tagging, also taking the info
-- table tag into account. The exact mechanism is described next.
--
-- In the general case, switching on big family alternatives
-- is done by two nested switch statements. According to
-- Note [Tagging big families], the outer switch
-- looks at the pointer tag and the inner dereferences the
-- pointer and switches on the info table tag.
--
-- We can handle a simple case first, namely when none
-- of the case alternatives mention a constructor having
-- a pointer tag of 1..mAX_PTR_TAG-1. In this case we
-- simply emit a switch on the info table tag.
-- Note that the other simple case is when all mentioned
-- alternatives lie in 1..mAX_PTR_TAG-1, in which case we can
-- switch on the ptr tag only, just like in the small family case.
--
-- There is a single intricacy with a nested switch:
-- Both should branch to the same default alternative, and as such
-- avoid duplicate codegen of potentially heavy code. The outer
-- switch generates the actual code with a prepended fresh label,
-- while the inner one only generates a jump to that label.
--
-- For example, let's assume a 64-bit architecture, so that all
-- heap objects are 8-byte aligned, and hence the address of a
-- heap object ends in `000` (three zero bits).
--
-- Then consider the following data type
--
-- > data Big = T0 | T1 | T2 | T3 | T4 | T5 | T6 | T7 | T8
-- Ptr tag: 1 2 3 4 5 6 7 7 7
-- As bits: 001 010 011 100 101 110 111 111 111
-- Info pointer tag (zero based):
-- 0 1 2 3 4 5 6 7 8
--
-- Then \case T2 -> True; T8 -> True; _ -> False
-- will result in following code (slightly cleaned-up and
-- commented -ddump-cmm-from-stg):
{-
R1 = _sqI::P64; -- scrutinee
if (R1 & 7 != 0) goto cqO; else goto cqP;
cqP: // global -- enter
call (I64[R1])(R1) returns to cqO, args: 8, res: 8, upd: 8;
cqO: // global -- already WHNF
_sqJ::P64 = R1;
_cqX::P64 = _sqJ::P64 & 7; -- extract pointer tag
switch [1 .. 7] _cqX::P64 {
case 3 : goto cqW;
case 7 : goto cqR;
default: {goto cqS;}
}
cqR: // global
_cr2 = I32[I64[_sqJ::P64 & (-8)] - 4]; -- tag from info pointer
switch [6 .. 8] _cr2::I64 {
case 8 : goto cr1;
default: {goto cr0;}
}
cr1: // global
R1 = GHC.Types.True_closure+2;
call (P64[(old + 8)])(R1) args: 8, res: 0, upd: 8;
cr0: // global -- technically necessary label
goto cqS;
cqW: // global
R1 = GHC.Types.True_closure+2;
call (P64[(old + 8)])(R1) args: 8, res: 0, upd: 8;
cqS: // global
R1 = GHC.Types.False_closure+1;
call (P64[(old + 8)])(R1) args: 8, res: 0, upd: 8;
-}
--
-- For 32-bit systems we only have 2 tag bits in the pointers at our disposal,
-- so the performance win is dubious, especially in face of the increased code
-- size due to double switching. But we can take the viewpoint that 32-bit
-- architectures are not relevant for performance any more, so this can be
-- considered as moot.
-- Note [alg-alt heap check]
--
......@@ -675,6 +800,55 @@ cgAlts _ _ _ _ = panic "cgAlts"
-- x = R1
-- goto L1
-- Note [Tagging big families]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
-- Both the big and the small constructor families are tagged,
-- that is, greater unions which overflow the tag space of TAG_BITS
-- (i.e. 3 on 32 resp. 7 constructors on 64 bit archs).
--
-- For example, let's assume a 64-bit architecture, so that all
-- heap objects are 8-byte aligned, and hence the address of a
-- heap object ends in `000` (three zero bits). Then consider
-- > data Maybe a = Nothing | Just a
-- > data Day a = Mon | Tue | Wed | Thu | Fri | Sat | Sun
-- > data Grade = G1 | G2 | G3 | G4 | G5 | G6 | G7 | G8 | G9 | G10
--
-- Since `Grade` has more than 7 constructors, it counts as a
-- "big data type" (also referred to as "big constructor family" in papers).
-- On the other hand, `Maybe` and `Day` have 7 constructors or fewer, so they
-- are "small data types".
--
-- Then
-- * A pointer to an unevaluated thunk of type `Maybe Int`, `Day` or `Grade` will end in `000`
-- * A tagged pointer to a `Nothing`, `Mon` or `G1` will end in `001`
-- * A tagged pointer to a `Just x`, `Tue` or `G2` will end in `010`
-- * A tagged pointer to `Wed` or `G3` will end in `011`
-- ...
-- * A tagged pointer to `Sat` or `G6` will end in `110`
-- * A tagged pointer to `Sun` or `G7` or `G8` or `G9` or `G10` will end in `111`
--
-- For big families we employ a mildly clever way of combining pointer and
-- info-table tagging. We use 1..MAX_PTR_TAG-1 as pointer-resident tags where
-- the tags in the pointer and the info table are in a one-to-one
-- relation, whereas tag MAX_PTR_TAG is used as "spill over", signifying
-- we have to fall back and get the precise constructor tag from the
-- info-table.
--
-- Consequently we now cascade switches, because we have to check
-- the pointer tag first, and when it is MAX_PTR_TAG, fetch the precise
-- tag from the info table, and switch on that. The only technically
-- tricky part is that the default case needs (logical) duplication.
-- To do this we emit an extra label for it and branch to that from
-- the second switch. This avoids duplicated codegen. See Trac #14373.
-- See note [Double switching for big families] for the mechanics
-- involved.
--
-- Also see note [Data constructor dynamic tags]
-- and the wiki https://gitlab.haskell.org/ghc/ghc/wikis/commentary/rts/haskell-execution/pointer-tagging
--
-------------------
cgAlgAltRhss :: (GcPlan,ReturnKind) -> NonVoid Id -> [CgStgAlt]
-> FCode ( Maybe CmmAGraphScoped
......
......@@ -151,7 +151,7 @@ flattenCmmAGraph id (stmts_t, tscope) =
catAGraphs :: [CmmAGraph] -> CmmAGraph
catAGraphs = concatOL
-- | created a sequence "goto id; id:" as an AGraph
-- | creates a sequence "goto id; id:" as an AGraph
mkLabel :: BlockId -> CmmTickScope -> CmmAGraph
mkLabel bid scp = unitOL (CgLabel bid scp)
......@@ -159,7 +159,7 @@ mkLabel bid scp = unitOL (CgLabel bid scp)
mkMiddle :: CmmNode O O -> CmmAGraph
mkMiddle middle = unitOL (CgStmt middle)
-- | created a closed AGraph from a given node
-- | creates a closed AGraph from a given node
mkLast :: CmmNode O C -> CmmAGraph
mkLast last = unitOL (CgLast last)
......
......@@ -1220,6 +1220,9 @@ def multimod_compile( name, way, top_mod, extra_hc_opts ):
def multimod_compile_fail( name, way, top_mod, extra_hc_opts ):
return do_compile( name, way, True, top_mod, [], extra_hc_opts )
def multimod_compile_filter( name, way, top_mod, extra_hc_opts, filter_with, suppress_stdout=True ):
return do_compile( name, way, False, top_mod, [], extra_hc_opts, filter_with=filter_with, suppress_stdout=suppress_stdout )
def multi_compile( name, way, top_mod, extra_mods, extra_hc_opts ):
return do_compile( name, way, False, top_mod, extra_mods, extra_hc_opts)
......@@ -1459,12 +1462,14 @@ def simple_build(name: Union[TestName, str],
top_mod: Optional[Path],
link: bool,
addsuf: bool,
backpack: bool = False) -> Any:
backpack: bool = False,
suppress_stdout: bool = False,
filter_with: str = '') -> Any:
opts = getTestOpts()
# Redirect stdout and stderr to the same file
stdout = in_testdir(name, 'comp.stderr')
stderr = subprocess.STDOUT
stderr = subprocess.STDOUT if not suppress_stdout else None
if top_mod is not None:
srcname = top_mod
......@@ -1515,6 +1520,9 @@ def simple_build(name: Union[TestName, str],
'{{compiler}} {to_do} {srcname} {flags} {extra_hc_opts}'
).format(**locals())
if filter_with != '':
cmd = cmd + ' | ' + filter_with
exit_code = runCmd(cmd, None, stdout, stderr, opts.compile_timeout_multiplier)
actual_stderr_path = in_testdir(name, 'comp.stderr')
......
module T14373 where
data BigFam = A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P deriving (Enum, Show)
const T14373.A_closure+1;
const T14373.B_closure+2;
const T14373.C_closure+3;
const T14373.D_closure+3;
const T14373.E_closure+3;
const T14373.F_closure+3;
const T14373.G_closure+3;
const T14373.H_closure+3;
const T14373.I_closure+3;
const T14373.J_closure+3;
const T14373.K_closure+3;
const T14373.L_closure+3;
const T14373.M_closure+3;
const T14373.N_closure+3;
const T14373.O_closure+3;
const T14373.P_closure+3;
const T14373.A_closure+1;
const T14373.B_closure+2;
const T14373.C_closure+3;
const T14373.D_closure+4;
const T14373.E_closure+5;
const T14373.F_closure+6;
const T14373.G_closure+7;
const T14373.H_closure+7;
const T14373.I_closure+7;
const T14373.J_closure+7;
const T14373.K_closure+7;
const T14373.L_closure+7;
const T14373.M_closure+7;
const T14373.N_closure+7;
const T14373.O_closure+7;
const T14373.P_closure+7;
module T14373a where
data BigFam = A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P
{-# NOINLINE lateSwitch #-}
lateSwitch P = "Cool"
switch [0 .. 15]
case 15 : goto
default: {goto
module T14373b where
data BigFam = A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P
{-# NOINLINE earlySwitch #-}
earlySwitch A = True
earlySwitch B = False
earlySwitch C = False
switch [1 .. 3]
case 1 : goto
case 2 : goto
case 3 : goto
switch [2 .. 15]
case 2 : goto
default: {goto
switch [1 .. 7]
case 1 : goto
case 2 : goto
case 3 : goto
default: {goto
module T14373c where
data BigFam = A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P
{-# NOINLINE mixedSwitch #-}
mixedSwitch A = True
mixedSwitch B = False
mixedSwitch C = False
mixedSwitch P = True
switch [1 .. 3]
case 1 : goto
case 2 : goto
case 3 : goto
switch [2 .. 15]
case 2 : goto
case 15 : goto
default: {goto
switch [1 .. 7]
case 1 : goto
case 2 : goto
case 3 : goto
case 7 : goto
default: {goto
switch [6 .. 15]
case 15 : goto
default: {goto
module T14373d where
data BigFam = A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P
-- check that in all cases the default bloc is not duplicated
-- (but being jumped at)
{-# NOINLINE lateDefault #-}
lateDefault P = "Cool"
lateDefault _ = 'L' : "ate"
{-# NOINLINE earlyDefault #-}
earlyDefault B = "Cool"
earlyDefault _ = 'E' : "arly"
{-# NOINLINE mixedDefault #-}
mixedDefault B = "Cool"
mixedDefault P = "Cool"
mixedDefault _ = 'M' : "ixed"
[T14373d.lateDefault_entry() { //
switch [0 .. 15]
case 15 : goto
default: {goto
R1 = XYZ_closure+2;
[T14373d.earlyDefault_entry() { //
switch [1 .. 3]
case 2 : goto
default: {goto
R1 = XYZ_closure+2;
[T14373d.mixedDefault_entry() { //
switch [1 .. 3]
case 2 : goto
case 3 : goto
default: {goto
switch [2 .. 15]
case 15 : goto
default: {goto
R1 = XYZ_closure+2;
[T14373d.lateDefault_entry() { //
switch [0 .. 15]
case 15 : goto
default: {goto
R1 = XYZ_closure+2;
[T14373d.earlyDefault_entry() { //
switch [1 .. 7]
case 2 : goto
default: {goto
R1 = XYZ_closure+2;
[T14373d.mixedDefault_entry() { //
switch [1 .. 7]
case 2 : goto
case 7 : goto
default: {goto
switch [6 .. 15]
case 15 : goto
default: {goto
R1 = XYZ_closure+2;
......@@ -67,3 +67,25 @@ test('T17334', [ unless(have_ncg() and (arch('x86_64') or arch('i386')), skip)
, only_ways(['normal'])
], compile, ['-O'])
test('T14373', [],
multimod_compile_filter, ['T14373', '-fasm -O2 -c -ddump-cmm-from-stg',
'grep -e "const T14373\.._closure+.;"'])
switch_skeleton_only = 'grep -e "switch \[" -e "case " -e "default: " | sed -e "s|\] .*|\]|g" -e "s|goto .*|goto |g"'
test('T14373a', [],
multimod_compile_filter, ['T14373a', '-fasm -O2 -c -ddump-cmm-from-stg',
switch_skeleton_only])
test('T14373b', [],
multimod_compile_filter, ['T14373b', '-fasm -O2 -c -ddump-cmm-from-stg',
switch_skeleton_only])
test('T14373c', [],
multimod_compile_filter, ['T14373c', '-fasm -O2 -c -ddump-cmm-from-stg',
switch_skeleton_only])
switch_skeleton_and_entries_only = ('grep -e "switch \[" -e "case " -e "default: " -e "Default_entry(" -e "R1 = .*_closure+2;"'
'| sed -e "s|\] .*|\]|g" -e "s|goto .*|goto |g" -e "s|R1 = .*_closure+2;.*|R1 = XYZ_closure+2;|g" -e "s|//.*|//|g"')
test('T14373d', [],
multimod_compile_filter, ['T14373d', '-fasm -O2 -c -ddump-cmm-from-stg',
switch_skeleton_and_entries_only])
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment