Commit 9897e8c8 authored by Gabor Greif, committed by Marge Bot

Implement pointer tagging for big families (#14373)

Formerly we punted on these and evaluated constructors always got a tag
of 1.

We now cascade switches: we first switch on the pointer tag and, when
it is MAX_PTR_TAG, fetch the precise constructor tag from the info table
and switch on that. The only technically tricky part is that the default
case is (logically) needed in both switches. To handle this we emit an
extra label for it and branch to that label from the second switch,
which avoids generating the default code twice.

Here's a simple example of the new code gen:

    data D = D1 | D2 | D3 | D4 | D5 | D6 | D7 | D8

On a 64-bit system all of these constructors were previously tagged 1.
With the new code gen D1 through D6 are tagged 1 through 6, while D7 and
D8 share the tag 7:

    [Lib.D7_con_entry() {
         ...
         {offset
           c1eu: // global
               R1 = R1 + 7;
               call (P64[Sp])(R1) args: 8, res: 0, upd: 8;
         }
     }]

    [Lib.D8_con_entry() {
         ...
         {offset
           c1ez: // global
               R1 = R1 + 7;
               call (P64[Sp])(R1) args: 8, res: 0, upd: 8;
         }
     }]

When switching we now look at the info table only when the tag is 7. For
example, if we derive Enum for the type above, the Cmm looks like this:

    c2Le:
        _s2Js::P64 = R1;
        _c2Lq::P64 = _s2Js::P64 & 7;
        switch [1 .. 7] _c2Lq::P64 {
            case 1 : goto c2Lk;
            case 2 : goto c2Ll;
            case 3 : goto c2Lm;
            case 4 : goto c2Ln;
            case 5 : goto c2Lo;
            case 6 : goto c2Lp;
            case 7 : goto c2Lj;
        }

    // Read info table for tag
    c2Lj:
        _c2Lv::I64 = %MO_UU_Conv_W32_W64(I32[I64[_s2Js::P64 & (-8)] - 4]);
        if (_c2Lv::I64 != 6) goto c2Lu; else goto c2Lt;

Generated Cmm sizes do not change much, but binaries are very slightly
larger because the new instructions are longer in encoded form. E.g.
previously the entry code for D8 above would be

    00000000000001c0 <Lib_D8_con_info>:
     1c0:	48 ff c3             	inc    %rbx
     1c3:	ff 65 00             	jmpq   *0x0(%rbp)

With this patch

    00000000000001d0 <Lib_D8_con_info>:
     1d0:	48 83 c3 07          	add    $0x7,%rbx
     1d4:	ff 65 00             	jmpq   *0x0(%rbp)

This is one byte longer.

Secondly, reading the info table directly and then switching on it is shorter

    _c1co:
            movq -1(%rbx),%rax
            movl -4(%rax),%eax
            // Switch on info table tag
            jmp *_n1d5(,%rax,8)

than first switching on the pointer tag and then, for tag 7, doing a second
switch on the info table tag:

    // When tag is 7
    _c1ct:
            andq $-8,%rbx
            movq (%rbx),%rax
            movl -4(%rax),%eax
            // Switch on info table tag
            ...

Some changes of binary sizes in actual programs:

- In NoFib the worst case is a 0.1% increase in the benchmark "parser" (see
  NoFib results below). All programs get slightly larger.

- Stage 2 compiler size does not change.

- In "containers" (the library) the size of all object files increases
  by 0.0005%. The size of the test program "bitqueue-properties"
  increases by 0.03%.

nofib benchmarks kindly provided by Ömer (@osa1):

NoFib Results
=============

--------------------------------------------------------------------------------
        Program           Size    Allocs    Instrs     Reads    Writes
--------------------------------------------------------------------------------
             CS          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
            CSD          +0.0%      0.0%      0.0%     +0.0%     +0.0%
             FS          +0.0%      0.0%      0.0%     +0.0%      0.0%
              S          +0.0%      0.0%     -0.0%      0.0%      0.0%
             VS          +0.0%      0.0%     -0.0%     +0.0%     +0.0%
            VSD          +0.0%      0.0%     -0.0%     +0.0%     -0.0%
            VSM          +0.0%      0.0%      0.0%      0.0%      0.0%
           anna          +0.0%      0.0%     +0.1%     -0.9%     -0.0%
           ansi          +0.0%      0.0%     -0.0%     +0.0%     +0.0%
           atom          +0.0%      0.0%      0.0%      0.0%      0.0%
         awards          +0.0%      0.0%     -0.0%     +0.0%      0.0%
         banner          +0.0%      0.0%     -0.0%     +0.0%      0.0%
     bernouilli          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
   binary-trees          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
          boyer          +0.0%      0.0%     +0.0%      0.0%     -0.0%
         boyer2          +0.0%      0.0%     +0.0%      0.0%     -0.0%
           bspt          +0.0%      0.0%     +0.0%     +0.0%      0.0%
      cacheprof          +0.0%      0.0%     +0.1%     -0.8%      0.0%
       calendar          +0.0%      0.0%     -0.0%     +0.0%     -0.0%
       cichelli          +0.0%      0.0%     +0.0%      0.0%      0.0%
        circsim          +0.0%      0.0%     -0.0%     -0.1%     -0.0%
       clausify          +0.0%      0.0%     +0.0%     +0.0%      0.0%
  comp_lab_zift          +0.0%      0.0%     +0.0%      0.0%     -0.0%
       compress          +0.0%      0.0%     +0.0%     +0.0%      0.0%
      compress2          +0.0%      0.0%      0.0%      0.0%      0.0%
    constraints          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
   cryptarithm1          +0.0%      0.0%     +0.0%      0.0%      0.0%
   cryptarithm2          +0.0%      0.0%     +0.0%     -0.0%      0.0%
            cse          +0.0%      0.0%     +0.0%     +0.0%      0.0%
   digits-of-e1          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
   digits-of-e2          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
         dom-lt          +0.0%      0.0%     +0.0%     +0.0%      0.0%
          eliza          +0.0%      0.0%     -0.0%     +0.0%      0.0%
          event          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
    exact-reals          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
         exp3_8          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
         expert          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
 fannkuch-redux          +0.0%      0.0%     +0.0%      0.0%      0.0%
          fasta          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
            fem          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
            fft          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
           fft2          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
       fibheaps          +0.0%      0.0%     +0.0%     +0.0%      0.0%
           fish          +0.0%      0.0%     +0.0%     +0.0%      0.0%
          fluid          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
         fulsom          +0.0%      0.0%     +0.0%     -0.0%     +0.0%
         gamteb          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
            gcd          +0.0%      0.0%     +0.0%     +0.0%      0.0%
    gen_regexps          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
         genfft          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
             gg          +0.0%      0.0%      0.0%     -0.0%      0.0%
           grep          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
         hidden          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
            hpg          +0.0%      0.0%     +0.0%     -0.1%     -0.0%
            ida          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
          infer          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
        integer          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
      integrate          +0.0%      0.0%      0.0%     +0.0%      0.0%
   k-nucleotide          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
          kahan          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
        knights          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
         lambda          +0.0%      0.0%     +1.2%     -6.1%     -0.0%
     last-piece          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
           lcss          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
           life          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
           lift          +0.0%      0.0%     +0.0%     +0.0%      0.0%
         linear          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
      listcompr          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
       listcopy          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
       maillist          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
         mandel          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
        mandel2          +0.0%      0.0%     +0.0%     +0.0%     -0.0%
           mate          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
        minimax          +0.0%      0.0%     -0.0%     +0.0%     -0.0%
        mkhprog          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
     multiplier          +0.0%      0.0%      0.0%     +0.0%     -0.0%
         n-body          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
       nucleic2          +0.0%      0.0%     +0.0%     +0.0%     -0.0%
           para          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
      paraffins          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
         parser          +0.1%      0.0%     +0.4%     -1.7%     -0.0%
        parstof          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
            pic          +0.0%      0.0%     +0.0%      0.0%     -0.0%
       pidigits          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
          power          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
         pretty          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
         primes          +0.0%      0.0%     +0.0%      0.0%      0.0%
      primetest          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
         prolog          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
         puzzle          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
         queens          +0.0%      0.0%      0.0%     +0.0%     +0.0%
        reptile          +0.0%      0.0%     +0.0%     +0.0%      0.0%
reverse-complem          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
        rewrite          +0.0%      0.0%     +0.0%      0.0%     -0.0%
           rfib          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
            rsa          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
            scc          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
          sched          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
            scs          +0.0%      0.0%     +0.0%     +0.0%      0.0%
         simple          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
          solid          +0.0%      0.0%     +0.0%     +0.0%      0.0%
        sorting          +0.0%      0.0%     +0.0%     -0.0%      0.0%
  spectral-norm          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
         sphere          +0.0%      0.0%     +0.0%     -1.0%      0.0%
         symalg          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
            tak          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
      transform          +0.0%      0.0%     +0.4%     -1.3%     +0.0%
       treejoin          +0.0%      0.0%     +0.0%     -0.0%      0.0%
      typecheck          +0.0%      0.0%     -0.0%     +0.0%      0.0%
        veritas          +0.0%      0.0%     +0.0%     -0.1%     +0.0%
           wang          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
      wave4main          +0.0%      0.0%     +0.0%      0.0%     -0.0%
   wheel-sieve1          +0.0%      0.0%     +0.0%     +0.0%     +0.0%
   wheel-sieve2          +0.0%      0.0%     +0.0%     +0.0%      0.0%
           x2n1          +0.0%      0.0%     +0.0%     +0.0%      0.0%
--------------------------------------------------------------------------------
            Min          +0.0%      0.0%     -0.0%     -6.1%     -0.0%
            Max          +0.1%      0.0%     +1.2%     +0.0%     +0.0%
 Geometric Mean          +0.0%     -0.0%     +0.0%     -0.1%     -0.0%

NoFib GC Results
================

--------------------------------------------------------------------------------
        Program           Size    Allocs    Instrs     Reads    Writes
--------------------------------------------------------------------------------
        circsim          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
    constraints          +0.0%      0.0%     -0.0%      0.0%     -0.0%
       fibheaps          +0.0%      0.0%      0.0%     -0.0%     -0.0%
         fulsom          +0.0%      0.0%      0.0%     -0.6%     -0.0%
       gc_bench          +0.0%      0.0%      0.0%      0.0%     -0.0%
           hash          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
           lcss          +0.0%      0.0%      0.0%     -0.0%      0.0%
      mutstore1          +0.0%      0.0%      0.0%     -0.0%     -0.0%
      mutstore2          +0.0%      0.0%     +0.0%     -0.0%     -0.0%
          power          +0.0%      0.0%     -0.0%      0.0%     -0.0%
     spellcheck          +0.0%      0.0%     -0.0%     -0.0%     -0.0%
--------------------------------------------------------------------------------
            Min          +0.0%      0.0%     -0.0%     -0.6%     -0.0%
            Max          +0.0%      0.0%     +0.0%      0.0%      0.0%
 Geometric Mean          +0.0%     +0.0%     +0.0%     -0.1%     +0.0%

Fixes #14373

The performance regressions listed below appear to be a fluke in CI. See
the discussion in !1742 for details.

Metric Increase:
    T6048
    T12234
    T12425
    Naperian
    T12150
    T5837
    T13035
parent f171b358
Pipeline #13551 passed with stages
in 479 minutes and 21 seconds
......@@ -362,20 +362,19 @@ type DynTag = Int -- The tag on a *pointer*
-- * big, otherwise.
--
-- Small families can have the constructor tag in the tag bits.
-- Big families only use the tag value 1 to represent evaluatedness.
-- Big families always use the tag values 1..mAX_PTR_TAG to represent
-- evaluatedness, the last one lumping together all overflowing ones.
-- We don't have very many tag bits: for example, we have 2 bits on
-- x86-32 and 3 bits on x86-64.
--
-- Also see Note [Tagging big families] in GHC.StgToCmm.Expr
isSmallFamily :: DynFlags -> Int -> Bool
isSmallFamily dflags fam_size = fam_size <= mAX_PTR_TAG dflags
tagForCon :: DynFlags -> DataCon -> DynTag
tagForCon dflags con
| isSmallFamily dflags fam_size = con_tag
| otherwise = 1
where
con_tag = dataConTag con -- NB: 1-indexed
fam_size = tyConFamilySize (dataConTyCon con)
tagForCon dflags con = min (dataConTag con) (mAX_PTR_TAG dflags)
-- NB: 1-indexed
tagForArity :: DynFlags -> RepArity -> DynTag
tagForArity dflags arity
......
This diff is collapsed.
......@@ -151,7 +151,7 @@ flattenCmmAGraph id (stmts_t, tscope) =
catAGraphs :: [CmmAGraph] -> CmmAGraph
catAGraphs = concatOL
-- | created a sequence "goto id; id:" as an AGraph
-- | creates a sequence "goto id; id:" as an AGraph
mkLabel :: BlockId -> CmmTickScope -> CmmAGraph
mkLabel bid scp = unitOL (CgLabel bid scp)
......@@ -159,7 +159,7 @@ mkLabel bid scp = unitOL (CgLabel bid scp)
mkMiddle :: CmmNode O O -> CmmAGraph
mkMiddle middle = unitOL (CgStmt middle)
-- | created a closed AGraph from a given node
-- | creates a closed AGraph from a given node
mkLast :: CmmNode O C -> CmmAGraph
mkLast last = unitOL (CgLast last)
......
......@@ -1220,6 +1220,9 @@ def multimod_compile( name, way, top_mod, extra_hc_opts ):
def multimod_compile_fail( name, way, top_mod, extra_hc_opts ):
return do_compile( name, way, True, top_mod, [], extra_hc_opts )
def multimod_compile_filter( name, way, top_mod, extra_hc_opts, filter_with, suppress_stdout=True ):
return do_compile( name, way, False, top_mod, [], extra_hc_opts, filter_with=filter_with, suppress_stdout=suppress_stdout )
def multi_compile( name, way, top_mod, extra_mods, extra_hc_opts ):
return do_compile( name, way, False, top_mod, extra_mods, extra_hc_opts)
......@@ -1459,12 +1462,14 @@ def simple_build(name: Union[TestName, str],
top_mod: Optional[Path],
link: bool,
addsuf: bool,
backpack: bool = False) -> Any:
backpack: bool = False,
suppress_stdout: bool = False,
filter_with: str = '') -> Any:
opts = getTestOpts()
# Redirect stdout and stderr to the same file
stdout = in_testdir(name, 'comp.stderr')
stderr = subprocess.STDOUT
stderr = subprocess.STDOUT if not suppress_stdout else None
if top_mod is not None:
srcname = top_mod
......@@ -1515,6 +1520,9 @@ def simple_build(name: Union[TestName, str],
'{{compiler}} {to_do} {srcname} {flags} {extra_hc_opts}'
).format(**locals())
if filter_with != '':
cmd = cmd + ' | ' + filter_with
exit_code = runCmd(cmd, None, stdout, stderr, opts.compile_timeout_multiplier)
actual_stderr_path = in_testdir(name, 'comp.stderr')
......
module T14373 where
data BigFam = A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P deriving (Enum, Show)
const T14373.A_closure+1;
const T14373.B_closure+2;
const T14373.C_closure+3;
const T14373.D_closure+3;
const T14373.E_closure+3;
const T14373.F_closure+3;
const T14373.G_closure+3;
const T14373.H_closure+3;
const T14373.I_closure+3;
const T14373.J_closure+3;
const T14373.K_closure+3;
const T14373.L_closure+3;
const T14373.M_closure+3;
const T14373.N_closure+3;
const T14373.O_closure+3;
const T14373.P_closure+3;
const T14373.A_closure+1;
const T14373.B_closure+2;
const T14373.C_closure+3;
const T14373.D_closure+4;
const T14373.E_closure+5;
const T14373.F_closure+6;
const T14373.G_closure+7;
const T14373.H_closure+7;
const T14373.I_closure+7;
const T14373.J_closure+7;
const T14373.K_closure+7;
const T14373.L_closure+7;
const T14373.M_closure+7;
const T14373.N_closure+7;
const T14373.O_closure+7;
const T14373.P_closure+7;
module T14373a where
data BigFam = A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P
{-# NOINLINE lateSwitch #-}
lateSwitch P = "Cool"
switch [0 .. 15]
case 15 : goto
default: {goto
module T14373b where
data BigFam = A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P
{-# NOINLINE earlySwitch #-}
earlySwitch A = True
earlySwitch B = False
earlySwitch C = False
switch [1 .. 3]
case 1 : goto
case 2 : goto
case 3 : goto
switch [2 .. 15]
case 2 : goto
default: {goto
switch [1 .. 7]
case 1 : goto
case 2 : goto
case 3 : goto
default: {goto
module T14373c where
data BigFam = A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P
{-# NOINLINE mixedSwitch #-}
mixedSwitch A = True
mixedSwitch B = False
mixedSwitch C = False
mixedSwitch P = True
switch [1 .. 3]
case 1 : goto
case 2 : goto
case 3 : goto
switch [2 .. 15]
case 2 : goto
case 15 : goto
default: {goto
switch [1 .. 7]
case 1 : goto
case 2 : goto
case 3 : goto
case 7 : goto
default: {goto
switch [6 .. 15]
case 15 : goto
default: {goto
module T14373d where
data BigFam = A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P
-- Check that in all cases the default block is not duplicated
-- (but is jumped to instead).
{-# NOINLINE lateDefault #-}
lateDefault P = "Cool"
lateDefault _ = 'L' : "ate"
{-# NOINLINE earlyDefault #-}
earlyDefault B = "Cool"
earlyDefault _ = 'E' : "arly"
{-# NOINLINE mixedDefault #-}
mixedDefault B = "Cool"
mixedDefault P = "Cool"
mixedDefault _ = 'M' : "ixed"
[T14373d.lateDefault_entry() { //
switch [0 .. 15]
case 15 : goto
default: {goto
R1 = XYZ_closure+2;
[T14373d.earlyDefault_entry() { //
switch [1 .. 3]
case 2 : goto
default: {goto
R1 = XYZ_closure+2;
[T14373d.mixedDefault_entry() { //
switch [1 .. 3]
case 2 : goto
case 3 : goto
default: {goto
switch [2 .. 15]
case 15 : goto
default: {goto
R1 = XYZ_closure+2;
[T14373d.lateDefault_entry() { //
switch [0 .. 15]
case 15 : goto
default: {goto
R1 = XYZ_closure+2;
[T14373d.earlyDefault_entry() { //
switch [1 .. 7]
case 2 : goto
default: {goto
R1 = XYZ_closure+2;
[T14373d.mixedDefault_entry() { //
switch [1 .. 7]
case 2 : goto
case 7 : goto
default: {goto
switch [6 .. 15]
case 15 : goto
default: {goto
R1 = XYZ_closure+2;
......@@ -67,3 +67,25 @@ test('T17334', [ unless(have_ncg() and (arch('x86_64') or arch('i386')), skip)
, only_ways(['normal'])
], compile, ['-O'])
# Tests for #14373 (pointer tagging for big constructor families).
#
# Every test compiles one module with -ddump-cmm-from-stg and pipes the dump
# through a shell filter, so that only the interesting lines are compared.
#
# The filter commands are raw strings: they contain regex escapes such as
# `\.`, `\[` and `\]`, which are invalid escape sequences in ordinary Python
# string literals. The resulting string values are unchanged.

# T14373: keep only the `const T14373.<C>_closure+<tag>;` lines, i.e. the tag
# added to each constructor's closure address.
test('T14373', [],
     multimod_compile_filter, ['T14373', '-fasm -O2 -c -ddump-cmm-from-stg',
     r'grep -e "const T14373\.._closure+.;"'])

# Keep only `switch [`, `case ` and `default: ` lines, then cut everything
# after the switch range's `]` and after `goto`, so that generated label and
# variable names do not appear in the output.
switch_skeleton_only = r'grep -e "switch \[" -e "case " -e "default: " | sed -e "s|\] .*|\]|g" -e "s|goto .*|goto |g"'

test('T14373a', [],
     multimod_compile_filter, ['T14373a', '-fasm -O2 -c -ddump-cmm-from-stg',
     switch_skeleton_only])
test('T14373b', [],
     multimod_compile_filter, ['T14373b', '-fasm -O2 -c -ddump-cmm-from-stg',
     switch_skeleton_only])
test('T14373c', [],
     multimod_compile_filter, ['T14373c', '-fasm -O2 -c -ddump-cmm-from-stg',
     switch_skeleton_only])

# As switch_skeleton_only, but additionally keep the `...Default_entry(` lines
# and the `R1 = ..._closure+2;` assignments (normalised to XYZ_closure), and
# truncate `//` comments.
switch_skeleton_and_entries_only = (r'grep -e "switch \[" -e "case " -e "default: " -e "Default_entry(" -e "R1 = .*_closure+2;"'
                                    r'| sed -e "s|\] .*|\]|g" -e "s|goto .*|goto |g" -e "s|R1 = .*_closure+2;.*|R1 = XYZ_closure+2;|g" -e "s|//.*|//|g"')

test('T14373d', [],
     multimod_compile_filter, ['T14373d', '-fasm -O2 -c -ddump-cmm-from-stg',
     switch_skeleton_and_entries_only])
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment