Inlining of exit join points can be problematic given current code generator behaviour.
In #22893 we saw that inlining exit join points can be problematic.
Looking at the generated code it became quite obvious why. Consider something like this:
use_scanr = (Data.ByteString.scanr (+) 0)
The full output for 9.4 and 9.6 is given below.
On 9.4.4 this gives us the following (STG-level) code:
BenchShort.$wuse_scanr [InlPrag=[2]]
:: GHC.Prim.Addr#
-> GHC.ForeignPtr.ForeignPtrContents
-> GHC.Prim.Int#
-> Data.ByteString.Internal.Type.ByteString
[GblId, Arity=3, Str=<L><L><L>, Unf=OtherCon []] =
{} \r [ww_s3Xu ww1_s3Xv ww2_s3Xw]
case +# [ww2_s3Xw 1#] of x_s3Xx {
__DEFAULT ->
case <# [x_s3Xx 0#] of {
__DEFAULT ->
case newPinnedByteArray# [x_s3Xx GHC.Prim.realWorld#] of {
Solo# ipv1_s3XB ->
case mutableByteArrayContents# [ipv1_s3XB] of ipv2_s3XC {
__DEFAULT ->
case plusAddr# [ipv2_s3XC ww2_s3Xw] of sat_s3XD [Occ=Once1] {
__DEFAULT ->
case
writeWord8OffAddr# [sat_s3XD 0# 0##8 GHC.Prim.void#]
of
s2_s3XE [Occ=Once1]
{
(##) ->
let {
ipv3_s3XF :: GHC.ForeignPtr.ForeignPtrContents
[LclId, Unf=OtherCon []] =
CCCS GHC.ForeignPtr.PlainPtr! [ipv1_s3XB];
} in
case touch# [ipv3_s3XF GHC.Prim.void#] of s'_s3XG [Occ=Once1] {
(##) ->
case -# [ww2_s3Xw 1#] of sat_s3XX [Occ=Once1] {
__DEFAULT ->
let-no-escape {
$w$j_s3XH [InlPrag=[2], Occ=OnceL1T[0], Dmd=S!P(L,L,L)]
:: Data.ByteString.Internal.Type.ByteString
[LclId[JoinId(0)(Nothing)], Unf=OtherCon []] =
{ipv2_s3XC, ipv3_s3XF, x_s3Xx} \r []
Data.ByteString.Internal.Type.BS [ipv2_s3XC ipv3_s3XF x_s3Xx];
} in
let-no-escape {
Rec {
$wscanr__s3XI [InlPrag=[2],
Occ=LoopBreakerT[3],
Dmd=SCS(C1(C1(L)))]
:: GHC.Prim.Word8#
-> GHC.Prim.Int#
-> GHC.Prim.State# GHC.Prim.RealWorld
-> Data.ByteString.Internal.Type.ByteString
[LclId[JoinId(3)(Nothing)],
Arity=3,
Str=<L><L><L>,
Unf=OtherCon []] =
{$wscanr__s3XI, ipv3_s3XF, ipv2_s3XC, ww1_s3Xv, ww_s3Xu,
$w$j_s3XH} \r [ww3_s3XJ ww4_s3XK void_0E]
case <# [ww4_s3XK 0#] of {
__DEFAULT ->
case plusAddr# [ww_s3Xu ww4_s3XK] of sat_s3XN [Occ=Once1] {
__DEFAULT ->
case readWord8OffAddr# [sat_s3XN 0# GHC.Prim.void#] of {
Solo# ipv7_s3XQ [Occ=Once1] ->
case touch# [ww1_s3Xv GHC.Prim.void#] of s'1_s3XR [Occ=Once1] {
(##) ->
case plusWord8# [ipv7_s3XQ ww3_s3XJ] of x2_s3XS {
__DEFAULT ->
case plusAddr# [ipv2_s3XC ww4_s3XK] of sat_s3XT [Occ=Once1] {
__DEFAULT ->
case
writeWord8OffAddr# [sat_s3XT 0# x2_s3XS GHC.Prim.void#]
of
s3_s3XU [Occ=Once1]
{
(##) ->
case touch# [ipv3_s3XF GHC.Prim.void#] of s'2_s3XV [Occ=Once1] {
(##) ->
case -# [ww4_s3XK 1#] of sat_s3XW [Occ=Once1] {
__DEFAULT -> $wscanr__s3XI x2_s3XS sat_s3XW GHC.Prim.void#;
};
};
};
};
};
};
};
};
1# -> $w$j_s3XH;
};
end Rec }
} in $wscanr__s3XI 0##8 sat_s3XX GHC.Prim.void#;
};
};
};
};
};
};
1# -> GHC.ForeignPtr.mallocPlainForeignPtrBytes2;
};
};
In 9.6, on the other hand, where we inline exit join points, we instead get:
BenchShort.$wuse_scanr [InlPrag=[2]]
:: GHC.Prim.Addr#
-> GHC.ForeignPtr.ForeignPtrContents
-> GHC.Prim.Int#
-> Data.ByteString.Internal.Type.ByteString
[GblId, Arity=3, Str=<L><L><L>, Unf=OtherCon []] =
{} \r [ww_s4gR ww1_s4gS ww2_s4gT]
case +# [ww2_s4gT 1#] of x_s4gU {
__DEFAULT ->
case <# [x_s4gU 0#] of {
__DEFAULT ->
case newPinnedByteArray# [x_s4gU GHC.Prim.realWorld#] of {
Solo# ipv1_s4gY ->
case mutableByteArrayContents# [ipv1_s4gY] of ipv2_s4gZ {
__DEFAULT ->
case plusAddr# [ipv2_s4gZ ww2_s4gT] of sat_s4h0 [Occ=Once1] {
__DEFAULT ->
case
writeWord8OffAddr# [sat_s4h0 0# 0#Word8 GHC.Prim.void#]
of
s2_s4h1 [Occ=Once1]
{
(##) ->
let {
ipv3_s4h2 :: GHC.ForeignPtr.ForeignPtrContents
[LclId, Unf=OtherCon []] =
GHC.ForeignPtr.PlainPtr! [ipv1_s4gY];
} in
case touch# [ipv3_s4h2 GHC.Prim.void#] of s'_s4h3 [Occ=Once1] {
(##) ->
case -# [ww2_s4gT 1#] of sat_s4hj [Occ=Once1] {
__DEFAULT ->
let-no-escape {
Rec {
$wscanr__s4h4 [InlPrag=[2],
Occ=LoopBreakerT[3],
Dmd=SC(S,C(1,C(1,L)))]
:: GHC.Prim.Word8#
-> GHC.Prim.Int#
-> GHC.Prim.State# GHC.Prim.RealWorld
-> Data.ByteString.Internal.Type.ByteString
[LclId[JoinId(3)(Nothing)],
Arity=3,
Str=<L><L><L>,
Unf=OtherCon []] =
{$wscanr__s4h4, ipv3_s4h2, ipv2_s4gZ, ww1_s4gS, ww_s4gR,
x_s4gU} \r [ww3_s4h5 ww4_s4h6 void_0E]
case <# [ww4_s4h6 0#] of {
__DEFAULT ->
case plusAddr# [ww_s4gR ww4_s4h6] of sat_s4h9 [Occ=Once1] {
__DEFAULT ->
case readWord8OffAddr# [sat_s4h9 0# GHC.Prim.void#] of {
Solo# ipv7_s4hc [Occ=Once1] ->
case touch# [ww1_s4gS GHC.Prim.void#] of s'1_s4hd [Occ=Once1] {
(##) ->
case plusWord8# [ipv7_s4hc ww3_s4h5] of x2_s4he {
__DEFAULT ->
case plusAddr# [ipv2_s4gZ ww4_s4h6] of sat_s4hf [Occ=Once1] {
__DEFAULT ->
case
writeWord8OffAddr# [sat_s4hf 0# x2_s4he GHC.Prim.void#]
of
s3_s4hg [Occ=Once1]
{
(##) ->
case touch# [ipv3_s4h2 GHC.Prim.void#] of s'2_s4hh [Occ=Once1] {
(##) ->
case -# [ww4_s4h6 1#] of sat_s4hi [Occ=Once1] {
__DEFAULT -> $wscanr__s4h4 x2_s4he sat_s4hi GHC.Prim.void#;
};
};
};
};
};
};
};
};
1# ->
Data.ByteString.Internal.Type.BS [ipv2_s4gZ ipv3_s4h2 x_s4gU];
};
end Rec }
} in $wscanr__s4h4 0#Word8 sat_s4hj GHC.Prim.void#;
};
};
};
};
};
};
1# -> GHC.ForeignPtr.mallocPlainForeignPtrBytes2;
};
};
The difference is that in 9.4 the inner loop looks (slightly simplified) something like this:
join { $w$j_s3Uo = lazy (BS ipv2_s3Tw ipv3_s3Tv x_s3Tx) } in
joinrec {
$wscanr__s3UA ww3_s3Us ww4_s3Uw eta_s3Uy
= case <# ww4_s3Uw 0# of {
__DEFAULT ->
case readWord8OffAddr# (plusAddr# ww_s3UE ww4_s3Uw) 0# eta_s3Uy of
{ (# ipv6_i3RS, ipv7_i3RT #) ->
case touch# ww1_s3UF ipv6_i3RS of s'1_i3RV { __DEFAULT ->
let { x2_s3Tz = plusWord8# ipv7_i3RT ww3_s3Us } in
case writeWord8OffAddr#
(plusAddr# ipv2_s3Tw ww4_s3Uw) 0# x2_s3Tz s'1_i3RV
of s3_i3RZ
{ __DEFAULT ->
case touch# ipv3_s3Tv s3_i3RZ of s'2_i3S0 { __DEFAULT ->
jump $wscanr__s3UA x2_s3Tz (-# ww4_s3Uw 1#) s'2_i3S0
}
}
}
};
1# -> jump $w$j_s3Uo
}; } in ...
While in 9.6 we currently inline the join point and get:
joinrec {
$wscanr__s4dR ww3_s4dJ ww4_s4dN eta_s4dP
= case <# ww4_s4dN 0# of {
__DEFAULT ->
case readWord8OffAddr# (plusAddr# ww_s4dV ww4_s4dN) 0# eta_s4dP of
{ (# ipv6_i4bg, ipv7_i4bh #) ->
case touch# ww1_s4dW ipv6_i4bg of s'1_i4bj { __DEFAULT ->
let { x2_s4cY = plusWord8# ipv7_i4bh ww3_s4dJ } in
case writeWord8OffAddr#
(plusAddr# ipv2_s4cS ww4_s4dN) 0# x2_s4cY s'1_i4bj
of s3_i4bn
{ __DEFAULT ->
case touch# ipv3_s4cU s3_i4bn of s'2_i4bo { __DEFAULT ->
jump $wscanr__s4dR x2_s4cY (-# ww4_s4dN 1#) s'2_i4bo
}
}
}
};
1# -> lazy (BS ipv2_s4cS ipv3_s4cU x_s4cQ)
}; } in
This should seemingly produce the same code. However looking at Cmm the difference becomes obvious:
In 9.4 we produce this beautiful Cmm for the inner loop ($wscanr__s3XI):
inner_loop:
if (%MO_S_Lt_W64(_s3XK::I64, 0)) goto exit_join_point; else goto c3YU;
c3YU:
_s3XQ::I8 = I8[_s3Xu::I64 + _s3XK::I64];
call MO_Touch(_s3Xv::P64);
_s3XS::I8 = _s3XQ::I8 + _s3XJ::I8;
I8^[_s3XC::I64 + _s3XK::I64] = _s3XS::I8;
call MO_Touch(_c3Yx::P64);
_s3XK::I64 = _s3XK::I64 - 1;
_s3XJ::I8 = _s3XS::I8;
goto inner_loop;
In 9.6 since we inlined the join point we produce this abomination instead:
inner_loop:
Hp = Hp + 32;
if (Hp > HpLim) (likely: False) goto c4i5; else goto c4i4;
c4i5:
HpAlloc = 32;
I64[Sp - 24] = c4i1;
I8[Sp - 16] = _s4h5::I8;
I64[Sp - 8] = _s4h6::I64;
I64[Sp] = _s4gZ::I64;
P64[Sp + 24] = _c4hT::P64;
Sp = Sp - 24;
call stg_gc_noregs() returns to c4i1, args: 8, res: 8, upd: 8;
c4i1:
_c4hT::P64 = P64[Sp + 48];
_s4gR::I64 = I64[Sp + 32];
_s4gS::P64 = P64[Sp + 40];
_s4gU::I64 = I64[Sp + 56];
_s4gZ::I64 = I64[Sp + 24];
_s4h5::I8 = I8[Sp + 8];
_s4h6::I64 = I64[Sp + 16];
Sp = Sp + 24;
goto inner_loop;
c4i4:
if (%MO_S_Ge_W64(_s4h6::I64, 0)) goto _b_lt_0; else goto loop_exit;
_b_lt_0:
_s4hc::I8 = I8[_s4gR::I64 + _s4h6::I64];
call MO_Touch(_s4gS::P64);
_s4he::I8 = _s4hc::I8 + _s4h5::I8;
I8^[_s4gZ::I64 + _s4h6::I64] = _s4he::I8;
call MO_Touch(_c4hT::P64);
Hp = Hp - 32;
_s4h6::I64 = _s4h6::I64 - 1;
_s4h5::I8 = _s4he::I8;
goto inner_loop;
loop_exit:
The underlying issue here is #16064.
We have a recursive function where we place a heap check in the hot code path despite the recursive code path not actually allocating anything.
There is a note that is quite skeptical of this, Note [GC for conditionals], and the relevant code is in cgCase.
I assume it's possible but non-trivial to address this in the code generator. So for now it seems best to avoid inlining allocating exit join points.