Skip to content

Inlining of exit join points can be problematic given current code generator behaviour.

In #22893 we saw that inlining exit join points can be problematic.

Looking at the generated code it became quite obvious why. Consider something like this:

use_scanr = (Data.ByteString.scanr (+) 0)

Full core for 9.6/9.4 below:

Giving us the following STG (the final form of the Core before code generation) on 9.4.4:

BenchShort.$wuse_scanr [InlPrag=[2]]
  :: GHC.Prim.Addr#
     -> GHC.ForeignPtr.ForeignPtrContents
     -> GHC.Prim.Int#
     -> Data.ByteString.Internal.Type.ByteString
[GblId, Arity=3, Str=<L><L><L>, Unf=OtherCon []] =
    {} \r [ww_s3Xu ww1_s3Xv ww2_s3Xw]
        case +# [ww2_s3Xw 1#] of x_s3Xx {
        __DEFAULT ->
        case <# [x_s3Xx 0#] of {
          __DEFAULT ->
              case newPinnedByteArray# [x_s3Xx GHC.Prim.realWorld#] of {
              Solo# ipv1_s3XB ->
              case mutableByteArrayContents# [ipv1_s3XB] of ipv2_s3XC {
              __DEFAULT ->
              case plusAddr# [ipv2_s3XC ww2_s3Xw] of sat_s3XD [Occ=Once1] {
              __DEFAULT ->
              case
                  writeWord8OffAddr# [sat_s3XD 0# 0##8 GHC.Prim.void#]
              of
              s2_s3XE [Occ=Once1]
              {
              (##) ->
              let {
                ipv3_s3XF :: GHC.ForeignPtr.ForeignPtrContents
                [LclId, Unf=OtherCon []] =
                    CCCS GHC.ForeignPtr.PlainPtr! [ipv1_s3XB];
              } in 
                case touch# [ipv3_s3XF GHC.Prim.void#] of s'_s3XG [Occ=Once1] {
                (##) ->
                case -# [ww2_s3Xw 1#] of sat_s3XX [Occ=Once1] {
                __DEFAULT ->
                let-no-escape {
                  $w$j_s3XH [InlPrag=[2], Occ=OnceL1T[0], Dmd=S!P(L,L,L)]
                    :: Data.ByteString.Internal.Type.ByteString
                  [LclId[JoinId(0)(Nothing)], Unf=OtherCon []] =
                      {ipv2_s3XC, ipv3_s3XF, x_s3Xx} \r []
                          Data.ByteString.Internal.Type.BS [ipv2_s3XC ipv3_s3XF x_s3Xx];
                } in 
                  let-no-escape {
                    Rec {
                    $wscanr__s3XI [InlPrag=[2],
                                   Occ=LoopBreakerT[3],
                                   Dmd=SCS(C1(C1(L)))]
                      :: GHC.Prim.Word8#
                         -> GHC.Prim.Int#
                         -> GHC.Prim.State# GHC.Prim.RealWorld
                         -> Data.ByteString.Internal.Type.ByteString
                    [LclId[JoinId(3)(Nothing)],
                     Arity=3,
                     Str=<L><L><L>,
                     Unf=OtherCon []] =
                        {$wscanr__s3XI, ipv3_s3XF, ipv2_s3XC, ww1_s3Xv, ww_s3Xu,
                         $w$j_s3XH} \r [ww3_s3XJ ww4_s3XK void_0E]
                            case <# [ww4_s3XK 0#] of {
                              __DEFAULT ->
                                  case plusAddr# [ww_s3Xu ww4_s3XK] of sat_s3XN [Occ=Once1] {
                                  __DEFAULT ->
                                  case readWord8OffAddr# [sat_s3XN 0# GHC.Prim.void#] of {
                                  Solo# ipv7_s3XQ [Occ=Once1] ->
                                  case touch# [ww1_s3Xv GHC.Prim.void#] of s'1_s3XR [Occ=Once1] {
                                  (##) ->
                                  case plusWord8# [ipv7_s3XQ ww3_s3XJ] of x2_s3XS {
                                  __DEFAULT ->
                                  case plusAddr# [ipv2_s3XC ww4_s3XK] of sat_s3XT [Occ=Once1] {
                                  __DEFAULT ->
                                  case
                                      writeWord8OffAddr# [sat_s3XT 0# x2_s3XS GHC.Prim.void#]
                                  of
                                  s3_s3XU [Occ=Once1]
                                  {
                                  (##) ->
                                  case touch# [ipv3_s3XF GHC.Prim.void#] of s'2_s3XV [Occ=Once1] {
                                  (##) ->
                                  case -# [ww4_s3XK 1#] of sat_s3XW [Occ=Once1] {
                                  __DEFAULT -> $wscanr__s3XI x2_s3XS sat_s3XW GHC.Prim.void#;
                                  };
                                  };
                                  };
                                  };
                                  };
                                  };
                                  };
                                  };
                              1# -> $w$j_s3XH;
                            };
                    end Rec }
                  } in  $wscanr__s3XI 0##8 sat_s3XX GHC.Prim.void#;
                };
                };
              };
              };
              };
              };
          1# -> GHC.ForeignPtr.mallocPlainForeignPtrBytes2;
        };
        };

In 9.6, on the other hand, where we inline exit join points, we instead get:

BenchShort.$wuse_scanr [InlPrag=[2]]
  :: GHC.Prim.Addr#
     -> GHC.ForeignPtr.ForeignPtrContents
     -> GHC.Prim.Int#
     -> Data.ByteString.Internal.Type.ByteString
[GblId, Arity=3, Str=<L><L><L>, Unf=OtherCon []] =
    {} \r [ww_s4gR ww1_s4gS ww2_s4gT]
        case +# [ww2_s4gT 1#] of x_s4gU {
        __DEFAULT ->
        case <# [x_s4gU 0#] of {
          __DEFAULT ->
              case newPinnedByteArray# [x_s4gU GHC.Prim.realWorld#] of {
              Solo# ipv1_s4gY ->
              case mutableByteArrayContents# [ipv1_s4gY] of ipv2_s4gZ {
              __DEFAULT ->
              case plusAddr# [ipv2_s4gZ ww2_s4gT] of sat_s4h0 [Occ=Once1] {
              __DEFAULT ->
              case
                  writeWord8OffAddr# [sat_s4h0 0# 0#Word8 GHC.Prim.void#]
              of
              s2_s4h1 [Occ=Once1]
              {
              (##) ->
              let {
                ipv3_s4h2 :: GHC.ForeignPtr.ForeignPtrContents
                [LclId, Unf=OtherCon []] =
                    GHC.ForeignPtr.PlainPtr! [ipv1_s4gY];
              } in 
                case touch# [ipv3_s4h2 GHC.Prim.void#] of s'_s4h3 [Occ=Once1] {
                (##) ->
                case -# [ww2_s4gT 1#] of sat_s4hj [Occ=Once1] {
                __DEFAULT ->
                let-no-escape {
                  Rec {
                  $wscanr__s4h4 [InlPrag=[2],
                                 Occ=LoopBreakerT[3],
                                 Dmd=SC(S,C(1,C(1,L)))]
                    :: GHC.Prim.Word8#
                       -> GHC.Prim.Int#
                       -> GHC.Prim.State# GHC.Prim.RealWorld
                       -> Data.ByteString.Internal.Type.ByteString
                  [LclId[JoinId(3)(Nothing)],
                   Arity=3,
                   Str=<L><L><L>,
                   Unf=OtherCon []] =
                      {$wscanr__s4h4, ipv3_s4h2, ipv2_s4gZ, ww1_s4gS, ww_s4gR,
                       x_s4gU} \r [ww3_s4h5 ww4_s4h6 void_0E]
                          case <# [ww4_s4h6 0#] of {
                            __DEFAULT ->
                                case plusAddr# [ww_s4gR ww4_s4h6] of sat_s4h9 [Occ=Once1] {
                                __DEFAULT ->
                                case readWord8OffAddr# [sat_s4h9 0# GHC.Prim.void#] of {
                                Solo# ipv7_s4hc [Occ=Once1] ->
                                case touch# [ww1_s4gS GHC.Prim.void#] of s'1_s4hd [Occ=Once1] {
                                (##) ->
                                case plusWord8# [ipv7_s4hc ww3_s4h5] of x2_s4he {
                                __DEFAULT ->
                                case plusAddr# [ipv2_s4gZ ww4_s4h6] of sat_s4hf [Occ=Once1] {
                                __DEFAULT ->
                                case
                                    writeWord8OffAddr# [sat_s4hf 0# x2_s4he GHC.Prim.void#]
                                of
                                s3_s4hg [Occ=Once1]
                                {
                                (##) ->
                                case touch# [ipv3_s4h2 GHC.Prim.void#] of s'2_s4hh [Occ=Once1] {
                                (##) ->
                                case -# [ww4_s4h6 1#] of sat_s4hi [Occ=Once1] {
                                __DEFAULT -> $wscanr__s4h4 x2_s4he sat_s4hi GHC.Prim.void#;
                                };
                                };
                                };
                                };
                                };
                                };
                                };
                                };
                            1# ->
                                Data.ByteString.Internal.Type.BS [ipv2_s4gZ ipv3_s4h2 x_s4gU];
                          };
                  end Rec }
                } in  $wscanr__s4h4 0#Word8 sat_s4hj GHC.Prim.void#;
                };
                };
              };
              };
              };
              };
          1# -> GHC.ForeignPtr.mallocPlainForeignPtrBytes2;
        };
        };

The difference is that in 9.4 the inner loop looks (slightly simplified) something like this:

               join { $w$j_s3Uo = lazy (BS ipv2_s3Tw ipv3_s3Tv x_s3Tx) } in
               joinrec {
                 $wscanr__s3UA ww3_s3Us ww4_s3Uw eta_s3Uy
                   = case <# ww4_s3Uw 0# of {
                       __DEFAULT ->
                         case readWord8OffAddr# (plusAddr# ww_s3UE ww4_s3Uw) 0# eta_s3Uy of
                         { (# ipv6_i3RS, ipv7_i3RT #) ->
                         case touch# ww1_s3UF ipv6_i3RS of s'1_i3RV { __DEFAULT ->
                         let { x2_s3Tz = plusWord8# ipv7_i3RT ww3_s3Us } in
                         case writeWord8OffAddr#
                                (plusAddr# ipv2_s3Tw ww4_s3Uw) 0# x2_s3Tz s'1_i3RV
                         of s3_i3RZ
                         { __DEFAULT ->
                         case touch# ipv3_s3Tv s3_i3RZ of s'2_i3S0 { __DEFAULT ->
                         jump $wscanr__s3UA x2_s3Tz (-# ww4_s3Uw 1#) s'2_i3S0
                         }
                         }
                         }
                         };
                       1# -> jump $w$j_s3Uo
                     }; } in ...

While in 9.6 we currently inline the join point and get:

               joinrec {
                 $wscanr__s4dR ww3_s4dJ ww4_s4dN eta_s4dP
                   = case <# ww4_s4dN 0# of {
                       __DEFAULT ->
                         case readWord8OffAddr# (plusAddr# ww_s4dV ww4_s4dN) 0# eta_s4dP of
                         { (# ipv6_i4bg, ipv7_i4bh #) ->
                         case touch# ww1_s4dW ipv6_i4bg of s'1_i4bj { __DEFAULT ->
                         let { x2_s4cY = plusWord8# ipv7_i4bh ww3_s4dJ } in
                         case writeWord8OffAddr#
                                (plusAddr# ipv2_s4cS ww4_s4dN) 0# x2_s4cY s'1_i4bj
                         of s3_i4bn
                         { __DEFAULT ->
                         case touch# ipv3_s4cU s3_i4bn of s'2_i4bo { __DEFAULT ->
                         jump $wscanr__s4dR x2_s4cY (-# ww4_s4dN 1#) s'2_i4bo
                         }
                         }
                         }
                         };
                       1# -> lazy (BS ipv2_s4cS ipv3_s4cU x_s4cQ)
                     }; } in

This should seemingly produce the same code. However, looking at the Cmm, the difference becomes obvious:

In 9.4 we produce this beautiful Cmm for $wscanr__s3XI:

       inner_loop:
           if (%MO_S_Lt_W64(_s3XK::I64, 0)) goto exit_join_point; else goto c3YU;
       c3YU:
           _s3XQ::I8 = I8[_s3Xu::I64 + _s3XK::I64];
           call MO_Touch(_s3Xv::P64);
           _s3XS::I8 = _s3XQ::I8 + _s3XJ::I8;
           I8^[_s3XC::I64 + _s3XK::I64] = _s3XS::I8;
           call MO_Touch(_c3Yx::P64);
           _s3XK::I64 = _s3XK::I64 - 1;
           _s3XJ::I8 = _s3XS::I8;
           goto inner_loop;

In 9.6 since we inlined the join point we produce this abomination instead:

       inner_loop:
           Hp = Hp + 32;
           if (Hp > HpLim) (likely: False) goto c4i5; else goto c4i4;
       c4i5:
           HpAlloc = 32;
           I64[Sp - 24] = c4i1;
           I8[Sp - 16] = _s4h5::I8;
           I64[Sp - 8] = _s4h6::I64;
           I64[Sp] = _s4gZ::I64;
           P64[Sp + 24] = _c4hT::P64;
           Sp = Sp - 24;
           call stg_gc_noregs() returns to c4i1, args: 8, res: 8, upd: 8;
       c4i1:
           _c4hT::P64 = P64[Sp + 48];
           _s4gR::I64 = I64[Sp + 32];
           _s4gS::P64 = P64[Sp + 40];
           _s4gU::I64 = I64[Sp + 56];
           _s4gZ::I64 = I64[Sp + 24];
           _s4h5::I8 = I8[Sp + 8];
           _s4h6::I64 = I64[Sp + 16];
           Sp = Sp + 24;
           goto inner_loop;
       c4i4:
           if (%MO_S_Ge_W64(_s4h6::I64, 0)) goto _b_lt_0; else goto loop_exit;
       _b_lt_0:
           _s4hc::I8 = I8[_s4gR::I64 + _s4h6::I64];
           call MO_Touch(_s4gS::P64);
           _s4he::I8 = _s4hc::I8 + _s4h5::I8;
           I8^[_s4gZ::I64 + _s4h6::I64] = _s4he::I8;
           call MO_Touch(_c4hT::P64);
           Hp = Hp - 32;
           _s4h6::I64 = _s4h6::I64 - 1;
           _s4h5::I8 = _s4he::I8;
           goto inner_loop;
       loop_exit:

The underlying issue here is #16064

We have a recursive function where we place a heap check in the hot code path despite the recursive code path not actually allocating anything.

There is a note that is quite skeptical of this in Note [GC for conditionals] and the relevant code is in cgCase. I assume it's possible but non-trivial to address this in the code generator. So for now it seems best to avoid inlining allocating exit join points.

To upload designs, you'll need to enable LFS and have an admin enable hashed storage. More information