andArityType allocation regression - 9.2.4
From a client's code base we see the following function is eta-expanded in 9.2.3 but not 9.2.4 which leads to failures in their allocations benchmark.
See Encoding.hs
Compile with
ghc-9.2.3 -O2 Encoding.hs
ghc-9.2.4 -O2 Encoding.hs
{-# LANGUAGE BangPatterns, CPP, GeneralizedNewtypeDeriving, MagicHash,
UnliftedFFITypes #-}
{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE ViewPatterns #-}
{-# OPTIONS_GHC -ddump-simpl -ddump-to-file -fno-worker-wrapper #-}
module Encoding
( encodeUtf8BuilderEscaped
) where
import Data.Text.Internal
import GHC.Word
import Foreign.Ptr (Ptr, minusPtr, plusPtr)
import Foreign.Storable (poke, peekByteOff)
import qualified Data.ByteString.Builder as B
import qualified Data.ByteString.Builder.Internal as B hiding (empty, append)
import qualified Data.ByteString.Builder.Prim as BP
import qualified Data.ByteString.Builder.Prim.Internal as BP
import qualified Data.Text.Array as A
import qualified Data.Text.Internal.Encoding.Utf16 as U16
import Data.Bits
import Data.Char
#if defined(ASSERTS)
import GHC.Stack (HasCallStack)
#endif
{-# INLINE encodeUtf8BuilderEscaped #-}
-- TODO: Extend documentation with references to source code in @blaze-html@
-- or @aeson@ that uses this function.
encodeUtf8BuilderEscaped :: BP.BoundedPrim Word8 -> Text -> B.Builder
encodeUtf8BuilderEscaped be =
-- manual eta-expansion to ensure inlining works as expected
\txt -> B.builder (mkBuildstep txt)
where
bound = max 4 $ BP.sizeBound be
mkBuildstep (Text arr off len) !k =
outerLoop off
where
iend = off + len
outerLoop !i0 !br@(B.BufferRange op0 ope)
| i0 >= iend = k br
| outRemaining > 0 = goPartial (i0 + min outRemaining inpRemaining)
-- TODO: Use a loop with an integrated bound's check if outRemaining
-- is smaller than 8, as this will save on divisions.
| otherwise = return $ B.bufferFull bound op0 (outerLoop i0)
where
outRemaining = (ope `minusPtr` op0) `div` bound
inpRemaining = iend - i0
goPartial !iendTmp = go i0 op0
where
go !i !op
| i < iendTmp = case A.unsafeIndex arr i of
w | w <= 0x7F -> do
BP.runB be (fromIntegral w) op >>= go (i + 1)
| w <= 0x7FF -> do
poke8 @Word16 0 $ (w `shiftR` 6) + 0xC0
poke8 @Word16 1 $ (w .&. 0x3f) + 0x80
go (i + 1) (op `plusPtr` 2)
| 0xD800 <= w && w <= 0xDBFF -> do
let c = ord $ U16.chr2 w (A.unsafeIndex arr (i+1))
poke8 @Int 0 $ (c `shiftR` 18) + 0xF0
poke8 @Int 1 $ ((c `shiftR` 12) .&. 0x3F) + 0x80
poke8 @Int 2 $ ((c `shiftR` 6) .&. 0x3F) + 0x80
poke8 @Int 3 $ (c .&. 0x3F) + 0x80
go (i + 2) (op `plusPtr` 4)
| otherwise -> do
poke8 @Word16 0 $ (w `shiftR` 12) + 0xE0
poke8 @Word16 1 $ ((w `shiftR` 6) .&. 0x3F) + 0x80
poke8 @Word16 2 $ (w .&. 0x3F) + 0x80
go (i + 1) (op `plusPtr` 3)
| otherwise =
outerLoop i (B.BufferRange op ope)
where
-- Take care, a is either Word16 or Int above
poke8 :: Integral a => Int -> a -> IO ()
poke8 j v = poke (op `plusPtr` j) (fromIntegral v :: Word8)
Then look at the difference in core, you will see that in 9.2.3 we eta-expand but not in 9.2.4.
9.2.3 (see eta-expanded, State# token in outerLoop):
encodeUtf8BuilderEscaped [InlPrag=INLINE (sat-args=1)]
:: BP.BoundedPrim Word8 -> Text -> B.Builder
[GblId,
Arity=5,
Str=<MP(L,LCL(C1(C1(P(L,1P(L))))))><1P(L,L,L)><1CL(C1(L))><1P(L,L)><L>,
Unf=Unf{Src=InlineStable, TopLvl=True, Value=True, ConLike=True,
WorkFree=True, Expandable=True,
Guidance=ALWAYS_IF(arity=1,unsat_ok=False,boring_ok=False)
Tmpl= (\ (be_aGl :: BP.BoundedPrim Word8)
(eta_B0 [Occ=Once1!] :: Text)
(@r_a1FM)
(eta1_B1 [Occ=Once1] :: B.BuildStep r_a1FM)
(eta2_B2 [Occ=Once1] :: B.BufferRange)
(eta3_B3 [Occ=Once1, OS=OneShot]
:: GHC.Prim.State# GHC.Prim.RealWorld) ->
let {
bound_aGm :: Int
[LclId]
bound_aGm
= case be_aGl of
{ Data.ByteString.Builder.Prim.Internal.BP dt_a1zq _ [Occ=Dead] ->
case GHC.Prim.<=# 4# dt_a1zq of {
__DEFAULT -> GHC.Types.I# 4#;
1# -> GHC.Types.I# dt_a1zq
}
} } in
case eta_B0 of { Text dt_d1Fy dt1_d1Fz dt2_d1FA [Occ=Once1] ->
case eta1_B1 of k_X2 [Occ=OnceL1!] { __DEFAULT ->
9.2.4 (not eta-expanded)
-- RHS size: {terms: 5,165,
types: 1,469,
coercions: 569,
joins: 15/64}
encodeUtf8BuilderEscaped [InlPrag=INLINE (sat-args=1)]
:: BP.BoundedPrim Word8 -> Text -> B.Builder
[GblId,
Arity=4,
Str=<MP(L,LCL(C1(C1(P(L,1P(L))))))><1P(L,L,L)><1L><1P(L,L)>,
Unf=Unf{Src=InlineStable, TopLvl=True, Value=True, ConLike=True,
WorkFree=True, Expandable=True,
Guidance=ALWAYS_IF(arity=1,unsat_ok=False,boring_ok=False)
Tmpl= (\ (be_aGm :: BP.BoundedPrim Word8)
(eta_B0 [Occ=Once1!] :: Text)
(@r_a1FN)
(eta1_B1 [Occ=Once1] :: B.BuildStep r_a1FN)
(eta2_B2 [Occ=Once1] :: B.BufferRange) ->
let {
bound_aGn :: Int
[LclId]
bound_aGn
= case be_aGm of
{ Data.ByteString.Builder.Prim.Internal.BP dt_a1zr _ [Occ=Dead] ->
case GHC.Prim.<=# 4# dt_a1zr of {
__DEFAULT -> GHC.Types.I# 4#;
1# -> GHC.Types.I# dt_a1zr
}
} } in
case eta_B0 of { Text dt_d1Fz dt1_d1FA dt2_d1FB [Occ=Once1] ->
case eta1_B1 of k_X2 [Occ=OnceL1!] { __DEFAULT ->
let {
iend_s1Kg :: GHC.Prim.Int#
[LclId]
iend_s1Kg = GHC.Prim.+# dt1_d1FA dt2_d1FB } in
let {
iend1_aS0 [Occ=OnceL1] :: Int
[LclId, Unf=OtherCon []]
iend1_aS0 = GHC.Types.I# iend_s1Kg } in
letrec {
outerLoop_a1gh [Occ=LoopBreaker]
:: Int -> B.BufferRange -> IO (B.BuildSignal r_a1FN)
[LclId, Arity=2, Unf=OtherCon []]