GHC fails to optimize IO
Summary
While manually writing a bottom-up parser for a simple arithmetic language I noticed some redundant allocations in the STG even when compiling with -O2
.
Steps to reproduce
Here is the parser that I'm writing. I've written two versions of step0
, step1
and step4
where I've manually applied an optimization to the version without the '
.
{-# LANGUAGE BlockArguments #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE UnboxedTuples #-}
{-# LANGUAGE UnliftedDatatypes #-}
{-# LANGUAGE UnliftedNewtypes #-}
{-# LANGUAGE BangPatterns #-}
module BottomUp.ByteString (parseFile, step0', step1', step4') where
-- import Expr ( Expr(..), Op(..) )
import Data.Word ( Word64 )
import Data.ByteString (ByteString)
import qualified Data.ByteString.Char8 as BS
import GHC.Exts (compactNew#, compactAdd#, Compact#, UnliftedType)
import GHC.IO (IO (IO), unIO)
import Data.Char (isSpace)
import Data.Kind (Type)
data Expr = Bin Op Expr Expr | Num Word64
data Op = Add | Sub | Mul | Div
type SnocList :: Type -> UnliftedType
data SnocList a = SnocList a :> !a | Lin
infixl :>
newtype Closure = MkC (ByteString -> SnocList Closure -> Expr -> IO (Maybe Expr))
-- 0 -- ( --> ^
-- 0 -- N --> 1
step0 :: Compact# -> ByteString -> SnocList Closure -> IO (Maybe Expr)
step0 c bs xs = IO \s ->
case BS.uncons bs of
Just ('(', bs') -> unIO (step0 c bs' (xs :> MkC \bs'' xs'' x -> step1 c bs'' xs'' x)) s
_ | Just (n, bs') <- BS.readInt bs -> unIO (step1 c bs' xs =<< num c (fromIntegral n)) s
Just (spc, bs') | isSpace spc -> unIO (step0 c (BS.dropWhile isSpace bs') xs) s
_ -> unIO (pure Nothing) s
step0' :: Compact# -> ByteString -> SnocList Closure -> IO (Maybe Expr)
step0' c bs xs =
case BS.uncons bs of
Just ('(', bs') -> step0 c bs' (xs :> MkC \bs'' xs'' x -> step1 c bs'' xs'' x)
_ | Just (n, bs') <- BS.readInt bs -> step1 c bs' xs =<< num c (fromIntegral n)
Just (spc, bs') | isSpace spc -> step0 c (BS.dropWhile isSpace bs') xs
_ -> pure Nothing
-- 1 -- ) --> v
-- 1 -- */ --> 2
-- 1 -- +- --> 3
step1 :: Compact# -> ByteString -> SnocList Closure -> Expr -> IO (Maybe Expr)
step1 c bs xs !x = IO \s ->
case BS.uncons bs of
Just (')', bs') | xs' :> MkC f <- xs -> unIO (f bs' xs' x) s
Just ('*', bs') -> unIO (step2 c bs' xs x Mul) s
Just ('/', bs') -> unIO (step2 c bs' xs x Div) s
Just ('+', bs') -> unIO (step3 c bs' xs x Mul) s
Just ('-', bs') -> unIO (step3 c bs' xs x Div) s
Nothing -> unIO (pure (Just x)) s
Just (spc, bs') | isSpace spc -> unIO (step0 c (BS.dropWhile isSpace bs') xs) s
_ -> unIO (pure Nothing) s
step1' :: Compact# -> ByteString -> SnocList Closure -> Expr -> IO (Maybe Expr)
step1' c bs xs !x =
case BS.uncons bs of
Just (')', bs') | xs' :> MkC f <- xs -> f bs' xs' x
Just ('*', bs') -> step2 c bs' xs x Mul
Just ('/', bs') -> step2 c bs' xs x Div
Just ('+', bs') -> step3 c bs' xs x Mul
Just ('-', bs') -> step3 c bs' xs x Div
Nothing -> pure (Just x)
Just (spc, bs') | isSpace spc -> step0 c (BS.dropWhile isSpace bs') xs
_ -> pure Nothing
-- 2 -- ( --> ^
-- 2 -- N --> 1
step2 :: Compact# -> ByteString -> SnocList Closure -> Expr -> Op -> IO (Maybe Expr)
step2 c bs xs !x !o =
case BS.uncons bs of
Just ('(', bs') -> step0 c bs' (xs :> MkC \bs'' xs'' y -> step1 c bs'' xs'' =<< bin c o x y)
_ | Just (n, bs') <- BS.readInt bs -> step1 c bs' xs =<< bin c o x =<< num c (fromIntegral n)
Just (s, bs') | isSpace s -> step0 c (BS.dropWhile isSpace bs') xs
_ -> pure Nothing
-- 3 -- ( --> ^
-- 3 -- N --> 4
step3 :: Compact# -> ByteString -> SnocList Closure -> Expr -> Op -> IO (Maybe Expr)
step3 c bs xs !x !o =
case BS.uncons bs of
Just ('(', bs') -> step0 c bs' (xs :> MkC \bs'' xs'' y -> step4 c bs'' xs'' x o y)
_ | Just (n, bs') <- BS.readInt bs -> step4 c bs' xs x o =<< num c (fromIntegral n)
Just (s, bs') | isSpace s -> step0 c (BS.dropWhile isSpace bs') xs
_ -> pure Nothing
-- 4 -- ) --> v
-- 4 -- */ --> 5
-- 4 -- +- --> 3
step4' :: Compact# -> ByteString -> SnocList Closure -> Expr -> Op -> Expr -> IO (Maybe Expr)
step4' c bs xs !x !o !y =
case BS.uncons bs of
Just (')', bs') | xs' :> MkC f <- xs -> f bs' xs' =<< bin c o x y
Just ('*', bs') -> step5 c bs' xs x o y Mul
Just ('/', bs') -> step5 c bs' xs x o y Div
Just ('+', bs') -> (\x' -> step3 c bs' xs x' Add) =<< bin c o x y
Just ('-', bs') -> (\x' -> step3 c bs' xs x' Sub) =<< bin c o x y
Nothing -> Just <$> bin c o x y
Just (spc, bs') | isSpace spc -> step0 c (BS.dropWhile isSpace bs') xs
_ -> pure Nothing
step4 :: Compact# -> ByteString -> SnocList Closure -> Expr -> Op -> Expr -> IO (Maybe Expr)
step4 c bs xs !x !o !y = IO \s ->
case BS.uncons bs of
Just (')', bs') | xs' :> MkC f <- xs -> unIO (f bs' xs' =<< bin c o x y) s
Just ('*', bs') -> unIO (step5 c bs' xs x o y Mul) s
Just ('/', bs') -> unIO (step5 c bs' xs x o y Div) s
Just ('+', bs') -> unIO ((\x' -> step3 c bs' xs x' Add) =<< bin c o x y) s
Just ('-', bs') -> unIO ((\x' -> step3 c bs' xs x' Sub) =<< bin c o x y) s
Nothing -> unIO (Just <$> bin c o x y) s
Just (spc, bs') | isSpace spc -> unIO (step0 c (BS.dropWhile isSpace bs') xs) s
_ -> unIO (pure Nothing) s
-- 5 -- ( --> ^
-- 5 -- N --> 4
step5 :: Compact# -> ByteString -> SnocList Closure -> Expr -> Op -> Expr -> Op -> IO (Maybe Expr)
step5 c bs xs !x !o1 !y !o2 =
case BS.uncons bs of
Just ('(', bs') -> step0 c bs' (xs :> MkC \bs'' xs'' z -> step4 c bs'' xs'' x o1 =<< bin c o2 y z)
_ | Just (n, bs') <- BS.readInt bs -> step4 c bs' xs x o1 =<< bin c o2 y =<< num c (fromIntegral n)
Just (s, bs') | isSpace s -> step0 c (BS.dropWhile isSpace bs') xs
_ -> pure Nothing
num :: Compact# -> Word64 -> IO Expr
num c !x = IO (compactAdd# c (Num x))
bin :: Compact# -> Op -> Expr -> Expr -> IO Expr
bin c !op !x !y = IO (compactAdd# c (Bin op x y))
parse :: ByteString -> IO (Maybe Expr)
parse t = IO \s ->
case compactNew# 65536## s of { (# s', c #) ->
case step0 c t Lin of { IO io ->
io s'
} }
parseFile :: FilePath -> IO (Maybe Expr)
parseFile path = parse =<< BS.readFile path
Expected behavior
The STG produced for the step1'
and step4'
functions contains unnecessary allocations, e.g. this is in the STG of the step1'
function:
$wstep1'
:: Compact#
-> Addr#
-> ForeignPtrContents
-> Int#
-> SnocList Closure
-> Expr
-> IO (Maybe Expr) =
[...]
let-no-escape {
$j_s2Fq :: IO (Maybe Expr) =
\r []
let {
sat_s2FA
:: State# RealWorld -> (# State# RealWorld, Maybe Expr #) =
\r [void_0E]
case $wfirstnonspace bs'_s2Fj 0# bs'1_s2Fk realWorld# of {
Solo# ww4_s2Fu ->
case ==# [ww4_s2Fu bs'1_s2Fk] of {
__DEFAULT ->
case touch# [ww1_s2F9 void#] of {
(##) ->
case -# [bs'1_s2Fk ww4_s2Fu] of sat_s2Fy {
__DEFAULT ->
case plusAddr# [bs'_s2Fj ww4_s2Fu] of sat_s2Fx {
__DEFAULT ->
$wstep0
c_s2F7 sat_s2Fx ww1_s2F9 sat_s2Fy xs_s2Fb void#;
};
};
};
1# ->
case touch# [ww1_s2F9 void#] of {
(##) ->
$wstep0 c_s2F7 __NULL FinalPtr 0# xs_s2Fb void#;
};
};
};
} in sat_s2FA;
} in
[...]
That should be the same as in the step1
function:
$wstep1_r2ti
:: Compact#
-> Addr#
-> ForeignPtrContents
-> Int#
-> SnocList Closure
-> Expr
-> State# RealWorld
-> (# State# RealWorld, Maybe Expr #) =
[...]
let-no-escape {
$j_s3y9 :: (# State# RealWorld, Maybe Expr #) =
\r []
case $wfirstnonspace bs'_s3y2 0# bs'1_s3y3 realWorld# of {
Solo# ww4_s3yc ->
case ==# [ww4_s3yc bs'1_s3y3] of {
__DEFAULT ->
case touch# [ww1_s3xR void#] of {
(##) ->
case -# [bs'1_s3y3 ww4_s3yc] of sat_s3yg {
__DEFAULT ->
case plusAddr# [bs'_s3y2 ww4_s3yc] of sat_s3yf {
__DEFAULT ->
$wstep0 c_s3xP sat_s3yf ww1_s3xR sat_s3yg xs_s3xT void#;
};
};
};
1# ->
case touch# [ww1_s3xR void#] of {
(##) -> $wstep0 c_s3xP __NULL FinalPtr 0# xs_s3xT void#;
};
};
};
} in
[...]
Concretely, the benchmark that I'm using takes 82.8 ms and allocates 292 MB in total without this optimization compared to 76.9 ms and 199 MB total allocation with the optimization.
Environment
- GHC version used: 9.2.4 and 9.4.2
Optional:
- Operating System:
- System Architecture: