From 001483ed6e6d259d060e69ddc359765c96b2449c Mon Sep 17 00:00:00 2001 From: Matthew Pickering <matthewtpickering@gmail.com> Date: Mon, 19 Jul 2021 14:58:23 +0100 Subject: [PATCH] Check the buffer size *before* calling the continuation in withEncodedCString This fixes a very subtle bug in withEncodedCString where a reference would be kept to the whole continuation until the continuation had finished executing. This was because the call to tryFillBufferAndCall could fail, if the buffer was already full and so the `go` helper would be recursively called on failure which necessitated keeping a reference to `act`. The failure could only happen during the initial checking phase of the function but not during the call to the continuation. Therefore the fix is to first perform the size check, potentially recursively and then finally calling tail calling the continuation. In the real world, this broke writing lazy bytestrings because a reference to the head of the bytestring would be retained in the continuation until the whole string had been written to a file. Fixes #20107 (cherry picked from commit 509445b5947ce85499672399f5e88b6196af4c5a) --- libraries/base/GHC/Foreign.hs | 64 +++++++++++++++++++++++++++------- libraries/base/tests/T20107.hs | 11 ++++++ libraries/base/tests/all.T | 1 + 3 files changed, 63 insertions(+), 13 deletions(-) create mode 100644 libraries/base/tests/T20107.hs diff --git a/libraries/base/GHC/Foreign.hs b/libraries/base/GHC/Foreign.hs index 196005d3a7a3..658f5c3515a0 100644 --- a/libraries/base/GHC/Foreign.hs +++ b/libraries/base/GHC/Foreign.hs @@ -233,14 +233,23 @@ withEncodedCString (TextEncoding { mkTextEncoder = mk_encoder }) null_terminate let go !iteration to_sz_bytes = do putDebugMsg ("withEncodedCString: " ++ show iteration) allocaBytes to_sz_bytes $ \to_p -> do - mb_res <- tryFillBufferAndCall encoder null_terminate from to_p to_sz_bytes act + -- See Note [Check *before* fill in withEncodedCString] about why + -- this is subtle. + mb_res <- tryFillBuffer encoder null_terminate from to_p to_sz_bytes case mb_res of Nothing -> go (iteration + 1) (to_sz_bytes * 2) - Just res -> return res + Just to_buf -> withCStringBuffer to_buf null_terminate act -- If the input string is ASCII, this value will ensure we only allocate once go (0 :: Int) (cCharSize * (sz + 1)) +withCStringBuffer :: Buffer Word8 -> Bool -> (CStringLen -> IO r) -> IO r +withCStringBuffer to_buf null_terminate act = do + let bytes = bufferElems to_buf + withBuffer to_buf $ \to_ptr -> do + when null_terminate $ pokeElemOff to_ptr (bufR to_buf) 0 + act (castPtr to_ptr, bytes) -- NB: the length information is specified as being in *bytes* + {-# INLINE newEncodedCString #-} newEncodedCString :: TextEncoding -- ^ Encoding of CString to create -> Bool -- ^ Null-terminate? @@ -252,13 +261,13 @@ newEncodedCString (TextEncoding { mkTextEncoder = mk_encoder }) null_terminate s let go !iteration to_p to_sz_bytes = do putDebugMsg ("newEncodedCString: " ++ show iteration) - mb_res <- tryFillBufferAndCall encoder null_terminate from to_p to_sz_bytes return + mb_res <- tryFillBuffer encoder null_terminate from to_p to_sz_bytes case mb_res of Nothing -> do let to_sz_bytes' = to_sz_bytes * 2 to_p' <- reallocBytes to_p to_sz_bytes' go (iteration + 1) to_p' to_sz_bytes' - Just res -> return res + Just to_buf -> withCStringBuffer to_buf null_terminate return -- If the input string is ASCII, this value will ensure we only allocate once let to_sz_bytes = cCharSize * (sz + 1) @@ -266,9 +275,9 @@ newEncodedCString (TextEncoding { mkTextEncoder = mk_encoder }) null_terminate s go (0 :: Int) to_p to_sz_bytes -tryFillBufferAndCall :: TextEncoder dstate -> Bool -> Buffer Char -> Ptr Word8 -> Int - -> (CStringLen -> IO a) -> IO (Maybe a) -tryFillBufferAndCall encoder null_terminate from0 to_p to_sz_bytes act = do +tryFillBuffer :: TextEncoder dstate -> Bool -> Buffer Char -> Ptr Word8 -> Int + -> IO (Maybe (Buffer Word8)) +tryFillBuffer encoder null_terminate from0 to_p to_sz_bytes = do to_fp <- newForeignPtr_ to_p go (0 :: Int) (from0, emptyBuffer to_fp to_sz_bytes WriteBuffer) where @@ -278,14 +287,43 @@ tryFillBufferAndCall encoder null_terminate from0 to_p to_sz_bytes act = do if isEmptyBuffer from' then if null_terminate && bufferAvailable to' == 0 then return Nothing -- We had enough for the string but not the terminator: ask the caller for more buffer - else do - -- Awesome, we had enough buffer - let bytes = bufferElems to' - withBuffer to' $ \to_ptr -> do - when null_terminate $ pokeElemOff to_ptr (bufR to') 0 - fmap Just $ act (castPtr to_ptr, bytes) -- NB: the length information is specified as being in *bytes* + else return (Just to') else case why of -- We didn't consume all of the input InputUnderflow -> recover encoder from' to' >>= go (iteration + 1) -- These conditions are equally bad InvalidSequence -> recover encoder from' to' >>= go (iteration + 1) -- since the input was truncated/invalid OutputUnderflow -> return Nothing -- Oops, out of buffer during decoding: ask the caller for more +{- +Note [Check *before* fill in withEncodedCString] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +It's very important that the size check and readjustment peformed by tryFillBuffer +happens before the continuation is called. The size check is the part which can +fail, the call to the continuation never fails and so the caller should respond +first to the size check failing and *then* call the continuation. Making this evident +to the compiler avoids historic space leaks. + +In a previous interation of this code we had a pattern that, somewhat simplified, +looked like this: + +go :: State -> (State -> IO a) -> IO a +go state action = + case tryFillBufferAndCall state action of + Left state' -> go state' action + Right result -> result + +`tryFillBufferAndCall` performed some checks, and then we either called action, +or we modified the state and tried again. +This went wrong because `action` can be a function closure containing a reference to +a lazy data structure. If we call action directly, without retaining any references +to action, that is fine. The data structure is consumed as it is produced and we operate +in constant space. + +However the failure branch `go state' action` *does* capture a reference to action. +This went wrong because the reference to action in the failure branch only becomes +unreachable *after* action returns. This means we keep alive the function closure +for `action` until `action` returns. Which in turn keeps alive the *whole* lazy list +via `action` until the action has fully run. +This went wrong in #20107, where the continuation kept an entire lazy bytestring alive +rather than allowing it to be incrementaly consumed and collected. +-} diff --git a/libraries/base/tests/T20107.hs b/libraries/base/tests/T20107.hs new file mode 100644 index 000000000000..37c34c472e8c --- /dev/null +++ b/libraries/base/tests/T20107.hs @@ -0,0 +1,11 @@ +module Main where + +import Data.ByteString.Char8 (pack) +import Data.ByteString.Builder + +import qualified Data.ByteString.Lazy as L + +main = + L.writeFile "out" + . toLazyByteString . foldMap byteString + . replicate 10000000 $ pack "text" diff --git a/libraries/base/tests/all.T b/libraries/base/tests/all.T index 32dfaecf3143..4c2d0d0ae7c7 100644 --- a/libraries/base/tests/all.T +++ b/libraries/base/tests/all.T @@ -253,3 +253,4 @@ test('T15349', [exit_code(1), expect_broken_for(15349, ['ghci'])], compile_and_r test('T16111', exit_code(1), compile_and_run, ['']) test('T16943a', normal, compile_and_run, ['']) test('T16943b', normal, compile_and_run, ['']) +test('T20107', extra_run_opts('+RTS -M50M'), compile_and_run, ['-package bytestring']) -- GitLab