Commit 204f6ac2 authored by Xia Li-yao's avatar Xia Li-yao Committed by Bodigrim
Browse files

Fix UTF-8 decoding of lazy bytestrings

At the beginning of a new chunk we may be trying to complete a UTF-8
sequence started in the previous chunk (contained in the `undecode0`
buffer). If it turns out to be invalid, we must apply the `onErr`
handler to every character in that buffer.

When we reach the end of the chunk, we must also be more careful
about when to keep the previous buffer: a UTF-8 sequence (up to 4 bytes)
can span more than two chunks, when those chunks are very short
(of length 0, 1, or 2).
parent 8d1b6ff5
......@@ -64,6 +64,7 @@ import Control.Monad.ST (runST)
import Data.Bits ((.&.))
import Data.ByteString as B
import qualified Data.ByteString.Internal as B
import Data.Foldable (traverse_)
import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
import Data.Text.Internal (Text(..), safe, text)
import Data.Text.Internal.Functions
......@@ -275,7 +276,7 @@ newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
streamDecodeUtf8 :: ByteString -> Decoding
streamDecodeUtf8 = streamDecodeUtf8With strictDecode
-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
-- | Decode, in a stream oriented way, a lazy 'ByteString' containing UTF-8
-- encoded text.
--
-- @since 1.0.0.0
......@@ -283,11 +284,14 @@ streamDecodeUtf8With :: OnDecodeError -> ByteString -> Decoding
streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
where
-- We create a slightly larger than necessary buffer to accommodate a
-- potential surrogate pair started in the last buffer
-- potential surrogate pair started in the last buffer (@undecoded0@), or
-- replacement characters for each byte in @undecoded0@ if the
-- sequence turns out to be invalid. There can be up to three bytes there,
-- hence we allocate @len+3@ 16-bit words.
decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString
-> Decoding
decodeChunk undecoded0 codepoint0 state0 bs = withBS bs aux where
aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+3)
where
decodeChunkToBuffer :: A.MArray s -> IO Decoding
decodeChunkToBuffer dest = unsafeWithForeignPtr fp $ \ptr ->
......@@ -297,23 +301,32 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
with nullPtr $ \curPtrPtr ->
let end = ptr `plusPtr` len
loop curPtr = do
prevState <- peek statePtr
poke curPtrPtr curPtr
curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
lastPtr <- c_decode_utf8_with_state (A.maBA dest) destOffPtr
curPtrPtr end codepointPtr statePtr
state <- peek statePtr
case state of
UTF8_REJECT -> do
-- We encountered an encoding error
x <- peek curPtr'
poke statePtr 0
case onErr desc (Just x) of
Nothing -> loop $ curPtr' `plusPtr` 1
Just c -> do
destOff <- peek destOffPtr
w <- unsafeSTToIO $
unsafeWrite dest (fromIntegral destOff) (safe c)
poke destOffPtr (destOff + fromIntegral w)
loop $ curPtr' `plusPtr` 1
let skipByte x = case onErr desc (Just x) of
Nothing -> return ()
Just c -> do
destOff <- peek destOffPtr
w <- unsafeSTToIO $
unsafeWrite dest (fromIntegral destOff) (safe c)
poke destOffPtr (destOff + fromIntegral w)
if ptr == lastPtr && prevState /= UTF8_ACCEPT then do
-- If we can't complete the sequence @undecoded0@ from
-- the previous chunk, we invalidate the bytes from
-- @undecoded0@ and retry decoding the current chunk from
-- the initial state.
traverse_ skipByte (B.unpack undecoded0 )
loop lastPtr
else do
peek lastPtr >>= skipByte
loop (lastPtr `plusPtr` 1)
_ -> do
-- We encountered the end of the buffer while decoding
......@@ -322,11 +335,11 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
chunkText <- unsafeSTToIO $ do
arr <- A.unsafeFreeze dest
return $! text arr 0 (fromIntegral n)
lastPtr <- peek curPtrPtr
let left = lastPtr `minusPtr` curPtr
let left = lastPtr `minusPtr` ptr
!undecoded = case state of
UTF8_ACCEPT -> B.empty
_ -> B.append undecoded0 (B.drop left bs)
_ | left == 0 && prevState /= UTF8_ACCEPT -> B.append undecoded0 bs
| otherwise -> B.drop left bs
return $ Some chunkText undecoded
(decodeChunk undecoded codepoint state)
in loop ptr
......
......@@ -100,9 +100,13 @@ decodeUtf8With onErr (B.Chunk b0 bs0) =
TE.Some t l f -> chunk t (go f l bs)
go _ l _
| S.null l = empty
| otherwise = case onErr desc (Just (B.unsafeHead l)) of
Nothing -> empty
Just c -> Chunk (T.singleton c) Empty
| otherwise =
let !t = T.pack (skipBytes l)
skipBytes = S.foldr (\x s' ->
case onErr desc (Just x) of
Just c -> c : s'
Nothing -> s') [] in
Chunk t Empty
desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream"
decodeUtf8With _ _ = empty
......
......@@ -24,6 +24,7 @@ import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL
import qualified Data.Text as T
import qualified Data.Text.Encoding as E
import qualified Data.Text.Encoding.Error as E
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Encoding as EL
......@@ -152,6 +153,18 @@ genInvalidUTF8 = B.pack <$> oneof [
k <- choose (0,n)
vectorOf k gen
decodeLL :: BL.ByteString -> TL.Text
decodeLL = EL.decodeUtf8With E.lenientDecode
decodeL :: B.ByteString -> T.Text
decodeL = E.decodeUtf8With E.lenientDecode
-- The lenient decoding of lazy bytestrings should not depend on how they are chunked,
-- and it should behave the same as decoding of strict bytestrings.
t_decode_utf8_lenient :: Property
t_decode_utf8_lenient = forAllShrinkShow arbitrary shrink (show . BL.toChunks) $ \bs ->
decodeLL bs === (TL.fromStrict . decodeL . B.concat . BL.toChunks) bs
-- See http://unicode.org/faq/utf_bom.html#gen8
-- A sequence such as <110xxxxx2 0xxxxxxx2> is illegal ...
-- When faced with this illegal byte sequence ... a UTF-8 conformant process
......@@ -206,6 +219,7 @@ testTranscoding =
testProperty "t_utf8_err'" t_utf8_err'
],
testGroup "error recovery" [
testProperty "t_decode_utf8_lenient" t_decode_utf8_lenient,
testProperty "t_decode_with_error2" t_decode_with_error2,
testProperty "t_decode_with_error3" t_decode_with_error3,
testProperty "t_decode_with_error4" t_decode_with_error4,
......
......@@ -49,6 +49,7 @@ import Test.QuickCheck.Monadic (assert, monadicIO, run)
import Test.QuickCheck.Unicode (string)
import Tests.Utils
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL
import qualified Data.Text as T
import qualified Data.Text.Encoding.Error as T
import qualified Data.Text.Internal.Fusion as TF
......@@ -61,6 +62,9 @@ import qualified System.IO as IO
genUnicode :: IsString a => Gen a
genUnicode = fromString <$> string
genWord8 :: Gen Word8
genWord8 = chooseAny
instance Random I16 where
randomR = integralRandomR
random = randomR (minBound,maxBound)
......@@ -70,9 +74,23 @@ instance Arbitrary I16 where
shrink = shrinkIntegral
instance Arbitrary B.ByteString where
arbitrary = B.pack `fmap` arbitrary
arbitrary = B.pack `fmap` listOf genWord8
shrink = map B.pack . shrink . B.unpack
instance Arbitrary BL.ByteString where
arbitrary = oneof
[ BL.fromChunks <$> arbitrary
-- so that a single utf8 code point could appear split over up to 4 chunks
, BL.fromChunks . map B.singleton <$> listOf genWord8
-- so that a code point with 4 byte long utf8 representation
-- could appear split over 3 non-singleton chunks
, (\a b c -> BL.fromChunks [a, b, c])
<$> arbitrary
<*> ((\a b -> B.pack [a, b]) <$> genWord8 <*> genWord8)
<*> arbitrary
]
shrink xs = BL.fromChunks <$> shrink (BL.toChunks xs)
-- For tests that have O(n^2) running times or input sizes, resize
-- their inputs to the square root of the originals.
unsquare :: (Arbitrary a, Show a, Testable b) => (a -> b) -> Property
......
......@@ -20,6 +20,7 @@ import qualified Data.ByteString.Lazy as LB
import qualified Data.Text as T
import qualified Data.Text.Array as TA
import qualified Data.Text.Encoding as TE
import qualified Data.Text.Encoding.Error as E
import qualified Data.Text.Internal as T
import qualified Data.Text.IO as T
import qualified Data.Text.Lazy as LT
......@@ -136,6 +137,13 @@ t301 = do
original@(T.Text originalArr originalOff originalLen) = T.pack "1234567890"
T.Text newArr _off _len = T.take 1 $ T.drop 1 original
t330 :: IO ()
t330 = do
let decodeL = LE.decodeUtf8With E.lenientDecode
assertEqual "The lenient decoding of lazy bytestrings should not depend on how they are chunked"
(decodeL (LB.fromChunks [B.pack [194], B.pack [97, 98, 99]]))
(decodeL (LB.fromChunks [B.pack [194, 97, 98, 99]]))
tests :: F.TestTree
tests = F.testGroup "Regressions"
[ F.testCase "hGetContents_crash" hGetContents_crash
......@@ -149,4 +157,5 @@ tests = F.testGroup "Regressions"
, F.testCase "t280/fromString" t280_fromString
, F.testCase "t280/singleton" t280_singleton
, F.testCase "t301" t301
, F.testCase "t330" t330
]
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment