From aca09714e2a413e5bc7893e5f6c9ccda995b5c2e Mon Sep 17 00:00:00 2001 From: Ben Gamari <bgamari.foss@gmail.com> Date: Fri, 20 Sep 2013 20:57:27 -0400 Subject: [PATCH] Add support for incremental decoding Decoding multi-byte encodings such as UTF-8 pose difficulty for streaming I/O as one must take care to carry the decoder state between incoming chunks. Here we introduce `decodeUtf8With'` which exposes an interface similar to that provided by cassava's `Data.Csv.Incremental`. To do this, we adapt the C UTF-8 decoder to expose its automaton state and codepoint accumulator. --- Data/Text/Encoding.hs | 62 ++++++++++++++++++++++++++++++++++++++++++- cbits/cbits.c | 47 ++++++++++++++++++++++++++------ 2 files changed, 100 insertions(+), 9 deletions(-) diff --git a/Data/Text/Encoding.hs b/Data/Text/Encoding.hs index d5c90d31..939386e3 100644 --- a/Data/Text/Encoding.hs +++ b/Data/Text/Encoding.hs @@ -43,6 +43,10 @@ module Data.Text.Encoding , decodeUtf32LEWith , decodeUtf32BEWith + -- ** Streaming decoding with controllable error handling + , decodeUtf8With' + , Decoder(..) + -- * Encoding Text to ByteStrings , encodeUtf8 , encodeUtf16LE @@ -57,6 +61,7 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) #else import Control.Monad.ST (unsafeIOToST, unsafeSTToIO) #endif +import Control.Monad.ST (runST) import Data.Bits ((.&.)) import Data.ByteString as B import Data.ByteString.Internal as B @@ -65,7 +70,7 @@ import Data.Text.Internal (Text(..), safe, textP) import Data.Text.Private (runText) import Data.Text.UnsafeChar (ord, unsafeWrite) import Data.Text.UnsafeShift (shiftL, shiftR) -import Data.Word (Word8) +import Data.Word (Word8, Word32) import Foreign.C.Types (CSize) import Foreign.ForeignPtr (withForeignPtr) import Foreign.Marshal.Utils (with) @@ -139,6 +144,56 @@ decodeUtf8With onErr (PS fp off len) = runText $ \done -> do desc = "Data.Text.Encoding.decodeUtf8: Invalid UTF-8 stream" {- INLINE[0] decodeUtf8With #-} +data Decoder = Some !Text (ByteString -> Decoder) + +-- | (codepoint, state) +type DecoderState = (Word32, Word32) + +decodeUtf8With' :: OnDecodeError -> ByteString -> Decoder +decodeUtf8With' onErr = decodeChunk (0,0) + where + -- We create a slightly larger than necessary buffer to accomodate a + -- potential surrogate pair started in the last buffer + decodeChunk :: DecoderState -> ByteString -> Decoder + decodeChunk (codepoint0,state0) (PS fp off len) = + runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1) + where + decodeChunkToBuffer :: A.MArray s -> IO Decoder + decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> + with (0::CSize) $ \destOffPtr -> + with codepoint0 $ \codepointPtr -> + with state0 $ \statePtr -> + let end = ptr `plusPtr` (off + len) + loop curPtr = do + curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtr end + codepointPtr statePtr + state <- peek statePtr + case state of + 12 -> do + -- We encountered an encoding error + x <- peek curPtr' + case onErr desc (Just x) of + Nothing -> loop $ curPtr' `plusPtr` 1 + Just c -> do + destOff <- peek destOffPtr + w <- unsafeSTToIO $ + unsafeWrite dest (fromIntegral destOff) (safe c) + poke destOffPtr (destOff + fromIntegral w) + poke statePtr 0 + loop $ curPtr' `plusPtr` 1 + + _ -> do + -- We encountered the end of the buffer while decoding + n <- peek destOffPtr + codepoint <- peek codepointPtr + chunkText <- unsafeSTToIO $ do + arr <- A.unsafeFreeze dest + return $! textP arr 0 (fromIntegral n) + return $ Some chunkText $ decodeChunk (codepoint, state) + in loop (ptr `plusPtr` off) + desc = "Data.Text.Encoding.decodeUtf8With': Invalid UTF-8 stream" +{- INLINE[0] decodeUtf8With' #-} + -- | Decode a 'ByteString' containing UTF-8 encoded text that is known -- to be valid. -- @@ -296,5 +351,10 @@ foreign import ccall unsafe "_hs_text_decode_utf8" c_decode_utf8 :: MutableByteArray# s -> Ptr CSize -> Ptr Word8 -> Ptr Word8 -> IO (Ptr Word8) +foreign import ccall unsafe "_hs_text_decode_utf8_state" c_decode_utf8_with_state + :: MutableByteArray# s -> Ptr CSize + -> Ptr Word8 -> Ptr Word8 + -> Ptr Word32 -> Ptr Word32 -> IO (Ptr Word8) + foreign import ccall unsafe "_hs_text_decode_latin1" c_decode_latin1 :: MutableByteArray# s -> Ptr Word8 -> Ptr Word8 -> IO () diff --git a/cbits/cbits.c b/cbits/cbits.c index 012239b0..fee1db2a 100644 --- a/cbits/cbits.c +++ b/cbits/cbits.c @@ -104,19 +104,36 @@ _hs_text_decode_latin1(uint16_t *dest, const uint8_t const *src, * the start of an invalid byte sequence. * * At exit, updates *destoff with the next offset to write to, and - * returns the next source offset to read from. + * returns the next source offset to read from. Moreover, this function + * exposes the internal decoder state (state0 and codepoint0), allowing one + * to restart the decoder after it terminates (say, due to a partial codepoint). + * + * In particular, there are a few possible outcomes, + * + * 1) We decoded the buffer entirely: + * In this case we return srcend + * state0 == UTF8_ACCEPT + * + * 2) We met an invalid encoding + * In this case we return the address of the first invalid byte + * state0 == UTF8_REJECT + * + * 3) We reached the end of the buffer while decoding a codepoint + * In this case we return a pointer to the first byte of the partial codepoint + * state0 != UTF8_ACCEPT, UTF8_REJECT + * */ uint8_t const * -_hs_text_decode_utf8(uint16_t *dest, size_t *destoff, - const uint8_t const *src, const uint8_t const *srcend) +_hs_text_decode_utf8_state(uint16_t *dest, size_t *destoff, + const uint8_t const *src, const uint8_t const *srcend, + uint32_t *codepoint0, uint32_t *state0) { uint16_t *d = dest + *destoff; const uint8_t const *s = src; - uint32_t state = UTF8_ACCEPT; + uint32_t state = *state0; + uint32_t codepoint = *codepoint0; while (s < srcend) { - uint32_t codepoint; - #if defined(__i386__) || defined(__x86_64__) /* * This code will only work on a little-endian system that @@ -161,11 +178,25 @@ _hs_text_decode_utf8(uint16_t *dest, size_t *destoff, } } - /* Error recovery - if we're not in a valid finishing state, back up. */ - if (state != UTF8_ACCEPT) + /* Invalid encoding, back up to the errant character */ + if (state == UTF8_REJECT) s -= 1; *destoff = d - dest; + *codepoint0 = codepoint; + *state0 = state; return s; } + +/* + * Helper to decode buffer and discard final decoder state + */ +uint8_t const * +_hs_text_decode_utf8(uint16_t *dest, size_t *destoff, + const uint8_t const *src, const uint8_t const *srcend) +{ + uint32_t codepoint; + uint32_t state = UTF8_ACCEPT; + return _hs_text_decode_utf8_state(dest, destoff, src, srcend, &codepoint, &state); +} -- GitLab