From aca09714e2a413e5bc7893e5f6c9ccda995b5c2e Mon Sep 17 00:00:00 2001
From: Ben Gamari <bgamari.foss@gmail.com>
Date: Fri, 20 Sep 2013 20:57:27 -0400
Subject: [PATCH] Add support for incremental decoding

Decoding multi-byte encodings such as UTF-8 pose difficulty for
streaming I/O as one must take care to carry the decoder state between
incoming chunks. Here we introduce `decodeUtf8With'` which exposes an
interface similar to that provided by cassava's `Data.Csv.Incremental`.
To do this, we adapt the C UTF-8 decoder to expose its automaton state
and codepoint accumulator.
---
 Data/Text/Encoding.hs | 62 ++++++++++++++++++++++++++++++++++++++++++-
 cbits/cbits.c         | 47 ++++++++++++++++++++++++++------
 2 files changed, 100 insertions(+), 9 deletions(-)

diff --git a/Data/Text/Encoding.hs b/Data/Text/Encoding.hs
index d5c90d31..939386e3 100644
--- a/Data/Text/Encoding.hs
+++ b/Data/Text/Encoding.hs
@@ -43,6 +43,10 @@ module Data.Text.Encoding
     , decodeUtf32LEWith
     , decodeUtf32BEWith
 
+    -- ** Streaming decoding with controllable error handling
+    , decodeUtf8With'
+    , Decoder(..)
+
     -- * Encoding Text to ByteStrings
     , encodeUtf8
     , encodeUtf16LE
@@ -57,6 +61,7 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
 #else
 import Control.Monad.ST (unsafeIOToST, unsafeSTToIO)
 #endif
+import Control.Monad.ST (runST)
 import Data.Bits ((.&.))
 import Data.ByteString as B
 import Data.ByteString.Internal as B
@@ -65,7 +70,7 @@ import Data.Text.Internal (Text(..), safe, textP)
 import Data.Text.Private (runText)
 import Data.Text.UnsafeChar (ord, unsafeWrite)
 import Data.Text.UnsafeShift (shiftL, shiftR)
-import Data.Word (Word8)
+import Data.Word (Word8, Word32)
 import Foreign.C.Types (CSize)
 import Foreign.ForeignPtr (withForeignPtr)
 import Foreign.Marshal.Utils (with)
@@ -139,6 +144,56 @@ decodeUtf8With onErr (PS fp off len) = runText $ \done -> do
   desc = "Data.Text.Encoding.decodeUtf8: Invalid UTF-8 stream"
 {- INLINE[0] decodeUtf8With #-}
 
+data Decoder = Some !Text (ByteString -> Decoder)
+
+-- | (codepoint, state)
+type DecoderState = (Word32, Word32)
+
+decodeUtf8With' :: OnDecodeError -> ByteString -> Decoder
+decodeUtf8With' onErr = decodeChunk (0,0)
+ where
+  -- We create a slightly larger than necessary buffer to accomodate a
+  -- potential surrogate pair started in the last buffer
+  decodeChunk :: DecoderState -> ByteString -> Decoder
+  decodeChunk (codepoint0,state0) (PS fp off len) =
+    runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1)
+   where
+    decodeChunkToBuffer :: A.MArray s -> IO Decoder
+    decodeChunkToBuffer dest = withForeignPtr fp $ \ptr ->
+      with (0::CSize) $ \destOffPtr ->
+      with codepoint0 $ \codepointPtr ->
+      with state0 $ \statePtr ->
+        let end = ptr `plusPtr` (off + len)
+            loop curPtr = do
+              curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr curPtr end
+                                                  codepointPtr statePtr
+              state <- peek statePtr
+              case state of
+                12 -> do
+                  -- We encountered an encoding error
+                  x <- peek curPtr'
+                  case onErr desc (Just x) of
+                    Nothing -> loop $ curPtr' `plusPtr` 1
+                    Just c -> do
+                      destOff <- peek destOffPtr
+                      w <- unsafeSTToIO $
+                           unsafeWrite dest (fromIntegral destOff) (safe c)
+                      poke destOffPtr (destOff + fromIntegral w)
+                      poke statePtr 0
+                      loop $ curPtr' `plusPtr` 1
+ 
+                _ -> do
+                  -- We encountered the end of the buffer while decoding
+                  n <- peek destOffPtr
+                  codepoint <- peek codepointPtr
+                  chunkText <- unsafeSTToIO $ do
+                      arr <- A.unsafeFreeze dest
+                      return $! textP arr 0 (fromIntegral n)
+                  return $ Some chunkText $ decodeChunk (codepoint, state)
+        in loop (ptr `plusPtr` off)
+  desc = "Data.Text.Encoding.decodeUtf8With': Invalid UTF-8 stream"
+{- INLINE[0] decodeUtf8With' #-}
+
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
 -- to be valid.
 --
@@ -296,5 +351,10 @@ foreign import ccall unsafe "_hs_text_decode_utf8" c_decode_utf8
     :: MutableByteArray# s -> Ptr CSize
     -> Ptr Word8 -> Ptr Word8 -> IO (Ptr Word8)
 
+foreign import ccall unsafe "_hs_text_decode_utf8_state" c_decode_utf8_with_state
+    :: MutableByteArray# s -> Ptr CSize
+    -> Ptr Word8 -> Ptr Word8
+    -> Ptr Word32 -> Ptr Word32 -> IO (Ptr Word8)
+
 foreign import ccall unsafe "_hs_text_decode_latin1" c_decode_latin1
     :: MutableByteArray# s -> Ptr Word8 -> Ptr Word8 -> IO ()
diff --git a/cbits/cbits.c b/cbits/cbits.c
index 012239b0..fee1db2a 100644
--- a/cbits/cbits.c
+++ b/cbits/cbits.c
@@ -104,19 +104,36 @@ _hs_text_decode_latin1(uint16_t *dest, const uint8_t const *src,
  * the start of an invalid byte sequence.
  *
  * At exit, updates *destoff with the next offset to write to, and
- * returns the next source offset to read from.
+ * returns the next source offset to read from. Moreover, this function
+ * exposes the internal decoder state (state0 and codepoint0), allowing one
+ * to restart the decoder after it terminates (say, due to a partial codepoint).
+ *
+ * In particular, there are a few possible outcomes,
+ *
+ *   1) We decoded the buffer entirely:
+ *      In this case we return srcend
+ *      state0 == UTF8_ACCEPT
+ *
+ *   2) We met an invalid encoding
+ *      In this case we return the address of the first invalid byte
+ *      state0 == UTF8_REJECT
+ *
+ *   3) We reached the end of the buffer while decoding a codepoint
+ *      In this case we return a pointer to the first byte of the partial codepoint
+ *      state0 != UTF8_ACCEPT, UTF8_REJECT
+ *
  */
 uint8_t const *
-_hs_text_decode_utf8(uint16_t *dest, size_t *destoff,
-		     const uint8_t const *src, const uint8_t const *srcend)
+_hs_text_decode_utf8_state(uint16_t *dest, size_t *destoff,
+                           const uint8_t const *src, const uint8_t const *srcend,
+                           uint32_t *codepoint0, uint32_t *state0)
 {
   uint16_t *d = dest + *destoff;
   const uint8_t const *s = src;
-  uint32_t state = UTF8_ACCEPT;
+  uint32_t state = *state0;
+  uint32_t codepoint = *codepoint0;
 
   while (s < srcend) {
-    uint32_t codepoint;
-
 #if defined(__i386__) || defined(__x86_64__)
     /*
      * This code will only work on a little-endian system that
@@ -161,11 +178,25 @@ _hs_text_decode_utf8(uint16_t *dest, size_t *destoff,
     }
   }
 
-  /* Error recovery - if we're not in a valid finishing state, back up. */
-  if (state != UTF8_ACCEPT)
+  /* Invalid encoding, back up to the errant character */
+  if (state == UTF8_REJECT)
     s -= 1;
 
   *destoff = d - dest;
+  *codepoint0 = codepoint;
+  *state0 = state;
 
   return s;
 }
+
+/*
+ * Helper to decode buffer and discard final decoder state
+ */
+uint8_t const *
+_hs_text_decode_utf8(uint16_t *dest, size_t *destoff,
+                     const uint8_t const *src, const uint8_t const *srcend)
+{
+  uint32_t codepoint;
+  uint32_t state = UTF8_ACCEPT;
+  return _hs_text_decode_utf8_state(dest, destoff, src, srcend, &codepoint, &state);
+}
-- 
GitLab