Commit cdc80a84 authored by Bodigrim's avatar Bodigrim
Browse files

Use native (and naive) UTF8 decoder

parent 14c6ae11
/*
* Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>.
*
* Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>.
*
* See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
*/
#include <string.h>
#include <stdint.h>
#include <stdio.h>
#if defined(__x86_64__)
#include <emmintrin.h>
#include <xmmintrin.h>
#endif
#include "text_cbits.h"
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12
static const uint8_t utf8d[] = {
/*
* The first part of the table maps bytes to character classes that
* to reduce the size of the transition table and create bitmasks.
*/
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
/*
* The second part is a transition table that maps a combination of
* a state of the automaton and a character class to a state.
*/
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
12,36,12,12,12,12,12,12,12,12,12,12,
};
static inline uint32_t
decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
uint32_t type = utf8d[byte];
*codep = (*state != UTF8_ACCEPT) ?
(byte & 0x3fu) | (*codep << 6) :
(0xff >> type) & (byte);
return *state = utf8d[256 + *state + type];
}
/*
* A best-effort decoder. Runs until it hits either end of input or
* the start of an invalid byte sequence.
*
* At exit, we update *destoff with the next offset to write to, *src
* with the next source location past the last one successfully
* decoded, and return the next source location to read from.
*
* Moreover, we expose the internal decoder state (state0 and
* codepoint0), allowing one to restart the decoder after it
* terminates (say, due to a partial codepoint).
*
* In particular, there are a few possible outcomes,
*
* 1) We decoded the buffer entirely:
* In this case we return srcend
* state0 == UTF8_ACCEPT
*
* 2) We met an invalid encoding
* In this case we return the address of the first invalid byte
* state0 == UTF8_REJECT
*
* 3) We reached the end of the buffer while decoding a codepoint
* In this case we return a pointer to the first byte of the partial codepoint
* state0 != UTF8_ACCEPT, UTF8_REJECT
*
*/
#if defined(__GNUC__) || defined(__clang__)
static inline uint8_t const *
_hs_text_decode_utf8_int(uint8_t *const dest, size_t *destoff,
const uint8_t **src, const uint8_t *srcend,
uint32_t *codepoint0, uint32_t *state0)
__attribute((always_inline));
#endif
static inline uint8_t const *
_hs_text_decode_utf8_int(uint8_t *const dest, size_t *destoff,
const uint8_t **src, const uint8_t *srcend,
uint32_t *codepoint0, uint32_t *state0)
{
uint8_t *d = dest + *destoff;
const uint8_t *s = *src, *last = *src;
uint32_t state = *state0;
uint32_t codepoint = *codepoint0;
while (s < srcend) {
if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
if (state != UTF8_REJECT)
continue;
break;
}
if(codepoint < 0x80){
*d++ = (uint8_t) codepoint;
} else if(codepoint < 0x800){
*d++ = (uint8_t) (0xC0 + (codepoint >> 6));
*d++ = (uint8_t) (0x80 + (codepoint & 0x3F));
} else if(codepoint < 0x10000){
*d++ = (uint8_t) (0xE0 + (codepoint >> 12));
*d++ = (uint8_t) (0x80 + ((codepoint >> 6) & 0x3F));
*d++ = (uint8_t) (0x80 + (codepoint & 0x3F));
} else {
*d++ = (uint8_t) (0xF0 + (codepoint >> 18));
*d++ = (uint8_t) (0x80 + ((codepoint >> 12) & 0x3F));
*d++ = (uint8_t) (0x80 + ((codepoint >> 6) & 0x3F));
*d++ = (uint8_t) (0x80 + (codepoint & 0x3F));
}
last = s;
}
*destoff = d - dest;
*codepoint0 = codepoint;
*state0 = state;
*src = last;
return s;
}
uint8_t const *
_hs_text_decode_utf8_state(uint8_t *const dest, size_t *destoff,
const uint8_t **src,
const uint8_t *srcend,
uint32_t *codepoint0, uint32_t *state0)
{
_hs_text_decode_utf8_int(dest, destoff, src, srcend, codepoint0, state0);
return *src;
}
/*
* Helper to decode buffer and discard final decoder state
*/
const uint8_t *
_hs_text_decode_utf8(uint8_t *const dest, size_t *destoff,
const uint8_t *src, const uint8_t *const srcend)
{
uint32_t codepoint;
uint32_t state = UTF8_ACCEPT;
_hs_text_decode_utf8_int(dest, destoff, &src, srcend,
&codepoint, &state);
return src;
}
/*
* Copyright (c) 2013 Bryan O'Sullivan <bos@serpentine.com>.
*/
#ifndef _text_cbits_h
#define _text_cbits_h
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12
#endif
......@@ -231,7 +231,7 @@ import qualified Data.Text.Internal.Fusion.Common as S
import Data.Text.Encoding (decodeUtf8', encodeUtf8)
import Data.Text.Internal.Fusion (stream, reverseStream, unstream)
import Data.Text.Internal.Private (span_)
import Data.Text.Internal (Text(..), empty, firstf, mul, safe, text)
import Data.Text.Internal (Text(..), empty, firstf, mul, safe, text, append)
import Data.Text.Internal.Unsafe.Char (unsafeWrite, unsafeChr8)
import Data.Text.Show (singleton, unpack, unpackCString#)
import qualified Prelude as P
......@@ -446,24 +446,6 @@ snoc :: Text -> Char -> Text
snoc t c = unstream (S.snoc (stream t) (safe c))
{-# INLINE snoc #-}
-- | /O(n)/ Appends one 'Text' to the other by copying both of them
-- into a new 'Text'.
append :: Text -> Text -> Text
append a@(Text arr1 off1 len1) b@(Text arr2 off2 len2)
| len1 == 0 = b
| len2 == 0 = a
| len > 0 = Text (A.run x) 0 len
| otherwise = overflowError "append"
where
len = len1+len2
x :: ST s (A.MArray s)
x = do
arr <- A.new len
A.copyI len1 arr 0 arr1 off1
A.copyI len2 arr len1 arr2 off2
return arr
{-# NOINLINE append #-}
-- | /O(1)/ Returns the first character of a 'Text', which must be
-- non-empty.
head :: Text -> Char
......
......@@ -2,6 +2,8 @@
UnliftedFFITypes #-}
{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE ViewPatterns #-}
-- |
-- Module : Data.Text.Encoding
-- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan,
......@@ -62,33 +64,30 @@ module Data.Text.Encoding
import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
import Control.Exception (evaluate, try, throwIO, ErrorCall(ErrorCall))
import Control.Monad.ST (runST)
import Control.Exception (evaluate, try)
import Control.Monad.ST (runST, ST)
import Data.Bits (shiftR, (.&.))
import Data.ByteString (ByteString)
import qualified Data.ByteString as B
import qualified Data.ByteString.Internal as B
import qualified Data.ByteString.Short.Internal as SBS
import Data.Foldable (traverse_)
import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode)
import Data.Text.Internal (Text(..), safe, empty, text)
import Data.Text.Internal.Private (runText)
import Data.Text.Internal (Text(..), safe, empty, append)
import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
import Data.Text.Internal.Unsafe.Char (unsafeWrite)
import Data.Text.Show ()
import Data.Text.Show as T (singleton)
import Data.Text.Unsafe (unsafeDupablePerformIO)
import Data.Word (Word8, Word32)
import Data.Word (Word8)
import Foreign.C.Types (CSize(..))
import Foreign.Marshal.Utils (with)
import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
import Foreign.Storable (Storable, peek, poke, peekByteOff)
import GHC.Exts (MutableByteArray#, byteArrayContents#, unsafeCoerce#)
import Foreign.Ptr (Ptr, minusPtr, plusPtr)
import Foreign.Storable (poke, peekByteOff)
import GHC.Exts (byteArrayContents#, unsafeCoerce#)
import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr))
import qualified Data.ByteString.Builder as B
import qualified Data.ByteString.Builder.Internal as B hiding (empty, append)
import qualified Data.ByteString.Builder.Prim as BP
import qualified Data.ByteString.Builder.Prim.Internal as BP
import Data.Text.Internal.Encoding.Utf8 (utf8LengthByLeader)
import Data.Text.Internal.Encoding.Utf8 (utf8LengthByLeader, utf8DecodeStart, utf8DecodeContinue, DecoderResult(..))
import qualified Data.Text.Array as A
import qualified Data.Text.Internal.Encoding.Fusion as E
import qualified Data.Text.Internal.Fusion as F
......@@ -97,8 +96,6 @@ import Data.Text.Internal.ByteStringCompat
import GHC.Stack (HasCallStack)
#endif
#include "text_cbits.h"
-- $strict
--
-- All of the single-parameter functions for decoding bytestrings
......@@ -159,53 +156,77 @@ foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii
-- | Decode a 'ByteString' containing UTF-8 encoded text.
--
-- __NOTE__: The replacement character returned by 'OnDecodeError'
-- MUST be within the BMP plane; surrogate code points will
-- automatically be remapped to the replacement char @U+FFFD@
-- (/since 0.11.3.0/), whereas code points beyond the BMP will throw an
-- 'error' (/since 1.2.3.1/); For earlier versions of @text@ using
-- those unsupported code points would result in undefined behavior.
-- Surrogate code points in replacement character returned by 'OnDecodeError'
-- will be automatically remapped to the replacement char @U+FFFD@.
decodeUtf8With ::
#if defined(ASSERTS)
HasCallStack =>
#endif
OnDecodeError -> ByteString -> Text
decodeUtf8With onErr bs = withBS bs aux
where
aux fp len = runText $ \done -> do
let go (A.MutableByteArray dest) = unsafeWithForeignPtr fp $ \ptr ->
with (0::CSize) $ \destOffPtr -> do
let end = ptr `plusPtr` len
loop curPtr = do
curPtr' <- c_decode_utf8 dest destOffPtr curPtr end
if curPtr' == end
then do
n <- peek destOffPtr
unsafeSTToIO (done (A.MutableByteArray dest) (cSizeToInt n))
else do
x <- peek curPtr'
case onErr desc (Just x) of
Nothing -> loop $ curPtr' `plusPtr` 1
Just c
-- TODO This is problematic, because even BMP replacement characters
-- can take longer than one UTF8 code unit (which is byte).
| c > '\xFFFF' -> throwUnsupportedReplChar
| otherwise -> do
destOff <- peek destOffPtr
w <- unsafeSTToIO $
unsafeWrite (A.MutableByteArray dest) (cSizeToInt destOff)
(safe c)
poke destOffPtr (destOff + intToCSize w)
loop $ curPtr' `plusPtr` 1
loop ptr
-- TODO (len * 2 + 100) assumes that invalid input is asymptotically rare.
-- This is incorrect in general, but for now we just want to pass tests.
(unsafeIOToST . go) =<< A.new (len * 2 + 100)
where
desc = "Data.Text.Internal.Encoding.decodeUtf8: Invalid UTF-8 stream"
throwUnsupportedReplChar = throwIO $
ErrorCall "decodeUtf8With: non-BMP replacement characters not supported"
decodeUtf8With onErr bs
| B.null undecoded = txt
| otherwise = txt `append` (case onErr desc (Just (B.head undecoded)) of
Nothing -> txt'
Just c -> T.singleton c `append` txt')
where
(txt, undecoded) = decodeUtf8With2 onErr mempty bs
txt' = decodeUtf8With onErr (B.tail undecoded)
desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream"
-- | Decode two consecutive bytestrings, returning Text and undecoded remainder.
decodeUtf8With2 ::
#if defined(ASSERTS)
HasCallStack =>
#endif
OnDecodeError -> ByteString -> ByteString -> (Text, ByteString)
decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
marr <- A.new len'
outer marr len' 0 0
where
len = len1 + len2
len' = len + 4
index i
| i < len1 = B.index bs1 i
| otherwise = B.index bs2 (i - len1)
decodeFrom :: Int -> DecoderResult
decodeFrom off = step (off + 1) (utf8DecodeStart (index off))
where
step i (Incomplete a b)
| i < len = step (i + 1) (utf8DecodeContinue (index i) a b)
step _ st = st
outer :: forall s. A.MArray s -> Int -> Int -> Int -> ST s (Text, ByteString)
outer dst dstLen = inner
where
inner srcOff dstOff
| srcOff >= len = do
A.shrinkM dst dstOff
arr <- A.unsafeFreeze dst
return (Text arr 0 dstOff, mempty)
| dstOff + 4 > dstLen = do
let dstLen' = dstLen + 4
dst' <- A.resizeM dst dstLen'
outer dst' dstLen' srcOff dstOff
| otherwise = case decodeFrom srcOff of
Accept c -> do
d <- unsafeWrite dst dstOff c
inner (srcOff + d) (dstOff + d)
Reject -> case onErr desc (Just (index srcOff)) of
Nothing -> inner (srcOff + 1) dstOff
Just c -> do
d <- unsafeWrite dst dstOff (safe c)
inner (srcOff + 1) (dstOff + d)
Incomplete{} -> do
A.shrinkM dst dstOff
arr <- A.unsafeFreeze dst
let bs = if srcOff >= len1
then B.drop (srcOff - len1) bs2
else B.drop srcOff (bs1 `B.append` bs2)
return (Text arr 0 dstOff, bs)
desc = "Data.Text.Internal.Encoding: Invalid UTF-8 stream"
-- $stream
--
......@@ -272,9 +293,6 @@ instance Show Decoding where
showString " _"
where prec = 10; prec' = prec + 1
newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable)
newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable)
-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
-- encoded text that is known to be valid.
--
......@@ -300,72 +318,11 @@ streamDecodeUtf8With ::
HasCallStack =>
#endif
OnDecodeError -> ByteString -> Decoding
streamDecodeUtf8With onErr = decodeChunk B.empty 0 0
where
-- We create a slightly larger than necessary buffer to accommodate a
-- potential code point started in the last buffer (@undecoded0@), or
-- replacement characters for each byte in @undecoded0@ if the
-- sequence turns out to be invalid. There can be up to three bytes there,
-- hence we allocate @len+3@ bytes.
decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString
-> Decoding
decodeChunk undecoded0 codepoint0 state0 bs = withBS bs aux where
-- TODO Replace (+100) with something sensible.
aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+100)
where
decodeChunkToBuffer :: A.MArray s -> IO Decoding
decodeChunkToBuffer (A.MutableByteArray dest) = unsafeWithForeignPtr fp $ \ptr ->
with (0::CSize) $ \destOffPtr ->
with codepoint0 $ \codepointPtr ->
with state0 $ \statePtr ->
with nullPtr $ \curPtrPtr ->
let end = ptr `plusPtr` len
loop curPtr = do
prevState <- peek statePtr
poke curPtrPtr curPtr
lastPtr <- c_decode_utf8_with_state dest destOffPtr
curPtrPtr end codepointPtr statePtr
state <- peek statePtr
case state of
UTF8_REJECT -> do
-- We encountered an encoding error
poke statePtr 0
let skipByte x = case onErr desc (Just x) of
Nothing -> return ()
Just c -> do
destOff <- peek destOffPtr
w <- unsafeSTToIO $
unsafeWrite (A.MutableByteArray dest) (cSizeToInt destOff) (safe c)
poke destOffPtr (destOff + intToCSize w)
if ptr == lastPtr && prevState /= UTF8_ACCEPT then do
-- If we can't complete the sequence @undecoded0@ from
-- the previous chunk, we invalidate the bytes from
-- @undecoded0@ and retry decoding the current chunk from
-- the initial state.
traverse_ skipByte (B.unpack undecoded0)
loop lastPtr
else do
peek lastPtr >>= skipByte
loop (lastPtr `plusPtr` 1)
_ -> do
-- We encountered the end of the buffer while decoding
n <- peek destOffPtr
codepoint <- peek codepointPtr
chunkText <- unsafeSTToIO $ do
let l = cSizeToInt n
A.shrinkM (A.MutableByteArray dest) l
arr <- A.unsafeFreeze (A.MutableByteArray dest)
return $! text arr 0 l
let left = lastPtr `minusPtr` ptr
!undecoded = case state of
UTF8_ACCEPT -> B.empty
_ | left == 0 && prevState /= UTF8_ACCEPT -> B.append undecoded0 bs
| otherwise -> B.drop left bs
return $ Some chunkText undecoded
(decodeChunk undecoded codepoint state)
in loop ptr
desc = "Data.Text.Internal.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"
streamDecodeUtf8With onErr = go mempty
where
go bs1 bs2 = Some txt undecoded (go undecoded)
where
(txt, undecoded) = decodeUtf8With2 onErr bs1 bs2
-- | Decode a 'ByteString' containing UTF-8 encoded text that is known
-- to be valid.
......@@ -551,15 +508,3 @@ encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt))
cSizeToInt :: CSize -> Int
cSizeToInt = fromIntegral
intToCSize :: Int -> CSize
intToCSize = fromIntegral
foreign import ccall unsafe "_hs_text_decode_utf8" c_decode_utf8
:: MutableByteArray# s -> Ptr CSize
-> Ptr Word8 -> Ptr Word8 -> IO (Ptr Word8)
foreign import ccall unsafe "_hs_text_decode_utf8_state" c_decode_utf8_with_state
:: MutableByteArray# s -> Ptr CSize
-> Ptr (Ptr Word8) -> Ptr Word8
-> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)
......@@ -33,6 +33,7 @@ module Data.Text.Internal
-- * Code that must be here for accessibility
, empty
, empty_
, append
-- * Utilities
, firstf
-- * Checked multiplication
......@@ -47,6 +48,7 @@ module Data.Text.Internal
import Control.Exception (assert)
import GHC.Stack (HasCallStack)
#endif
import Control.Monad.ST (ST)
import Data.Bits
import Data.Int (Int32, Int64)
import Data.Text.Internal.Unsafe.Char (ord)
......@@ -89,6 +91,24 @@ empty_ :: Text
empty_ = Text A.empty 0 0
{-# NOINLINE empty_ #-}
-- | /O(n)/ Appends one 'Text' to the other by copying both of them
-- into a new 'Text'.
append :: Text -> Text -> Text
append a@(Text arr1 off1 len1) b@(Text arr2 off2 len2)
| len1 == 0 = b
| len2 == 0 = a
| len > 0 = Text (A.run x) 0 len
| otherwise = error $ "Data.Text.append: size overflow"
where
len = len1+len2
x :: ST s (A.MArray s)
x = do
arr <- A.new len
A.copyI len1 arr 0 arr1 off1
A.copyI len2 arr len1 arr2 off2
return arr
{-# NOINLINE append #-}
-- | Construct a 'Text' without invisibly pinning its byte array in
-- memory if its length has dwindled to zero.
text ::
......
......@@ -33,6 +33,10 @@ module Data.Text.Internal.Encoding.Utf8
, validate2
, validate3
, validate4
-- * Naive decoding
, DecoderResult(..)
, utf8DecodeStart
, utf8DecodeContinue
) where
#if defined(ASSERTS)
......@@ -40,7 +44,7 @@ import Control.Exception (assert)
import GHC.Stack (HasCallStack)
#endif
import Data.Bits (Bits(..), FiniteBits(..))
import Data.Char (ord)
import Data.Char (ord, chr)
import GHC.Exts
import GHC.Word (Word8(..))
......@@ -213,3 +217,66 @@ validate4 x1 x2 x3 x4 = validate4_1 || validate4_2 || validate4_3
intToWord8 :: Int -> Word8
intToWord8 = fromIntegral
word8ToInt :: Word8 -> Int
word8ToInt = fromIntegral
-------------------------------------------------------------------------------
-- Naive UTF8 decoder.
-- See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for the explanation of the state machine.
newtype ByteClass = ByteClass Word8
byteToClass :: Word8 -> ByteClass
byteToClass n = ByteClass (W8# el#)
where
!(I# n#) = word8ToInt n
el# = indexWord8OffAddr# table# n#
table# :: Addr#
table# = "\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\b\b\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\n\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\EOT\ETX\ETX\v\ACK\ACK\ACK\ENQ\b\b\b\b\b\b\b\b\b\b\b"#
newtype DecoderState = DecoderState Word8
deriving (Eq)
utf8AcceptState :: DecoderState
utf8AcceptState = DecoderState 0
utf8RejectState :: DecoderState
utf8RejectState = DecoderState 12
updateState :: ByteClass -> DecoderState -> DecoderState
updateState (ByteClass c) (DecoderState s) = DecoderState (W8# el#)
where
!(I# n#) = word8ToInt (c + s)
el# = indexWord8OffAddr# table# n#
table# :: Addr#
table# = "\NUL\f\CAN$<`T\f\f\f0H\f\f\f\f\f\f\f\f\f\f\f\f\f\NUL\f\f\f\f\f\NUL\f\NUL\f\f\f\CAN\f\f\f\f\f\CAN\f\CAN\f\f\f\f\f\f\f\f\f\CAN\f\f\f\f\f\CAN\f\f\f\f\f\f\f\CAN\f\f\f\f\f\f\f\f\f$\f$\f\f\f$\f\f\f\f\f$\f$\f\f\f$\f\f\f\f\f\f\f\f\f\f"#
newtype CodePoint = CodePoint Int
data DecoderResult
= Accept !Char
| Incomplete !DecoderState !CodePoint
| Reject
utf8DecodeStart :: Word8 -> DecoderResult
utf8DecodeStart w
| st == utf8AcceptState = Accept (chr (word8ToInt w))
| st == utf8RejectState = Reject
| otherwise = Incomplete st (CodePoint cp)
where