diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index c707134d24e089f7a09a61a50d5f916bfbbc092f..59a48ee73733ee5b487cf76dff07159ae0bf5ca2 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -79,9 +79,6 @@ import Data.Text.Show as T (singleton) import Data.Text.Unsafe (unsafeDupablePerformIO) import Data.Word (Word8) import Foreign.C.Types (CSize(..)) -#ifdef SIMDUTF -import Foreign.C.Types (CInt(..)) -#endif import Foreign.Ptr (Ptr, minusPtr, plusPtr) import Foreign.Storable (poke, peekByteOff) import GHC.Exts (byteArrayContents#, unsafeCoerce#) @@ -99,6 +96,13 @@ import Data.Text.Internal.ByteStringCompat import GHC.Stack (HasCallStack) #endif +#ifdef SIMDUTF +import Foreign.C.Types (CInt(..)) +#else +import qualified Data.ByteString.Unsafe as B +import Data.Text.Internal.Encoding.Utf8 (CodePoint(..)) +#endif + -- $strict -- -- All of the single-parameter functions for decoding bytestrings @@ -164,10 +168,30 @@ decodeLatin1 bs = withBS bs $ \fp len -> runST $ do foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii :: Ptr Word8 -> Ptr Word8 -> IO CSize -#ifdef SIMDUTF isValidBS :: ByteString -> Bool +#ifdef SIMDUTF isValidBS bs = withBS bs $ \fp len -> unsafeDupablePerformIO $ unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> c_is_valid_utf8 ptr (fromIntegral len) +#else +#if MIN_VERSION_bytestring(0,11,2) +isValidBS = B.isValidUtf8 +#else +isValidBS bs = start 0 + where + start ix + | ix >= B.length bs = True + | otherwise = case utf8DecodeStart (B.unsafeIndex bs ix) of + Accept{} -> start (ix + 1) + Reject{} -> False + Incomplete st _ -> step (ix + 1) st + step ix st + | ix >= B.length bs = False + -- We do not use decoded code point, so passing a dummy value to save an argument. + | otherwise = case utf8DecodeContinue (B.unsafeIndex bs ix) st (CodePoint 0) of + Accept{} -> start (ix + 1) + Reject{} -> False + Incomplete st' _ -> step (ix + 1) st' +#endif #endif -- | Decode a 'ByteString' containing UTF-8 encoded text. @@ -180,11 +204,9 @@ decodeUtf8With :: #endif OnDecodeError -> ByteString -> Text decodeUtf8With onErr bs -#ifdef SIMDUTF | isValidBS bs = let !(SBS.SBS arr) = SBS.toShort bs in (Text (A.ByteArray arr) 0 (B.length bs)) -#endif | B.null undecoded = txt | otherwise = txt `append` (case onErr desc (Just (B.head undecoded)) of Nothing -> txt' @@ -211,7 +233,6 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do | i < len1 = B.index bs1 i | otherwise = B.index bs2 (i - len1) -#ifdef SIMDUTF -- We need Data.ByteString.findIndexEnd, but it is unavailable before bytestring-0.10.12.0 guessUtf8Boundary :: Int guessUtf8Boundary @@ -226,7 +247,6 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do w1 = B.index bs2 (len2 - 2) w2 = B.index bs2 (len2 - 3) w3 = B.index bs2 (len2 - 4) -#endif decodeFrom :: Int -> DecoderResult decodeFrom off = step (off + 1) (utf8DecodeStart (index off)) @@ -244,7 +264,6 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do arr <- A.unsafeFreeze dst return (Text arr 0 dstOff, mempty) -#ifdef SIMDUTF | srcOff >= len1 , srcOff < len1 + guessUtf8Boundary , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen @@ -253,7 +272,6 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do withBS bs $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> unsafeSTToIO $ A.copyFromPointer dst dstOff src (len1 + guessUtf8Boundary - srcOff) inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff)) -#endif | dstOff + 4 > dstLen = do let dstLen' = dstLen + 4 diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs index 1645086e688bd327d8eeb74e2d407852e4754449..bcf6a778f6dc7c7efd08ebfb4414c8ad8345aa64 100644 --- a/src/Data/Text/Internal/Encoding/Utf8.hs +++ b/src/Data/Text/Internal/Encoding/Utf8.hs @@ -35,6 +35,8 @@ module Data.Text.Internal.Encoding.Utf8 , validate4 -- * Naive decoding , DecoderResult(..) + , DecoderState(..) + , CodePoint(..) , utf8DecodeStart , utf8DecodeContinue ) where @@ -269,18 +271,18 @@ data DecoderResult -- | @since 2.0 utf8DecodeStart :: Word8 -> DecoderResult -utf8DecodeStart w +utf8DecodeStart !w | st == utf8AcceptState = Accept (chr (word8ToInt w)) | st == utf8RejectState = Reject | otherwise = Incomplete st (CodePoint cp) where cl@(ByteClass cl') = byteToClass w st = updateState cl utf8AcceptState - cp = word8ToInt $ (0xff `shiftR` word8ToInt cl') .&. w + cp = word8ToInt $ (0xff `unsafeShiftR` word8ToInt cl') .&. w -- | @since 2.0 utf8DecodeContinue :: Word8 -> DecoderState -> CodePoint -> DecoderResult -utf8DecodeContinue w st (CodePoint cp) +utf8DecodeContinue !w !st (CodePoint !cp) | st' == utf8AcceptState = Accept (chr cp') | st' == utf8RejectState = Reject | otherwise = Incomplete st' (CodePoint cp')