From 4b3667e04ca14d0b3ecf70d0a8135e482314b633 Mon Sep 17 00:00:00 2001 From: Bodigrim <andrew.lelechenko@gmail.com> Date: Fri, 21 May 2021 01:54:41 +0100 Subject: [PATCH] Switch internal representation to UTF-8 --- README.markdown | 2 +- cbits/cbits.c | 232 +++--------------- cbits/utils.c | 11 + src/Data/Text.hs | 50 ++-- src/Data/Text/Array.hs | 38 +-- src/Data/Text/Encoding.hs | 139 ++++------- src/Data/Text/Foreign.hs | 105 ++++---- src/Data/Text/Internal.hs | 32 +-- src/Data/Text/Internal/Builder.hs | 4 +- src/Data/Text/Internal/Encoding/Fusion.hs | 6 +- src/Data/Text/Internal/Encoding/Utf8.hs | 23 +- src/Data/Text/Internal/Fusion.hs | 81 +++--- src/Data/Text/Internal/Fusion/Common.hs | 35 ++- src/Data/Text/Internal/Fusion/Size.hs | 12 +- src/Data/Text/Internal/Fusion/Types.hs | 2 +- .../Text/Internal/Lazy/Encoding/Fusion.hs | 10 +- src/Data/Text/Internal/Lazy/Fusion.hs | 4 +- src/Data/Text/Internal/Lazy/Search.hs | 18 +- src/Data/Text/Internal/Search.hs | 10 +- src/Data/Text/Internal/Unsafe/Char.hs | 47 ++-- src/Data/Text/Lazy.hs | 30 +-- src/Data/Text/Lazy/Builder/Int.hs | 9 +- src/Data/Text/Show.hs | 4 +- src/Data/Text/Unsafe.hs | 101 ++++---- tests/Tests/Properties/LowLevel.hs | 14 +- tests/Tests/QuickCheckUtils.hs | 4 +- tests/Tests/Regressions.hs | 6 +- text.cabal | 16 +- 28 files changed, 428 insertions(+), 617 deletions(-) create mode 100644 cbits/utils.c diff --git a/README.markdown b/README.markdown index cf355297..8c27cb0e 100644 --- a/README.markdown +++ b/README.markdown @@ -29,4 +29,4 @@ based on the stream fusion framework developed by Roman Leshchinskiy, Duncan Coutts, and Don Stewart. The core library was fleshed out, debugged, and tested by Bryan -O'Sullivan <bos@serpentine.com>, and he is the current maintainer. +O'Sullivan. Transition from UTF-16 to UTF-8 is by Andrew Lelechenko. diff --git a/cbits/cbits.c b/cbits/cbits.c index 1d8322ae..33bab908 100644 --- a/cbits/cbits.c +++ b/cbits/cbits.c @@ -16,12 +16,6 @@ #include "text_cbits.h" -int _hs_text_memcmp(const void *a, size_t aoff, const void *b, size_t boff, - size_t n) -{ - return memcmp(a + (aoff<<1), b + (boff<<1), n<<1); -} - #define UTF8_ACCEPT 0 #define UTF8_REJECT 12 @@ -61,60 +55,24 @@ decode(uint32_t *state, uint32_t* codep, uint32_t byte) { return *state = utf8d[256 + *state + type]; } -/* - * The ISO 8859-1 (aka latin-1) code points correspond exactly to the first 256 unicode - * code-points, therefore we can trivially convert from a latin-1 encoded bytestring to - * an UTF16 array - */ -void -_hs_text_decode_latin1(uint16_t *dest, const uint8_t *src, +size_t +_hs_text_decode_latin1(uint8_t *dest, const uint8_t *src, const uint8_t *srcend) { + const uint8_t *dest0 = dest; const uint8_t *p = src; -#if defined(__i386__) || defined(__x86_64__) - /* This optimization works on a little-endian systems by using - (aligned) 32-bit loads instead of 8-bit loads - */ - - /* consume unaligned prefix */ - while (p != srcend && (uintptr_t)p & 0x3) - *dest++ = *p++; - -#if defined(__x86_64__) - /* All the intrinsics used here are from SSE2, - * so every x86_64 CPU supports them. - */ - const __m128i zeros = _mm_set1_epi32(0); - while (p < srcend - 7) { - /* Load 8 bytes of ASCII data */ - const __m128i ascii = _mm_cvtsi64_si128(*((const uint64_t *)p)); - /* Interleave with zeros */ - const __m128i utf16 = _mm_unpacklo_epi8(ascii, zeros); - /* Store the resulting 16 bytes into destination */ - _mm_storeu_si128((__m128i *)dest, utf16); - - dest += 8; - p += 8; - } -#else - /* iterate over 32-bit aligned loads */ - while (p < srcend - 3) { - const uint32_t w = *((const uint32_t *)p); - - *dest++ = w & 0xff; - *dest++ = (w >> 8) & 0xff; - *dest++ = (w >> 16) & 0xff; - *dest++ = (w >> 24) & 0xff; - - p += 4; + while (p != srcend){ + uint8_t codepoint = *p++; + if(codepoint < 0x80){ + *dest++ = (uint8_t)codepoint; + } else { + *dest++ = (uint8_t) (0xC0 + (codepoint >> 6)); + *dest++ = (uint8_t) (0x80 + (codepoint & 0x3F)); + } } -#endif -#endif - /* handle unaligned suffix */ - while (p != srcend) - *dest++ = *p++; + return (dest - dest0); } /* @@ -146,82 +104,45 @@ _hs_text_decode_latin1(uint16_t *dest, const uint8_t *src, */ #if defined(__GNUC__) || defined(__clang__) static inline uint8_t const * -_hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff, +_hs_text_decode_utf8_int(uint8_t *const dest, size_t *destoff, const uint8_t **src, const uint8_t *srcend, uint32_t *codepoint0, uint32_t *state0) __attribute((always_inline)); #endif static inline uint8_t const * -_hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff, +_hs_text_decode_utf8_int(uint8_t *const dest, size_t *destoff, const uint8_t **src, const uint8_t *srcend, uint32_t *codepoint0, uint32_t *state0) { - uint16_t *d = dest + *destoff; + uint8_t *d = dest + *destoff; const uint8_t *s = *src, *last = *src; uint32_t state = *state0; uint32_t codepoint = *codepoint0; while (s < srcend) { -#if defined(__i386__) || defined(__x86_64__) - /* - * This code will only work on a little-endian system that - * supports unaligned loads. - * - * It gives a substantial speed win on data that is purely or - * partly ASCII (e.g. HTML), at only a slight cost on purely - * non-ASCII text. - */ - - if (state == UTF8_ACCEPT) { -#if defined(__x86_64__) - const __m128i zeros = _mm_set1_epi32(0); - while (s < srcend - 8) { - const uint64_t hopefully_eight_ascii_chars = *((uint64_t *) s); - if ((hopefully_eight_ascii_chars & 0x8080808080808080LL) != 0LL) - break; - s += 8; - - /* Load 8 bytes of ASCII data */ - const __m128i eight_ascii_chars = _mm_cvtsi64_si128(hopefully_eight_ascii_chars); - /* Interleave with zeros */ - const __m128i eight_utf16_chars = _mm_unpacklo_epi8(eight_ascii_chars, zeros); - /* Store the resulting 16 bytes into destination */ - _mm_storeu_si128((__m128i *)d, eight_utf16_chars); - d += 8; - } -#else - while (s < srcend - 4) { - codepoint = *((uint32_t *) s); - if ((codepoint & 0x80808080) != 0) - break; - s += 4; - /* - * Tried 32-bit stores here, but the extra bit-twiddling - * slowed the code down. - */ - *d++ = (uint16_t) (codepoint & 0xff); - *d++ = (uint16_t) ((codepoint >> 8) & 0xff); - *d++ = (uint16_t) ((codepoint >> 16) & 0xff); - *d++ = (uint16_t) ((codepoint >> 24) & 0xff); - } -#endif - last = s; - } /* end if (state == UTF8_ACCEPT) */ -#endif - if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) { if (state != UTF8_REJECT) - continue; + continue; break; } - if (codepoint <= 0xffff) - *d++ = (uint16_t) codepoint; - else { - *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10)); - *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF)); + if(codepoint < 0x80){ + *d++ = (uint8_t) codepoint; + } else if(codepoint < 0x800){ + *d++ = (uint8_t) (0xC0 + (codepoint >> 6)); + *d++ = (uint8_t) (0x80 + (codepoint & 0x3F)); + } else if(codepoint < 0x10000){ + *d++ = (uint8_t) (0xE0 + (codepoint >> 12)); + *d++ = (uint8_t) (0x80 + ((codepoint >> 6) & 0x3F)); + *d++ = (uint8_t) (0x80 + (codepoint & 0x3F)); + } else { + *d++ = (uint8_t) (0xF0 + (codepoint >> 18)); + *d++ = (uint8_t) (0x80 + ((codepoint >> 12) & 0x3F)); + *d++ = (uint8_t) (0x80 + ((codepoint >> 6) & 0x3F)); + *d++ = (uint8_t) (0x80 + (codepoint & 0x3F)); } + last = s; } @@ -234,7 +155,7 @@ _hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff, } uint8_t const * -_hs_text_decode_utf8_state(uint16_t *const dest, size_t *destoff, +_hs_text_decode_utf8_state(uint8_t *const dest, size_t *destoff, const uint8_t **src, const uint8_t *srcend, uint32_t *codepoint0, uint32_t *state0) @@ -248,7 +169,7 @@ _hs_text_decode_utf8_state(uint16_t *const dest, size_t *destoff, * Helper to decode buffer and discard final decoder state */ const uint8_t * -_hs_text_decode_utf8(uint16_t *const dest, size_t *destoff, +_hs_text_decode_utf8(uint8_t *const dest, size_t *destoff, const uint8_t *src, const uint8_t *const srcend) { uint32_t codepoint; @@ -257,90 +178,3 @@ _hs_text_decode_utf8(uint16_t *const dest, size_t *destoff, &codepoint, &state); return src; } - -void -_hs_text_encode_utf8(uint8_t **destp, const uint16_t *src, size_t srcoff, - size_t srclen) -{ - const uint16_t *srcend; - uint8_t *dest = *destp; - - src += srcoff; - srcend = src + srclen; - - ascii: -#if defined(__x86_64__) - while (srcend - src >= 8) { - union { uint64_t halves[2]; __m128i whole; } eight_chars; - eight_chars.whole = _mm_loadu_si128((__m128i *) src); - - const uint64_t w = eight_chars.halves[0]; - if (w & 0xFF80FF80FF80FF80ULL) { - if (!(w & 0x000000000000FF80ULL)) { - *dest++ = w & 0xFFFF; - src++; - if (!(w & 0x00000000FF800000ULL)) { - *dest++ = (w >> 16) & 0xFFFF; - src++; - if (!(w & 0x0000FF8000000000ULL)) { - *dest++ = (w >> 32) & 0xFFFF; - src++; - } - } - } - break; - } - - if (eight_chars.halves[1] & 0xFF80FF80FF80FF80ULL) { - break; - } - - const __m128i eight_ascii_chars = _mm_packus_epi16(eight_chars.whole, eight_chars.whole); - _mm_storel_epi64((__m128i *)dest, eight_ascii_chars); - - dest += 8; - src += 8; - } -#endif - -#if defined(__i386__) - while (srcend - src >= 2) { - uint32_t w = *((uint32_t *) src); - - if (w & 0xFF80FF80) - break; - *dest++ = w & 0xFFFF; - *dest++ = w >> 16; - src += 2; - } -#endif - - while (src < srcend) { - uint16_t w = *src++; - - if (w <= 0x7F) { - *dest++ = w; - /* An ASCII byte is likely to begin a run of ASCII bytes. - Falling back into the fast path really helps performance. */ - goto ascii; - } - else if (w <= 0x7FF) { - *dest++ = (w >> 6) | 0xC0; - *dest++ = (w & 0x3f) | 0x80; - } - else if (w < 0xD800 || w > 0xDBFF) { - *dest++ = (w >> 12) | 0xE0; - *dest++ = ((w >> 6) & 0x3F) | 0x80; - *dest++ = (w & 0x3F) | 0x80; - } else { - uint32_t c = ((((uint32_t) w) - 0xD800) << 10) + - (((uint32_t) *src++) - 0xDC00) + 0x10000; - *dest++ = (c >> 18) | 0xF0; - *dest++ = ((c >> 12) & 0x3F) | 0x80; - *dest++ = ((c >> 6) & 0x3F) | 0x80; - *dest++ = (c & 0x3F) | 0x80; - } - } - - *destp = dest; -} diff --git a/cbits/utils.c b/cbits/utils.c new file mode 100644 index 00000000..2baa78c7 --- /dev/null +++ b/cbits/utils.c @@ -0,0 +1,11 @@ +/* + * Copyright (c) 2021 Andrew Lelechenko <andrew.lelechenko@gmail.com> + */ + +#include <stdio.h> +#include <string.h> + +int _hs_text_memcmp(const void *arr1, size_t off1, const void *arr2, size_t off2, size_t len) +{ + return memcmp(arr1 + off1, arr2 + off2, len); +} diff --git a/src/Data/Text.hs b/src/Data/Text.hs index 9bac1b9c..b035ed71 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -9,6 +9,7 @@ -- Copyright : (c) 2009, 2010, 2011, 2012 Bryan O'Sullivan, -- (c) 2009 Duncan Coutts, -- (c) 2008, 2009 Tom Harper +-- (c) 2021 Andrew Lelechenko -- -- License : BSD-style -- Maintainer : bos@serpentine.com @@ -227,10 +228,8 @@ import Data.Text.Internal.Private (span_) import Data.Text.Internal (Text(..), empty, firstf, mul, safe, text) import Data.Text.Show (singleton, unpack, unpackCString#) import qualified Prelude as P -import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord16, reverseIter, +import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord8, reverseIter, reverseIter_, unsafeHead, unsafeTail) -import Data.Text.Internal.Unsafe.Char (unsafeChr) -import qualified Data.Text.Internal.Encoding.Utf16 as U16 import Data.Text.Internal.Search (indices) #if defined(__HADDOCK__) import Data.ByteString (ByteString) @@ -291,7 +290,8 @@ import Text.Printf (PrintfArg, formatArg, formatString) -- points -- (<http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#page=13 §3.4, definition D10 >) -- as 'Char' values, including code points from this invalid range. --- This means that there are some 'Char' values that are not valid +-- This means that there are some 'Char' values +-- (corresponding to 'Data.Char.Surrogate' category) that are not valid -- Unicode scalar values, and the functions in this module must handle -- those cases. -- @@ -300,12 +300,7 @@ import Text.Printf (PrintfArg, formatArg, formatString) -- that are not valid Unicode scalar values with the replacement -- character \"�\" (U+FFFD). Functions that perform this -- inspection and replacement are documented with the phrase --- \"Performs replacement on invalid scalar values\". --- --- (One reason for this policy of replacement is that internally, a --- 'Text' value is represented as packed UTF-16 data. Values in the --- range U+D800 through U+DFFF are used by UTF-16 to denote surrogate --- code points, and so cannot be represented. The functions replace +-- \"Performs replacement on invalid scalar values\". The functions replace -- invalid scalar values, instead of dropping them, as a security -- measure. For details, see -- <http://unicode.org/reports/tr36/#Deletion_of_Noncharacters Unicode Technical Report 36, §3.5 >.) @@ -487,12 +482,9 @@ second f (a, b) = (a, f b) -- | /O(1)/ Returns the last character of a 'Text', which must be -- non-empty. last :: Text -> Char -last (Text arr off len) - | len <= 0 = emptyError "last" - | n < 0xDC00 || n > 0xDFFF = unsafeChr n - | otherwise = U16.chr2 n0 n - where n = A.unsafeIndex arr (off+len-1) - n0 = A.unsafeIndex arr (off+len-2) +last t@(Text _ _ len) + | len <= 0 = emptyError "last" + | otherwise = let Iter c _ = reverseIter t (len - 1) in c {-# INLINE [1] last #-} -- | /O(1)/ Returns all characters after the head of a 'Text', which @@ -507,11 +499,9 @@ tail t@(Text arr off len) -- | /O(1)/ Returns all but the last character of a 'Text', which must -- be non-empty. init :: Text -> Text -init (Text arr off len) | len <= 0 = emptyError "init" - | n >= 0xDC00 && n <= 0xDFFF = text arr off (len-2) - | otherwise = text arr off (len-1) - where - n = A.unsafeIndex arr (off+len-1) +init t@(Text arr off len) + | len <= 0 = emptyError "init" + | otherwise = text arr off (len + reverseIter_ t (len - 1)) {-# INLINE [1] init #-} -- | /O(1)/ Returns all but the last character and the last character of a @@ -519,12 +509,11 @@ init (Text arr off len) | len <= 0 = emptyError "init" -- -- @since 1.2.3.0 unsnoc :: Text -> Maybe (Text, Char) -unsnoc (Text arr off len) - | len <= 0 = Nothing - | n < 0xDC00 || n > 0xDFFF = Just (text arr off (len-1), unsafeChr n) - | otherwise = Just (text arr off (len-2), U16.chr2 n0 n) - where n = A.unsafeIndex arr (off+len-1) - n0 = A.unsafeIndex arr (off+len-2) +unsnoc t@(Text arr off len) + | len <= 0 = Nothing + | otherwise = Just (text arr off (len + d), c) + where + Iter c d = reverseIter t (len - 1) {-# INLINE [1] unsnoc #-} -- | /O(1)/ Tests whether a 'Text' is empty or not. @@ -911,7 +900,7 @@ concat ts = case ts' of _ -> Text (A.run go) 0 len where ts' = L.filter (not . null) ts - len = sumP "concat" $ L.map lengthWord16 ts' + len = sumP "concat" $ L.map lengthWord8 ts' go :: ST s (A.MArray s) go = do arr <- A.new len @@ -1263,7 +1252,7 @@ groupBy p = loop where Iter c d = iter t 0 n = d + findAIndexOrEnd (not . p c) (Text arr (off+d) (len-d)) --- | Returns the /array/ index (in units of 'Word16') at which a +-- | Returns the /array/ index (in units of 'Word8') at which a -- character may be found. This is /not/ the same as the logical -- index returned by e.g. 'findIndex'. findAIndexOrEnd :: (Char -> Bool) -> Text -> Int @@ -1569,9 +1558,10 @@ words t@(Text arr off len) = loop 0 0 | n >= len = if start == n then [] else [Text arr (start+off) (n-start)] + -- Spaces in UTF-8 can take from 1 byte for 0x09 and up to 3 bytes for 0x3000. | isSpace c = if start == n - then loop (start+1) (start+1) + then loop (n+d) (n+d) else Text arr (start+off) (n-start) : loop (n+d) (n+d) | otherwise = loop start (n+d) where Iter c d = iter t n diff --git a/src/Data/Text/Array.hs b/src/Data/Text/Array.hs index c406c983..6566f40a 100644 --- a/src/Data/Text/Array.hs +++ b/src/Data/Text/Array.hs @@ -44,14 +44,14 @@ module Data.Text.Array import Control.Exception (assert) import GHC.Stack (HasCallStack) #endif -import Data.Bits ((.&.), xor, shiftL, shiftR) +import Data.Bits ((.&.), xor, shiftR) #if !MIN_VERSION_base(4,11,0) import Data.Text.Internal.Unsafe (inlinePerformIO) -#endif import Foreign.C.Types (CInt(..)) +#endif import GHC.Exts hiding (toList) import GHC.ST (ST(..), runST) -import GHC.Word (Word16(..)) +import GHC.Word (Word8(..)) import Prelude hiding (length, read) -- | Immutable array type. @@ -88,7 +88,7 @@ unsafeFreeze MArray{..} = ST $ \s1# -> -- | Indicate how many bytes would be used for an array of the given -- size. bytesInArray :: Int -> Int -bytesInArray n = n `shiftL` 1 +bytesInArray n = n {-# INLINE bytesInArray #-} -- | Unchecked read of an immutable array. May return garbage or @@ -97,15 +97,15 @@ unsafeIndex :: #if defined(ASSERTS) HasCallStack => #endif - Array -> Int -> Word16 + Array -> Int -> Word8 unsafeIndex a@Array{..} i@(I# i#) = #if defined(ASSERTS) - let word16len = I# (sizeofByteArray# aBA) `quot` 2 in - if i < 0 || i >= word16len - then error ("Data.Text.Array.unsafeIndex: bounds error, offset " ++ show i ++ ", length " ++ show word16len) + let word8len = I# (sizeofByteArray# aBA) in + if i < 0 || i >= word8len + then error ("Data.Text.Array.unsafeIndex: bounds error, offset " ++ show i ++ ", length " ++ show word8len) else #endif - case indexWord16Array# aBA i# of r# -> (W16# r#) + case indexWord8Array# aBA i# of r# -> (W8# r#) {-# INLINE unsafeIndex #-} #if defined(ASSERTS) @@ -130,17 +130,17 @@ unsafeWrite :: #if defined(ASSERTS) HasCallStack => #endif - MArray s -> Int -> Word16 -> ST s () -unsafeWrite ma@MArray{..} i@(I# i#) (W16# e#) = + MArray s -> Int -> Word8 -> ST s () +unsafeWrite ma@MArray{..} i@(I# i#) (W8# e#) = #if defined(ASSERTS) - checkBoundsM ma (i * 2) 2 >> + checkBoundsM ma i 1 >> #endif - (ST $ \s1# -> case writeWord16Array# maBA i# e# s1# of + (ST $ \s1# -> case writeWord8Array# maBA i# e# s1# of s2# -> (# s2#, () #)) {-# INLINE unsafeWrite #-} -- | Convert an immutable array to a list. -toList :: Array -> Int -> Int -> [Word16] +toList :: Array -> Int -> Int -> [Word8] toList ary off len = loop 0 where loop i | i < len = unsafeIndex ary (off+i) : loop (i+1) | otherwise = [] @@ -176,10 +176,10 @@ copyM dst@(MArray dst#) dstOff@(I# dstOff#) src@(MArray src#) srcOff@(I# srcOff# #if defined(ASSERTS) srcLen <- getSizeofMArray src dstLen <- getSizeofMArray dst - assert (srcOff + count <= srcLen `quot` 2) . - assert (dstOff + count <= dstLen `quot` 2) . + assert (srcOff + count <= srcLen) . + assert (dstOff + count <= dstLen) . #endif - ST $ \s1# -> case copyMutableByteArray# src# (2# *# srcOff#) dst# (2# *# dstOff#) (2# *# count#) s1# of + ST $ \s1# -> case copyMutableByteArray# src# srcOff# dst# dstOff# count# s1# of s2# -> (# s2#, () #) {-# INLINE copyM #-} @@ -194,7 +194,7 @@ copyI :: MArray s -- ^ Destination copyI (MArray dst#) dstOff@(I# dstOff#) (Array src#) (I# srcOff#) top@(I# top#) | dstOff >= top = return () | otherwise = ST $ \s1# -> - case copyByteArray# src# (2# *# srcOff#) dst# (2# *# dstOff#) (2# *# (top# -# dstOff#)) s1# of + case copyByteArray# src# srcOff# dst# dstOff# (top# -# dstOff#) s1# of s2# -> (# s2#, () #) {-# INLINE copyI #-} @@ -209,7 +209,7 @@ equal :: Array -- ^ First equal (Array src1#) (I# off1#) (Array src2#) (I# off2#) (I# count#) = i == 0 where #if MIN_VERSION_base(4,11,0) - i = I# (compareByteArrays# src1# (2# *# off1#) src2# (2# *# off2#) (2# *# count#)) + i = I# (compareByteArrays# src1# off1# src2# off2# count#) #else i = fromIntegral (inlinePerformIO (memcmp src1# off1# src2# off2# count#)) diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index d44e73ff..186e12b4 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -7,6 +7,7 @@ -- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan, -- (c) 2009 Duncan Coutts, -- (c) 2008, 2009 Tom Harper +-- (c) 2021 Andrew Lelechenko -- -- License : BSD-style -- Maintainer : bos@serpentine.com @@ -63,30 +64,29 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) import Control.Exception (evaluate, try, throwIO, ErrorCall(ErrorCall)) import Control.Monad.ST (runST) -import Data.Bits ((.&.), shiftR) import Data.ByteString as B -import qualified Data.ByteString.Internal as B +import qualified Data.ByteString.Short.Internal as SBS import Data.Foldable (traverse_) import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode) import Data.Text.Internal (Text(..), safe, text) import Data.Text.Internal.Private (runText) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) -import Data.Text.Internal.Unsafe.Char (ord, unsafeWrite) +import Data.Text.Internal.Unsafe.Char (unsafeWrite) import Data.Text.Show () import Data.Text.Unsafe (unsafeDupablePerformIO) -import Data.Word (Word8, Word16, Word32) -import Foreign.C.Types (CSize(CSize)) +import Data.Word (Word8, Word32) +import Foreign.C.Types (CSize) import Foreign.Marshal.Utils (with) import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) import Foreign.Storable (Storable, peek, poke) -import GHC.Base (ByteArray#, MutableByteArray#) +import GHC.Base (MutableByteArray#) import qualified Data.ByteString.Builder as B import qualified Data.ByteString.Builder.Internal as B hiding (empty, append) import qualified Data.ByteString.Builder.Prim as BP import qualified Data.ByteString.Builder.Prim.Internal as BP +import Data.Text.Internal.Encoding.Utf8 (utf8LengthByLeader) import qualified Data.Text.Array as A import qualified Data.Text.Internal.Encoding.Fusion as E -import qualified Data.Text.Internal.Encoding.Utf16 as U16 import qualified Data.Text.Internal.Fusion as F import Data.Text.Internal.ByteStringCompat #if defined(ASSERTS) @@ -124,12 +124,12 @@ decodeLatin1 :: #endif ByteString -> Text decodeLatin1 bs = withBS bs aux where - aux fp len = text a 0 len + aux fp len = text a 0 actualLen where - a = A.run (A.new len >>= unsafeIOToST . go) - go dest = unsafeWithForeignPtr fp $ \ptr -> do - c_decode_latin1 (A.maBA dest) ptr (ptr `plusPtr` len) - return dest + (a, actualLen) = A.run2 (A.new (2 * len) >>= unsafeIOToST . go) + go dest = unsafeWithForeignPtr fp $ \src -> do + destLen <- c_decode_latin1 (A.maBA dest) src (src `plusPtr` len) + return (dest, destLen) -- | Decode a 'ByteString' containing UTF-8 encoded text. -- @@ -161,6 +161,8 @@ decodeUtf8With onErr bs = withBS bs aux case onErr desc (Just x) of Nothing -> loop $ curPtr' `plusPtr` 1 Just c + -- TODO This is problematic, because even BMP replacement characters + -- can take longer than one UTF8 code unit (which is byte). | c > '\xFFFF' -> throwUnsupportedReplChar | otherwise -> do destOff <- peek destOffPtr @@ -170,43 +172,14 @@ decodeUtf8With onErr bs = withBS bs aux poke destOffPtr (destOff + intToCSize w) loop $ curPtr' `plusPtr` 1 loop ptr - (unsafeIOToST . go) =<< A.new len + -- TODO (len * 2 + 100) assumes that invalid input is asymptotically rare. + -- This is incorrect in general, but for now we just want to pass tests. + (unsafeIOToST . go) =<< A.new (len * 2 + 100) where desc = "Data.Text.Internal.Encoding.decodeUtf8: Invalid UTF-8 stream" throwUnsupportedReplChar = throwIO $ ErrorCall "decodeUtf8With: non-BMP replacement characters not supported" - -- TODO: The code currently assumes that the transcoded UTF-16 - -- stream is at most twice as long (in bytes) as the input UTF-8 - -- stream. To justify this assumption one has to assume that the - -- error handler replacement character also satisfies this - -- invariant, by emitting at most one UTF16 code unit. - -- - -- One easy way to support the full range of code-points for - -- replacement characters in the error handler is to simply change - -- the (over-)allocation to `A.new (2*len)` and then shrink back the - -- `ByteArray#` to the real size (recent GHCs have a cheap - -- `ByteArray#` resize-primop for that which allow the GC to reclaim - -- the overallocation). However, this would require 4 times as much - -- (temporary) storage as the original UTF-8 required. - -- - -- Another strategy would be to optimistically assume that - -- replacement characters are within the BMP, and if the case of a - -- non-BMP replacement occurs reallocate the target buffer (or throw - -- an exception, and fallback to a pessimistic codepath, like e.g. - -- `decodeUtf8With onErr bs = F.unstream (E.streamUtf8 onErr bs)`) - -- - -- Alternatively, `OnDecodeError` could become a datastructure which - -- statically encodes the replacement-character range, - -- e.g. something isomorphic to - -- - -- Either (... -> Maybe Word16) (... -> Maybe Char) - -- - -- And allow to statically switch between the BMP/non-BMP - -- replacement-character codepaths. There's multiple ways to address - -- this with different tradeoffs; but ideally we should optimise for - -- the optimistic/error-free case. -{- INLINE[0] decodeUtf8With #-} -- $stream -- @@ -304,14 +277,15 @@ streamDecodeUtf8With :: streamDecodeUtf8With onErr = decodeChunk B.empty 0 0 where -- We create a slightly larger than necessary buffer to accommodate a - -- potential surrogate pair started in the last buffer (@undecoded0@), or + -- potential code point started in the last buffer (@undecoded0@), or -- replacement characters for each byte in @undecoded0@ if the -- sequence turns out to be invalid. There can be up to three bytes there, - -- hence we allocate @len+3@ 16-bit words. + -- hence we allocate @len+3@ bytes. decodeChunk :: ByteString -> CodePoint -> DecoderState -> ByteString -> Decoding decodeChunk undecoded0 codepoint0 state0 bs = withBS bs aux where - aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+3) + -- TODO Replace (+100) with something sensible. + aux fp len = runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+100) where decodeChunkToBuffer :: A.MArray s -> IO Decoding decodeChunkToBuffer dest = unsafeWithForeignPtr fp $ \ptr -> @@ -342,7 +316,7 @@ streamDecodeUtf8With onErr = decodeChunk B.empty 0 0 -- the previous chunk, we invalidate the bytes from -- @undecoded0@ and retry decoding the current chunk from -- the initial state. - traverse_ skipByte (B.unpack undecoded0 ) + traverse_ skipByte (B.unpack undecoded0) loop lastPtr else do peek lastPtr >>= skipByte @@ -436,50 +410,33 @@ encodeUtf8BuilderEscaped be = goPartial !iendTmp = go i0 op0 where go !i !op - | i < iendTmp = case A.unsafeIndex arr i of - w | w <= 0x7F -> do - BP.runB be (word16ToWord8 w) op >>= go (i + 1) - | w <= 0x7FF -> do - poke8 @Word16 0 $ (w `shiftR` 6) + 0xC0 - poke8 @Word16 1 $ (w .&. 0x3f) + 0x80 - go (i + 1) (op `plusPtr` 2) - | 0xD800 <= w && w <= 0xDBFF -> do - let c = ord $ U16.chr2 w (A.unsafeIndex arr (i+1)) - poke8 @Int 0 $ (c `shiftR` 18) + 0xF0 - poke8 @Int 1 $ ((c `shiftR` 12) .&. 0x3F) + 0x80 - poke8 @Int 2 $ ((c `shiftR` 6) .&. 0x3F) + 0x80 - poke8 @Int 3 $ (c .&. 0x3F) + 0x80 - go (i + 2) (op `plusPtr` 4) - | otherwise -> do - poke8 @Word16 0 $ (w `shiftR` 12) + 0xE0 - poke8 @Word16 1 $ ((w `shiftR` 6) .&. 0x3F) + 0x80 - poke8 @Word16 2 $ (w .&. 0x3F) + 0x80 - go (i + 1) (op `plusPtr` 3) - | otherwise = - outerLoop i (B.BufferRange op ope) + | i < iendTmp = case utf8LengthByLeader w of + 1 -> do + BP.runB be w op >>= go (i + 1) + 2 -> do + poke (op `plusPtr` 0) w + poke (op `plusPtr` 1) (A.unsafeIndex arr (i+1)) + go (i + 2) (op `plusPtr` 2) + 3 -> do + poke (op `plusPtr` 0) w + poke (op `plusPtr` 1) (A.unsafeIndex arr (i+1)) + poke (op `plusPtr` 2) (A.unsafeIndex arr (i+2)) + go (i + 3) (op `plusPtr` 3) + _ -> do + poke (op `plusPtr` 0) w + poke (op `plusPtr` 1) (A.unsafeIndex arr (i+1)) + poke (op `plusPtr` 2) (A.unsafeIndex arr (i+2)) + poke (op `plusPtr` 3) (A.unsafeIndex arr (i+3)) + go (i + 4) (op `plusPtr` 4) + | otherwise = outerLoop i (B.BufferRange op ope) where - -- Take care, a is either Word16 or Int above - poke8 :: Integral a => Int -> a -> IO () - poke8 j v = poke (op `plusPtr` j) (fromIntegral v :: Word8) + w = A.unsafeIndex arr i -- | Encode text using UTF-8 encoding. encodeUtf8 :: Text -> ByteString -encodeUtf8 (Text arr off len) +encodeUtf8 (Text (A.Array arr) off len) | len == 0 = B.empty - | otherwise = unsafeDupablePerformIO $ do - fp <- B.mallocByteString (len*3) -- see https://github.com/haskell/text/issues/194 for why len*3 is enough - unsafeWithForeignPtr fp $ \ptr -> - with ptr $ \destPtr -> do - c_encode_utf8 destPtr (A.aBA arr) (intToCSize off) (intToCSize len) - newDest <- peek destPtr - let utf8len = newDest `minusPtr` ptr - if utf8len >= len `shiftR` 1 - then return (mkBS fp utf8len) - else do - fp' <- B.mallocByteString utf8len - unsafeWithForeignPtr fp' $ \ptr' -> do - B.memcpy ptr' ptr utf8len - return (mkBS fp' utf8len) + | otherwise = B.take len $ B.drop off $ SBS.fromShort $ SBS.SBS arr -- | Decode text from little endian UTF-16 encoding. decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text @@ -563,9 +520,6 @@ cSizeToInt = fromIntegral intToCSize :: Int -> CSize intToCSize = fromIntegral -word16ToWord8 :: Word16 -> Word8 -word16ToWord8 = fromIntegral - foreign import ccall unsafe "_hs_text_decode_utf8" c_decode_utf8 :: MutableByteArray# s -> Ptr CSize -> Ptr Word8 -> Ptr Word8 -> IO (Ptr Word8) @@ -576,7 +530,4 @@ foreign import ccall unsafe "_hs_text_decode_utf8_state" c_decode_utf8_with_stat -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) foreign import ccall unsafe "_hs_text_decode_latin1" c_decode_latin1 - :: MutableByteArray# s -> Ptr Word8 -> Ptr Word8 -> IO () - -foreign import ccall unsafe "_hs_text_encode_utf8" c_encode_utf8 - :: Ptr (Ptr Word8) -> ByteArray# -> CSize -> CSize -> IO () + :: MutableByteArray# s -> Ptr Word8 -> Ptr Word8 -> IO Int diff --git a/src/Data/Text/Foreign.hs b/src/Data/Text/Foreign.hs index 65805b2f..87742011 100644 --- a/src/Data/Text/Foreign.hs +++ b/src/Data/Text/Foreign.hs @@ -14,7 +14,7 @@ module Data.Text.Foreign ( -- * Interoperability with native code -- $interop - I16 + I8 -- * Safe conversion functions , fromPtr , useAsPtr @@ -23,12 +23,12 @@ module Data.Text.Foreign , peekCStringLen , withCStringLen -- * Unsafe conversion code - , lengthWord16 + , lengthWord8 , unsafeCopyToPtr -- * Low-level manipulation -- $lowlevel - , dropWord16 - , takeWord16 + , dropWord8 + , takeWord8 ) where #if defined(ASSERTS) @@ -39,8 +39,8 @@ import Data.ByteString.Unsafe (unsafePackCStringLen, unsafeUseAsCStringLen) import Data.Text.Encoding (decodeUtf8, encodeUtf8) import Data.Text.Internal (Text(..), empty) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) -import Data.Text.Unsafe (lengthWord16) -import Data.Word (Word16) +import Data.Text.Unsafe (lengthWord8) +import Data.Word (Word8) import Foreign.C.String (CStringLen) import Foreign.ForeignPtr (ForeignPtr, mallocForeignPtrArray) import Foreign.Marshal.Alloc (allocaBytes) @@ -54,24 +54,22 @@ import qualified Data.Text.Array as A -- to have a fixed address in the Haskell heap. All communication with -- native code must thus occur by copying data back and forth. -- --- The 'Text' type's internal representation is UTF-16, using the --- platform's native endianness. This makes copied data suitable for --- use with native libraries that use a similar representation, such --- as ICU. To interoperate with native libraries that use different --- internal representations, such as UTF-8 or UTF-32, consider using +-- The 'Text' type's internal representation is UTF-8. +-- To interoperate with native libraries that use different +-- internal representations, such as UTF-16 or UTF-32, consider using -- the functions in the 'Data.Text.Encoding' module. --- | A type representing a number of UTF-16 code units. -newtype I16 = I16 Int +-- | A type representing a number of UTF-8 code units. +newtype I8 = I8 Int deriving (Bounded, Enum, Eq, Integral, Num, Ord, Read, Real, Show) --- | /O(n)/ Create a new 'Text' from a 'Ptr' 'Word16' by copying the +-- | /O(n)/ Create a new 'Text' from a 'Ptr' 'Word8' by copying the -- contents of the array. -fromPtr :: Ptr Word16 -- ^ source array - -> I16 -- ^ length of source array (in 'Word16' units) +fromPtr :: Ptr Word8 -- ^ source array + -> I8 -- ^ length of source array (in 'Word8' units) -> IO Text -fromPtr _ (I16 0) = return empty -fromPtr ptr (I16 len) = +fromPtr _ (I8 0) = return empty +fromPtr ptr (I8 len) = #if defined(ASSERTS) assert (len > 0) $ #endif @@ -83,72 +81,77 @@ fromPtr ptr (I16 len) = loop !p !i | i == len = return marr | otherwise = do A.unsafeWrite marr i =<< unsafeIOToST (peek p) - loop (p `plusPtr` 2) (i + 1) + loop (p `plusPtr` 1) (i + 1) -- $lowlevel -- --- Foreign functions that use UTF-16 internally may return indices in --- units of 'Word16' instead of characters. These functions may +-- Foreign functions that use UTF-8 internally may return indices in +-- units of 'Word8' instead of characters. These functions may -- safely be used with such indices, as they will adjust offsets if -- necessary to preserve the validity of a Unicode string. --- | /O(1)/ Return the prefix of the 'Text' of @n@ 'Word16' units in +-- | /O(1)/ Return the prefix of the 'Text' of @n@ 'Word8' units in -- length. -- --- If @n@ would cause the 'Text' to end inside a surrogate pair, the --- end of the prefix will be advanced by one additional 'Word16' unit +-- If @n@ would cause the 'Text' to end inside a code point, the +-- end of the prefix will be advanced by several additional 'Word8' units -- to maintain its validity. -takeWord16 :: I16 -> Text -> Text -takeWord16 (I16 n) t@(Text arr off len) - | n <= 0 = empty - | n >= len || m >= len = t - | otherwise = Text arr off m - where - m | w < 0xD800 || w > 0xDBFF = n - | otherwise = n+1 - w = A.unsafeIndex arr (off+n-1) +takeWord8 :: I8 -> Text -> Text +takeWord8 = (fst .) . splitAtWord8 --- | /O(1)/ Return the suffix of the 'Text', with @n@ 'Word16' units +-- | /O(1)/ Return the suffix of the 'Text', with @n@ 'Word8' units -- dropped from its beginning. -- --- If @n@ would cause the 'Text' to begin inside a surrogate pair, the --- beginning of the suffix will be advanced by one additional 'Word16' +-- If @n@ would cause the 'Text' to begin inside a code point, the +-- beginning of the suffix will be advanced by several additional 'Word8' -- unit to maintain its validity. -dropWord16 :: I16 -> Text -> Text -dropWord16 (I16 n) t@(Text arr off len) - | n <= 0 = t - | n >= len || m >= len = empty - | otherwise = Text arr (off+m) (len-m) +dropWord8 :: I8 -> Text -> Text +dropWord8 = (snd .) . splitAtWord8 + +splitAtWord8 :: I8 -> Text -> (Text, Text) +splitAtWord8 (I8 n) t@(Text arr off len) + | n <= 0 = (empty, t) + | n >= len || m >= len = (t, empty) + | otherwise = (Text arr off m, Text arr (off+m) (len-m)) where - m | w < 0xD800 || w > 0xDBFF = n - | otherwise = n+1 - w = A.unsafeIndex arr (off+n-1) + m | w0 < 0x80 = n -- last char is ASCII + | w0 >= 0xF0 = n+3 -- last char starts 4-byte sequence + | w0 >= 0xE0 = n+2 -- last char starts 3-byte sequence + | w0 >= 0xC0 = n+1 -- last char starts 2-byte sequence + | w1 >= 0xF0 = n+2 -- pre-last char starts 4-byte sequence + | w1 >= 0xE0 = n+1 -- pre-last char starts 3-byte sequence + | w1 >= 0xC0 = n -- pre-last char starts 2-byte sequence + | w2 >= 0xF0 = n+1 -- pre-pre-last char starts 4-byte sequence + | otherwise = n -- pre-pre-last char starts 3-byte sequence + w0 = A.unsafeIndex arr (off+n-1) + w1 = A.unsafeIndex arr (off+n-2) + w2 = A.unsafeIndex arr (off+n-3) -- | /O(n)/ Copy a 'Text' to an array. The array is assumed to be big -- enough to hold the contents of the entire 'Text'. -unsafeCopyToPtr :: Text -> Ptr Word16 -> IO () +unsafeCopyToPtr :: Text -> Ptr Word8 -> IO () unsafeCopyToPtr (Text arr off len) ptr = loop ptr off where end = off + len loop !p !i | i == end = return () | otherwise = do poke p (A.unsafeIndex arr i) - loop (p `plusPtr` 2) (i + 1) + loop (p `plusPtr` 1) (i + 1) -- | /O(n)/ Perform an action on a temporary, mutable copy of a -- 'Text'. The copy is freed as soon as the action returns. -useAsPtr :: Text -> (Ptr Word16 -> I16 -> IO a) -> IO a +useAsPtr :: Text -> (Ptr Word8 -> I8 -> IO a) -> IO a useAsPtr t@(Text _arr _off len) action = - allocaBytes (len * 2) $ \buf -> do + allocaBytes len $ \buf -> do unsafeCopyToPtr t buf - action (castPtr buf) (I16 len) + action (castPtr buf) (I8 len) -- | /O(n)/ Make a mutable copy of a 'Text'. -asForeignPtr :: Text -> IO (ForeignPtr Word16, I16) +asForeignPtr :: Text -> IO (ForeignPtr Word8, I8) asForeignPtr t@(Text _arr _off len) = do fp <- mallocForeignPtrArray len unsafeWithForeignPtr fp $ unsafeCopyToPtr t - return (fp, I16 len) + return (fp, I8 len) -- | /O(n)/ Decode a C string with explicit length, which is assumed -- to have been encoded as UTF-8. If decoding fails, a diff --git a/src/Data/Text/Internal.hs b/src/Data/Text/Internal.hs index 36fa36e0..b5a1d443 100644 --- a/src/Data/Text/Internal.hs +++ b/src/Data/Text/Internal.hs @@ -55,9 +55,9 @@ import qualified Data.Text.Array as A -- | A space efficient, packed, unboxed Unicode text type. data Text = Text - {-# UNPACK #-} !A.Array -- payload (Word16 elements) - {-# UNPACK #-} !Int -- offset (units of Word16, not Char) - {-# UNPACK #-} !Int -- length (units of Word16, not Char) + {-# UNPACK #-} !A.Array -- bytearray encoded as UTF-8 + {-# UNPACK #-} !Int -- offset in bytes (not in Char!), pointing to a start of UTF-8 sequence + {-# UNPACK #-} !Int -- length in bytes (not in Char!), pointing to an end of UTF-8 sequence deriving (Typeable) -- | Smart constructor. @@ -65,13 +65,16 @@ text_ :: #if defined(ASSERTS) HasCallStack => #endif - A.Array -> Int -> Int -> Text + A.Array -- ^ bytearray encoded as UTF-8 + -> Int -- ^ offset in bytes (not in Char!), pointing to a start of UTF-8 sequence + -> Int -- ^ length in bytes (not in Char!), pointing to an end of UTF-8 sequence + -> Text text_ arr off len = #if defined(ASSERTS) let c = A.unsafeIndex arr off in assert (len >= 0) . assert (off >= 0) . - assert (len == 0 || c < 0xDC00 || c > 0xDFFF) $ + assert (len == 0 || c < 0x80 || c >= 0xC0) $ #endif Text arr off len {-# INLINE text_ #-} @@ -92,7 +95,10 @@ text :: #if defined(ASSERTS) HasCallStack => #endif - A.Array -> Int -> Int -> Text + A.Array -- ^ bytearray encoded as UTF-8 + -> Int -- ^ offset in bytes (not in Char!), pointing to a start of UTF-8 sequence + -> Int -- ^ length in bytes (not in Char!), pointing to an end of UTF-8 sequence + -> Text text arr off len | len == 0 = empty | otherwise = text_ arr off len {-# INLINE text #-} @@ -109,7 +115,7 @@ showText (Text arr off len) = -- | Map a 'Char' to a 'Text'-safe value. -- --- UTF-16 surrogate code points are not included in the set of Unicode +-- Unicode 'Data.Char.Surrogate' code points are not included in the set of Unicode -- scalar values, but are unfortunately admitted as valid 'Char' -- values by Haskell. They cannot be represented in a 'Text'. This -- function remaps those code points to the Unicode replacement @@ -191,19 +197,17 @@ int64ToInt32 = fromIntegral -- $internals -- --- Internally, the 'Text' type is represented as an array of 'Word16' --- UTF-16 code units. The offset and length fields in the constructor +-- Internally, the 'Text' type is represented as an array of 'Word8' +-- UTF-8 code units. The offset and length fields in the constructor -- are in these units, /not/ units of 'Char'. -- -- Invariants that all functions must maintain: -- --- * Since the 'Text' type uses UTF-16 internally, it cannot represent +-- * Since the 'Text' type uses UTF-8 internally, it cannot represent -- characters in the reserved surrogate code point range U+D800 to -- U+DFFF. To maintain this invariant, the 'safe' function maps -- 'Char' values in this range to the replacement character (U+FFFD, -- \'�\'). -- --- * A leading (or \"high\") surrogate code unit (0xD800–0xDBFF) must --- always be followed by a trailing (or \"low\") surrogate code unit --- (0xDC00-0xDFFF). A trailing surrogate code unit must always be --- preceded by a leading surrogate code unit. +-- * Offset and length must point to a valid UTF-8 sequence of bytes. +-- Violation of this may cause memory access violation and divergence. diff --git a/src/Data/Text/Internal/Builder.hs b/src/Data/Text/Internal/Builder.hs index 3da66c42..9d181276 100644 --- a/src/Data/Text/Internal/Builder.hs +++ b/src/Data/Text/Internal/Builder.hs @@ -140,7 +140,7 @@ singleton :: HasCallStack => #endif Char -> Builder -singleton c = writeAtMost 2 $ \ marr o -> unsafeWrite marr o (safe c) +singleton c = writeAtMost 4 $ \ marr o -> unsafeWrite marr o (safe c) {-# INLINE singleton #-} ------------------------------------------------------------------------ @@ -185,7 +185,7 @@ fromString :: String -> Builder fromString str = Builder $ \k (Buffer p0 o0 u0 l0) -> let loop !marr !o !u !l [] = k (Buffer marr o u l) loop marr o u l s@(c:cs) - | l <= 1 = do + | l <= 3 = do arr <- A.unsafeFreeze marr let !t = Text arr o u marr' <- A.new chunkSize diff --git a/src/Data/Text/Internal/Encoding/Fusion.hs b/src/Data/Text/Internal/Encoding/Fusion.hs index b42315e0..aa8f0d02 100644 --- a/src/Data/Text/Internal/Encoding/Fusion.hs +++ b/src/Data/Text/Internal/Encoding/Fusion.hs @@ -43,7 +43,7 @@ import Data.Text.Internal.Fusion (Step(..), Stream(..)) import Data.Text.Internal.Fusion.Size import Data.Text.Encoding.Error import Data.Text.Internal.Encoding.Fusion.Common -import Data.Text.Internal.Unsafe.Char (unsafeChr, unsafeChr8, unsafeChr32) +import Data.Text.Internal.Unsafe.Char (unsafeChr8, unsafeChr16, unsafeChr32) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) import Data.Word (Word8, Word16, Word32) import Foreign.ForeignPtr (ForeignPtr) @@ -99,7 +99,7 @@ streamUtf16LE onErr bs = Stream next 0 (maxSize (l `shiftR` 1)) {-# INLINE next #-} next i | i >= l = Done - | i+1 < l && U16.validate1 x1 = Yield (unsafeChr x1) (i+2) + | i+1 < l && U16.validate1 x1 = Yield (unsafeChr16 x1) (i+2) | i+3 < l && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (i+4) | otherwise = decodeError "streamUtf16LE" "UTF-16LE" onErr Nothing (i+1) where @@ -117,7 +117,7 @@ streamUtf16BE onErr bs = Stream next 0 (maxSize (l `shiftR` 1)) {-# INLINE next #-} next i | i >= l = Done - | i+1 < l && U16.validate1 x1 = Yield (unsafeChr x1) (i+2) + | i+1 < l && U16.validate1 x1 = Yield (unsafeChr16 x1) (i+2) | i+3 < l && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (i+4) | otherwise = decodeError "streamUtf16BE" "UTF-16BE" onErr Nothing (i+1) where diff --git a/src/Data/Text/Internal/Encoding/Utf8.hs b/src/Data/Text/Internal/Encoding/Utf8.hs index f0a04fa7..fa69b7d9 100644 --- a/src/Data/Text/Internal/Encoding/Utf8.hs +++ b/src/Data/Text/Internal/Encoding/Utf8.hs @@ -5,6 +5,7 @@ -- Copyright : (c) 2008, 2009 Tom Harper, -- (c) 2009, 2010 Bryan O'Sullivan, -- (c) 2009 Duncan Coutts +-- (c) 2021 Andrew Lelechenko -- -- License : BSD-style -- Maintainer : bos@serpentine.com @@ -17,9 +18,10 @@ -- -- Basic UTF-8 validation and character manipulation. module Data.Text.Internal.Encoding.Utf8 - ( + ( utf8Length + , utf8LengthByLeader -- Decomposition - ord2 + , ord2 , ord3 , ord4 -- Construction @@ -34,7 +36,7 @@ module Data.Text.Internal.Encoding.Utf8 ) where import Data.Bits ((.&.), shiftR) -import Data.Text.Internal.Unsafe.Char (ord) +import Data.Char (ord) import GHC.Exts import GHC.Word (Word8(..)) @@ -52,6 +54,21 @@ between :: Word8 -- ^ byte to check between x y z = x >= y && x <= z {-# INLINE between #-} +-- TODO make branchless by looking into Word64 by clz (ord c) +utf8Length :: Char -> Int +utf8Length c + | ord c < 0x80 = 1 + | ord c < 0x800 = 2 + | ord c < 0x10000 = 3 + | otherwise = 4 + +utf8LengthByLeader :: Word8 -> Int +utf8LengthByLeader w + | w < 0x80 = 1 + | w < 0xE0 = 2 + | w < 0xF0 = 3 + | otherwise = 4 + ord2 :: Char -> (Word8,Word8) ord2 c = -- ord2 is used only in test suite to construct a deliberately invalid ByteString, diff --git a/src/Data/Text/Internal/Fusion.hs b/src/Data/Text/Internal/Fusion.hs index a96d2a17..01b781a1 100644 --- a/src/Data/Text/Internal/Fusion.hs +++ b/src/Data/Text/Internal/Fusion.hs @@ -50,19 +50,18 @@ module Data.Text.Internal.Fusion ) where import Prelude (Bool(..), Char, Maybe(..), Monad(..), Int, - Num(..), Ord(..), ($), (&&), - fromIntegral, otherwise) -import Data.Bits ((.&.), shiftL, shiftR) + Num(..), Ord(..), ($), + otherwise) +import Data.Bits (shiftL, shiftR) import Data.Text.Internal (Text(..)) import Data.Text.Internal.Private (runText) -import Data.Text.Internal.Unsafe.Char (ord, unsafeChr, unsafeWrite) +import Data.Text.Internal.Unsafe.Char (unsafeChr8, unsafeWrite) import qualified Data.Text.Array as A import qualified Data.Text.Internal.Fusion.Common as S import Data.Text.Internal.Fusion.Types import Data.Text.Internal.Fusion.Size import qualified Data.Text.Internal as I -import qualified Data.Text.Internal.Encoding.Utf16 as U16 -import Data.Word (Word16) +import qualified Data.Text.Internal.Encoding.Utf8 as U8 #if defined(ASSERTS) import GHC.Stack (HasCallStack) @@ -82,16 +81,24 @@ stream :: HasCallStack => #endif Text -> Stream Char -stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 1) len) +stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 2) len) where !end = off+len next !i - | i >= end = Done - | n >= 0xD800 && n <= 0xDBFF = Yield (U16.chr2 n n2) (i + 2) - | otherwise = Yield (unsafeChr n) (i + 1) + | i >= end = Done + | otherwise = Yield chr (i + l) where - n = A.unsafeIndex arr i - n2 = A.unsafeIndex arr (i + 1) + n0 = A.unsafeIndex arr i + n1 = A.unsafeIndex arr (i + 1) + n2 = A.unsafeIndex arr (i + 2) + n3 = A.unsafeIndex arr (i + 3) + + l = U8.utf8LengthByLeader n0 + chr = case l of + 1 -> unsafeChr8 n0 + 2 -> U8.chr2 n0 n1 + 3 -> U8.chr3 n0 n1 n2 + _ -> U8.chr4 n0 n1 n2 n3 {-# INLINE [0] stream #-} -- | /O(n)/ Converts 'Text' into a 'Stream' 'Char', but iterates @@ -101,16 +108,20 @@ stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 1) len) -- -- @'unstream' . 'reverseStream' = 'Data.Text.reverse' @ reverseStream :: Text -> Stream Char -reverseStream (Text arr off len) = Stream next (off+len-1) (betweenSize (len `shiftR` 1) len) +reverseStream (Text arr off len) = Stream next (off+len-1) (betweenSize (len `shiftR` 2) len) where {-# INLINE next #-} next !i - | i < off = Done - | n >= 0xDC00 && n <= 0xDFFF = Yield (U16.chr2 n2 n) (i - 2) - | otherwise = Yield (unsafeChr n) (i - 1) + | i < off = Done + | n0 < 0x80 = Yield (unsafeChr8 n0) (i - 1) + | n1 >= 0xC0 = Yield (U8.chr2 n1 n0) (i - 2) + | n2 >= 0xC0 = Yield (U8.chr3 n2 n1 n0) (i - 3) + | otherwise = Yield (U8.chr4 n3 n2 n1 n0) (i - 4) where - n = A.unsafeIndex arr i - n2 = A.unsafeIndex arr (i - 1) + n0 = A.unsafeIndex arr i + n1 = A.unsafeIndex arr (i - 1) + n2 = A.unsafeIndex arr (i - 2) + n3 = A.unsafeIndex arr (i - 3) {-# INLINE [0] reverseStream #-} -- | /O(n)/ Convert 'Stream' 'Char' into a 'Text'. @@ -123,10 +134,10 @@ reverseStream (Text arr off len) = Stream next (off+len-1) (betweenSize (len `sh unstream :: Stream Char -> Text unstream (Stream next0 s0 len) = runText $ \done -> do -- Before encoding each char we perform a buffer realloc check assuming - -- worst case encoding size of two 16-bit units for the char. Just add an + -- worst case encoding size of four 8-bit units for the char. Just add an -- extra space to the buffer so that we do not end up reallocating even when -- all the chars are encoded as single unit. - let mlen = upperBound 4 len + 1 + let mlen = upperBound 4 len + 3 arr0 <- A.new mlen let outer !arr !maxi = encode where @@ -137,7 +148,7 @@ unstream (Stream next0 s0 len) = runText $ \done -> do Skip si' -> encode si' di Yield c si' -- simply check for the worst case - | maxi < di + 1 -> realloc si di + | maxi < di + 3 -> realloc si di | otherwise -> do n <- unsafeWrite arr di c encode si' (di + n) @@ -192,22 +203,12 @@ reverse (Stream next s len0) let newLen = len `shiftL` 1 marr' <- A.new newLen A.copyM marr' (newLen-len) marr 0 len - write s1 (len+i) newLen marr' - | otherwise -> write s1 i len marr - where n = ord x - least | n < 0x10000 = 0 - | otherwise = 1 - m = n - 0x10000 - lo = intToWord16 $ (m `shiftR` 10) + 0xD800 - hi = intToWord16 $ (m .&. 0x3FF) + 0xDC00 - write t j l mar - | n < 0x10000 = do - A.unsafeWrite mar j (intToWord16 n) - loop t (j-1) l mar - | otherwise = do - A.unsafeWrite mar (j-1) lo - A.unsafeWrite mar j hi - loop t (j-2) l mar + _ <- unsafeWrite marr' (len + i - least) x + loop s1 (len + i - least - 1) newLen marr' + | otherwise -> do + _ <- unsafeWrite marr (i - least) x + loop s1 (i - least - 1) len marr + where least = U8.utf8Length x - 1 {-# INLINE [0] reverse #-} -- | /O(n)/ Perform the equivalent of 'scanr' over a list, only with @@ -304,9 +305,5 @@ mapAccumL f z0 (Stream next0 s0 len) = (nz, I.text na 0 nl) | otherwise -> do d <- unsafeWrite arr i c loop z' s' (i+d) where (z',c) = f z x - j | ord c < 0x10000 = i - | otherwise = i + 1 + j = i + U8.utf8Length c - 1 {-# INLINE [0] mapAccumL #-} - -intToWord16 :: Int -> Word16 -intToWord16 = fromIntegral diff --git a/src/Data/Text/Internal/Fusion/Common.hs b/src/Data/Text/Internal/Fusion/Common.hs index d9f83498..dc19cf1b 100644 --- a/src/Data/Text/Internal/Fusion/Common.hs +++ b/src/Data/Text/Internal/Fusion/Common.hs @@ -124,15 +124,17 @@ import Prelude (Bool(..), Char, Eq(..), Int, Integral, Maybe(..), (&&), fromIntegral, otherwise) import qualified Data.List as L import qualified Prelude as P -import Data.Bits (shiftL) import Data.Char (isLetter, isSpace) import Data.Int (Int64) +import Data.Text.Internal.Encoding.Utf8 (chr2, chr3, chr4, utf8LengthByLeader) import Data.Text.Internal.Fusion.Types import Data.Text.Internal.Fusion.CaseMapping (foldMapping, lowerMapping, titleMapping, upperMapping) import Data.Text.Internal.Fusion.Size -import GHC.Prim (Addr#, chr#, indexCharOffAddr#, ord#) -import GHC.Types (Char(..), Int(..)) +import GHC.Prim (Addr#, indexWord8OffAddr#) +import GHC.Types (Int(..)) +import Data.Text.Internal.Unsafe.Char (unsafeChr8) +import GHC.Word -- | /O(1)/ Convert a character into a 'Stream' -- @@ -185,23 +187,16 @@ streamCString# addr = Stream step 0 unknownSize where step !i | b == 0 = Done - | b <= 0x7f = Yield (C# b#) (i+1) - | b <= 0xdf = let !c = chr $ ((b-0xc0) `shiftL` 6) + next 1 - in Yield c (i+2) - | b <= 0xef = let !c = chr $ ((b-0xe0) `shiftL` 12) + - (next 1 `shiftL` 6) + - next 2 - in Yield c (i+3) - | otherwise = let !c = chr $ ((b-0xf0) `shiftL` 18) + - (next 1 `shiftL` 12) + - (next 2 `shiftL` 6) + - next 3 - in Yield c (i+4) - where b = I# (ord# b#) - next n = I# (ord# (at# (i+n))) - 0x80 - !b# = at# i - at# (I# i#) = indexCharOffAddr# addr i# - chr (I# i#) = C# (chr# i#) + | otherwise = Yield chr (i + l) + where b = at# i + l = utf8LengthByLeader b + next n = at# (i+n) + chr = case l of + 1 -> unsafeChr8 b + 2 -> chr2 b (next 1) + 3 -> chr3 b (next 1) (next 2) + _ -> chr4 b (next 1) (next 2) (next 3) + at# (I# i#) = W8# (indexWord8OffAddr# addr i#) {-# INLINE [0] streamCString# #-} -- ---------------------------------------------------------------------------- diff --git a/src/Data/Text/Internal/Fusion/Size.hs b/src/Data/Text/Internal/Fusion/Size.hs index 50118c97..d6420555 100644 --- a/src/Data/Text/Internal/Fusion/Size.hs +++ b/src/Data/Text/Internal/Fusion/Size.hs @@ -37,13 +37,13 @@ module Data.Text.Internal.Fusion.Size , isEmpty ) where -import Data.Char (ord) +import Data.Text.Internal.Encoding.Utf8 (utf8Length) import Data.Text.Internal (mul) #if defined(ASSERTS) import Control.Exception (assert) #endif --- | A size in UTF-16 code units. +-- | A size in UTF-8 code units (which is bytes). data Size = Between {-# UNPACK #-} !Int {-# UNPACK #-} !Int -- ^ Lower and upper bounds on size. | Unknown -- ^ Unknown size. deriving (Eq, Show) @@ -55,9 +55,7 @@ exactly _ = Nothing -- | The 'Size' of the given code point. charSize :: Char -> Size -charSize c - | ord c < 0x10000 = exactSize 1 - | otherwise = exactSize 2 +charSize = exactSize . utf8Length -- | The 'Size' of @n@ code points. codePointsSize :: Int -> Size @@ -65,7 +63,7 @@ codePointsSize n = #if defined(ASSERTS) assert (n >= 0) #endif - Between n (2*n) + Between n (4*n) {-# INLINE codePointsSize #-} exactSize :: Int -> Size @@ -160,7 +158,7 @@ upperBound _ (Between _ n) = n upperBound k _ = k {-# INLINE upperBound #-} --- | Compute the maximum size from a size hint, if possible. +-- | Compute the minimum size from a size hint, if possible. lowerBound :: Int -> Size -> Int lowerBound _ (Between n _) = n lowerBound k _ = k diff --git a/src/Data/Text/Internal/Fusion/Types.hs b/src/Data/Text/Internal/Fusion/Types.hs index df05773d..b97784f8 100644 --- a/src/Data/Text/Internal/Fusion/Types.hs +++ b/src/Data/Text/Internal/Fusion/Types.hs @@ -75,7 +75,7 @@ instance (Ord a) => Ord (Stream a) where -- unstreaming functions must be able to cope with the hint being too -- small or too large. -- --- The size hint tries to track the UTF-16 code units in a stream, +-- The size hint tries to track the UTF-8 code units in a stream, -- but often counts the number of code points instead. It can easily -- undercount if, for instance, a transformed stream contains astral -- plane code points (those above 0x10000). diff --git a/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs b/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs index dba6db34..69149779 100644 --- a/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs +++ b/src/Data/Text/Internal/Lazy/Encoding/Fusion.hs @@ -41,7 +41,7 @@ import Data.Text.Internal.Encoding.Fusion.Common import Data.Text.Encoding.Error import Data.Text.Internal.Fusion (Step(..), Stream(..)) import Data.Text.Internal.Fusion.Size -import Data.Text.Internal.Unsafe.Char (unsafeChr, unsafeChr8, unsafeChr32) +import Data.Text.Internal.Unsafe.Char (unsafeChr8, unsafeChr16, unsafeChr32) import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) import Data.Word (Word8, Word16, Word32) import qualified Data.Text.Internal.Encoding.Utf8 as U8 @@ -112,7 +112,7 @@ streamUtf16LE onErr bs0 = Stream next (T bs0 S0 0) unknownSize where next (T bs@(Chunk ps _) S0 i) | i + 1 < len && U16.validate1 x1 = - Yield (unsafeChr x1) (T bs S0 (i+2)) + Yield (unsafeChr16 x1) (T bs S0 (i+2)) | i + 3 < len && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (T bs S0 (i+4)) where len = B.length ps @@ -123,7 +123,7 @@ streamUtf16LE onErr bs0 = Stream next (T bs0 S0 0) unknownSize next st@(T bs s i) = case s of S2 w1 w2 | U16.validate1 (c w1 w2) -> - Yield (unsafeChr (c w1 w2)) es + Yield (unsafeChr16 (c w1 w2)) es S4 w1 w2 w3 w4 | U16.validate2 (c w1 w2) (c w3 w4) -> Yield (U16.chr2 (c w1 w2) (c w3 w4)) es _ -> consume st @@ -152,7 +152,7 @@ streamUtf16BE onErr bs0 = Stream next (T bs0 S0 0) unknownSize where next (T bs@(Chunk ps _) S0 i) | i + 1 < len && U16.validate1 x1 = - Yield (unsafeChr x1) (T bs S0 (i+2)) + Yield (unsafeChr16 x1) (T bs S0 (i+2)) | i + 3 < len && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (T bs S0 (i+4)) where len = B.length ps @@ -163,7 +163,7 @@ streamUtf16BE onErr bs0 = Stream next (T bs0 S0 0) unknownSize next st@(T bs s i) = case s of S2 w1 w2 | U16.validate1 (c w1 w2) -> - Yield (unsafeChr (c w1 w2)) es + Yield (unsafeChr16 (c w1 w2)) es S4 w1 w2 w3 w4 | U16.validate2 (c w1 w2) (c w3 w4) -> Yield (U16.chr2 (c w1 w2) (c w3 w4)) es _ -> consume st diff --git a/src/Data/Text/Internal/Lazy/Fusion.hs b/src/Data/Text/Internal/Lazy/Fusion.hs index c9b71c86..867d0ac4 100644 --- a/src/Data/Text/Internal/Lazy/Fusion.hs +++ b/src/Data/Text/Internal/Lazy/Fusion.hs @@ -79,8 +79,8 @@ unstreamChunks !chunkSize (Stream next s0 len0) where unknownLength = 4 where inner marr !len s !i - | i + 1 >= chunkSize = finish marr i s - | i + 1 >= len = {-# SCC "unstreamChunks/resize" #-} do + | i + 3 >= chunkSize = finish marr i s + | i + 3 >= len = {-# SCC "unstreamChunks/resize" #-} do let newLen = min (len `shiftL` 1) chunkSize marr' <- A.new newLen A.copyM marr' 0 marr 0 len diff --git a/src/Data/Text/Internal/Lazy/Search.hs b/src/Data/Text/Internal/Lazy/Search.hs index 72930482..78450cbf 100644 --- a/src/Data/Text/Internal/Lazy/Search.hs +++ b/src/Data/Text/Internal/Lazy/Search.hs @@ -25,7 +25,7 @@ module Data.Text.Internal.Lazy.Search import Data.Bits (unsafeShiftL) import qualified Data.Text.Array as A import Data.Int (Int64) -import Data.Word (Word16, Word64) +import Data.Word (Word8, Word64) import qualified Data.Text.Internal as T import Data.Text.Internal.Fusion.Types (PairS(..)) import Data.Text.Internal.Lazy (Text(..), foldlChunks) @@ -75,8 +75,8 @@ indices needle@(Chunk n ns) _haystack@(Chunk k ks) where fin _ (T.Text farr foff flen) = A.unsafeIndex farr (foff+flen-1) (mask :: Word64) :*: skip = buildTable n ns 0 0 0 (nlen-2) - swizzle :: Word16 -> Word64 - swizzle w = 1 `unsafeShiftL` (word16ToInt w .&. 0x3f) + swizzle :: Word8 -> Word64 + swizzle w = 1 `unsafeShiftL` (word8ToInt w .&. 0x3f) buildTable (T.Text xarr xoff xlen) xs = go where @@ -105,7 +105,7 @@ indices _ _ = [] -- | Fast index into a partly unpacked 'Text'. We take into account -- the possibility that the caller might try to access one element -- past the end. -index :: T.Text -> Text -> Int64 -> Word16 +index :: T.Text -> Text -> Int64 -> Word8 index (T.Text arr off len) xs !i | j < len = A.unsafeIndex arr (off+j) | otherwise = case xs of @@ -117,8 +117,8 @@ index (T.Text arr off len) xs !i Chunk c cs -> index c cs (i-intToInt64 len) where j = int64ToInt i --- | A variant of 'indices' that scans linearly for a single 'Word16'. -indicesOne :: Word16 -> Int64 -> T.Text -> Text -> [Int64] +-- | A variant of 'indices' that scans linearly for a single 'Word8'. +indicesOne :: Word8 -> Int64 -> T.Text -> Text -> [Int64] indicesOne c = chunk where chunk :: Int64 -> T.Text -> Text -> [Int64] @@ -131,7 +131,7 @@ indicesOne c = chunk | otherwise = go (h+1) where on = A.unsafeIndex oarr (ooff+h) --- | The number of 'Word16' values in a 'Text'. +-- | The number of 'Word8' values in a 'Text'. wordLength :: Text -> Int64 wordLength = foldlChunks sumLength 0 where @@ -147,5 +147,5 @@ intToInt64 = fromIntegral int64ToInt :: Int64 -> Int int64ToInt = fromIntegral -word16ToInt :: Word16 -> Int -word16ToInt = fromIntegral +word8ToInt :: Word8 -> Int +word8ToInt = fromIntegral diff --git a/src/Data/Text/Internal/Search.hs b/src/Data/Text/Internal/Search.hs index 0aaab2ab..5688917f 100644 --- a/src/Data/Text/Internal/Search.hs +++ b/src/Data/Text/Internal/Search.hs @@ -32,7 +32,7 @@ module Data.Text.Internal.Search ) where import qualified Data.Text.Array as A -import Data.Word (Word64, Word16) +import Data.Word (Word64, Word8) import Data.Text.Internal (Text(..)) import Data.Bits ((.|.), (.&.), unsafeShiftL) @@ -67,8 +67,8 @@ indices _needle@(Text narr noff nlen) _haystack@(Text harr hoff hlen) skp' | c == z = nlen - i - 2 | otherwise = skp - swizzle :: Word16 -> Word64 - swizzle k = 1 `unsafeShiftL` (word16ToInt k .&. 0x3f) + swizzle :: Word8 -> Word64 + swizzle k = 1 `unsafeShiftL` (word8ToInt k .&. 0x3f) scan !i | i > ldiff = [] @@ -90,5 +90,5 @@ indices _needle@(Text narr noff nlen) _haystack@(Text harr hoff hlen) | otherwise = loop (i+1) {-# INLINE indices #-} -word16ToInt :: Word16 -> Int -word16ToInt = fromIntegral +word8ToInt :: Word8 -> Int +word8ToInt = fromIntegral diff --git a/src/Data/Text/Internal/Unsafe/Char.hs b/src/Data/Text/Internal/Unsafe/Char.hs index 7982e276..3f3372c8 100644 --- a/src/Data/Text/Internal/Unsafe/Char.hs +++ b/src/Data/Text/Internal/Unsafe/Char.hs @@ -19,14 +19,14 @@ module Data.Text.Internal.Unsafe.Char ( ord - , unsafeChr + , unsafeChr16 , unsafeChr8 , unsafeChr32 , unsafeWrite ) where import Control.Monad.ST (ST) -import Data.Bits ((.&.), shiftR) +import Data.Text.Internal.Encoding.Utf8 import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#) import GHC.Word (Word8(..), Word16(..), Word32(..)) import qualified Data.Text.Array as A @@ -39,9 +39,9 @@ ord :: Char -> Int ord (C# c#) = I# (ord# c#) {-# INLINE ord #-} -unsafeChr :: Word16 -> Char -unsafeChr (W16# w#) = C# (chr# (word2Int# (word16ToWord# w#))) -{-# INLINE unsafeChr #-} +unsafeChr16 :: Word16 -> Char +unsafeChr16 (W16# w#) = C# (chr# (word2Int# (word16ToWord# w#))) +{-# INLINE unsafeChr16 #-} unsafeChr8 :: Word8 -> Char unsafeChr8 (W8# w#) = C# (chr# (word2Int# (word8ToWord# w#))) @@ -52,25 +52,36 @@ unsafeChr32 (W32# w#) = C# (chr# (word2Int# (word32ToWord# w#))) {-# INLINE unsafeChr32 #-} -- | Write a character into the array at the given offset. Returns --- the number of 'Word16's written. +-- the number of 'Word8's written. unsafeWrite :: #if defined(ASSERTS) HasCallStack => #endif A.MArray s -> Int -> Char -> ST s Int -unsafeWrite marr i c - | n < 0x10000 = do - A.unsafeWrite marr i (intToWord16 n) +unsafeWrite marr i c = case utf8Length c of + 1 -> do + let n0 = intToWord8 (ord c) + A.unsafeWrite marr i n0 return 1 - | otherwise = do - A.unsafeWrite marr i lo - A.unsafeWrite marr (i+1) hi + 2 -> do + let (n0, n1) = ord2 c + A.unsafeWrite marr i n0 + A.unsafeWrite marr (i+1) n1 return 2 - where n = ord c - m = n - 0x10000 - lo = intToWord16 $ (m `shiftR` 10) + 0xD800 - hi = intToWord16 $ (m .&. 0x3FF) + 0xDC00 + 3 -> do + let (n0, n1, n2) = ord3 c + A.unsafeWrite marr i n0 + A.unsafeWrite marr (i+1) n1 + A.unsafeWrite marr (i+2) n2 + return 3 + _ -> do + let (n0, n1, n2, n3) = ord4 c + A.unsafeWrite marr i n0 + A.unsafeWrite marr (i+1) n1 + A.unsafeWrite marr (i+2) n2 + A.unsafeWrite marr (i+3) n3 + return 4 {-# INLINE unsafeWrite #-} -intToWord16 :: Int -> Word16 -intToWord16 = fromIntegral +intToWord8 :: Int -> Word8 +intToWord8 = fromIntegral diff --git a/src/Data/Text/Lazy.hs b/src/Data/Text/Lazy.hs index 8f67b653..bb716286 100644 --- a/src/Data/Text/Lazy.hs +++ b/src/Data/Text/Lazy.hs @@ -253,7 +253,8 @@ import GHC.Stack (HasCallStack) -- points -- (<http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#page=13 §3.4, definition D10 >) -- as 'Char' values, including code points from this invalid range. --- This means that there are some 'Char' values that are not valid +-- This means that there are some 'Char' values +-- (corresponding to 'Data.Char.Surrogate' category) that are not valid -- Unicode scalar values, and the functions in this module must handle -- those cases. -- @@ -262,12 +263,7 @@ import GHC.Stack (HasCallStack) -- that are not valid Unicode scalar values with the replacement -- character \"�\" (U+FFFD). Functions that perform this -- inspection and replacement are documented with the phrase --- \"Performs replacement on invalid scalar values\". --- --- (One reason for this policy of replacement is that internally, a --- 'Text' value is represented as packed UTF-16 data. Values in the --- range U+D800 through U+DFFF are used by UTF-16 to denote surrogate --- code points, and so cannot be represented. The functions replace +-- \"Performs replacement on invalid scalar values\". The functions replace -- invalid scalar values, instead of dropping them, as a security -- measure. For details, see -- <http://unicode.org/reports/tr36/#Deletion_of_Noncharacters Unicode Technical Report 36, §3.5 >.) @@ -283,13 +279,13 @@ equal Empty _ = False equal _ Empty = False equal (Chunk a as) (Chunk b bs) = case compare lenA lenB of - LT -> a == (T.takeWord16 lenA b) && - as `equal` Chunk (T.dropWord16 lenA b) bs + LT -> a == (T.takeWord8 lenA b) && + as `equal` Chunk (T.dropWord8 lenA b) bs EQ -> a == b && as `equal` bs - GT -> T.takeWord16 lenB a == b && - Chunk (T.dropWord16 lenB a) as `equal` bs - where lenA = T.lengthWord16 a - lenB = T.lengthWord16 b + GT -> T.takeWord8 lenB a == b && + Chunk (T.dropWord8 lenB a) as `equal` bs + where lenA = T.lengthWord8 a + lenB = T.lengthWord8 b instance Eq Text where (==) = equal @@ -1040,9 +1036,9 @@ dropEnd n t0 T.dropEnd (int64ToInt m) t : ts where l = intToInt64 (T.length t) --- | /O(n)/ 'dropWords' @n@ returns the suffix with @n@ 'Word16' +-- | /O(n)/ 'dropWords' @n@ returns the suffix with @n@ 'Word8' -- values dropped, or the empty 'Text' if @n@ is greater than the --- number of 'Word16' values present. +-- number of 'Word8' values present. dropWords :: Int64 -> Text -> Text dropWords i t0 | i <= 0 = t0 @@ -1082,7 +1078,7 @@ takeWhileEnd :: (Char -> Bool) -> Text -> Text takeWhileEnd p = takeChunk empty . L.reverse . toChunks where takeChunk acc [] = acc takeChunk acc (t:ts) - | T.lengthWord16 t' < T.lengthWord16 t + | T.lengthWord8 t' < T.lengthWord8 t = chunk t' acc | otherwise = takeChunk (Chunk t' acc) ts where t' = T.takeWhileEnd p t @@ -1164,7 +1160,7 @@ splitAt = loop where len = intToInt64 (T.length t) -- | /O(n)/ 'splitAtWord' @n t@ returns a strict pair whose first --- element is a prefix of @t@ whose chunks contain @n@ 'Word16' +-- element is a prefix of @t@ whose chunks contain @n@ 'Word8' -- values, and whose second is the remainder of the string. splitAtWord :: Int64 -> Text -> PairS Text Text splitAtWord _ Empty = empty :*: empty diff --git a/src/Data/Text/Lazy/Builder/Int.hs b/src/Data/Text/Lazy/Builder/Int.hs index a6a79d64..f861d1e5 100644 --- a/src/Data/Text/Lazy/Builder/Int.hs +++ b/src/Data/Text/Lazy/Builder/Int.hs @@ -102,15 +102,15 @@ posDecimal marr off0 ds v0 = go (off0 + ds - 1) v0 let i = fromIntegral i0; j = i + i unsafeWrite marr off $ get (j + 1) unsafeWrite marr (off - 1) $ get j - get = word8ToWord16 . B.unsafeIndex digits + get = B.unsafeIndex digits -minus, zero :: Word16 +minus, zero :: Word8 {-# INLINE minus #-} {-# INLINE zero #-} minus = 45 zero = 48 -i2w :: (Integral a) => a -> Word16 +i2w :: (Integral a) => a -> Word8 {-# INLINE i2w #-} i2w v = zero + fromIntegral v @@ -242,6 +242,3 @@ integer base i | otherwise = loop (d-1) q <> hexDigit r where q = n `quotInt` base r = n `remInt` base - -word8ToWord16 :: Word8 -> Word16 -word8ToWord16 = fromIntegral diff --git a/src/Data/Text/Show.hs b/src/Data/Text/Show.hs index 8d7ceb0e..9baeaeee 100644 --- a/src/Data/Text/Show.hs +++ b/src/Data/Text/Show.hs @@ -21,6 +21,7 @@ module Data.Text.Show import Control.Monad.ST (ST) import Data.Text.Internal (Text(..), empty_, safe) +import Data.Text.Internal.Encoding.Utf8 (utf8Length) import Data.Text.Internal.Fusion (stream, unstream) import Data.Text.Internal.Unsafe.Char (unsafeWrite) import GHC.Prim (Addr#) @@ -95,7 +96,6 @@ singleton_ c = Text (A.run x) 0 len x = do arr <- A.new len _ <- unsafeWrite arr 0 d return arr - len | d < '\x10000' = 1 - | otherwise = 2 + len = utf8Length d d = safe c {-# NOINLINE singleton_ #-} diff --git a/src/Data/Text/Unsafe.hs b/src/Data/Text/Unsafe.hs index 88727074..64cc83c4 100644 --- a/src/Data/Text/Unsafe.hs +++ b/src/Data/Text/Unsafe.hs @@ -20,19 +20,19 @@ module Data.Text.Unsafe , reverseIter_ , unsafeHead , unsafeTail - , lengthWord16 - , takeWord16 - , dropWord16 + , lengthWord8 + , takeWord8 + , dropWord8 ) where #if defined(ASSERTS) import Control.Exception (assert) import GHC.Stack (HasCallStack) #endif -import Data.Text.Internal.Encoding.Utf16 (chr2) +import Data.Text.Internal.Encoding.Utf8 (chr2, chr3, chr4, utf8LengthByLeader) import Data.Text.Internal (Text(..)) import Data.Text.Internal.Unsafe (inlineInterleaveST, inlinePerformIO) -import Data.Text.Internal.Unsafe.Char (unsafeChr) +import Data.Text.Internal.Unsafe.Char (unsafeChr8) import qualified Data.Text.Array as A import GHC.IO (unsafeDupablePerformIO) @@ -40,11 +40,15 @@ import GHC.IO (unsafeDupablePerformIO) -- omits the check for the empty case, so there is an obligation on -- the programmer to provide a proof that the 'Text' is non-empty. unsafeHead :: Text -> Char -unsafeHead (Text arr off _len) - | m < 0xD800 || m > 0xDBFF = unsafeChr m - | otherwise = chr2 m n - where m = A.unsafeIndex arr off - n = A.unsafeIndex arr (off+1) +unsafeHead (Text arr off _len) = case utf8LengthByLeader m0 of + 1 -> unsafeChr8 m0 + 2 -> chr2 m0 m1 + 3 -> chr3 m0 m1 m2 + _ -> chr4 m0 m1 m2 m3 + where m0 = A.unsafeIndex arr off + m1 = A.unsafeIndex arr (off+1) + m2 = A.unsafeIndex arr (off+2) + m3 = A.unsafeIndex arr (off+3) {-# INLINE unsafeHead #-} -- | /O(1)/ A variant of 'tail' for non-empty 'Text'. 'unsafeTail' @@ -60,8 +64,9 @@ unsafeTail t@(Text arr off len) = {-# INLINE unsafeTail #-} data Iter = Iter {-# UNPACK #-} !Char {-# UNPACK #-} !Int + deriving (Show) --- | /O(1)/ Iterate (unsafely) one step forwards through a UTF-16 +-- | /O(1)/ Iterate (unsafely) one step forwards through a UTF-8 -- array, returning the current character and the delta to add to give -- the next offset to iterate at. iter :: @@ -69,61 +74,73 @@ iter :: HasCallStack => #endif Text -> Int -> Iter -iter (Text arr off _len) i - | m < 0xD800 || m > 0xDBFF = Iter (unsafeChr m) 1 - | otherwise = Iter (chr2 m n) 2 - where m = A.unsafeIndex arr j - n = A.unsafeIndex arr k +iter (Text arr off _len) i = Iter chr l + where m0 = A.unsafeIndex arr j + m1 = A.unsafeIndex arr (j+1) + m2 = A.unsafeIndex arr (j+2) + m3 = A.unsafeIndex arr (j+3) j = off + i - k = j + 1 + l = utf8LengthByLeader m0 + chr = case l of + 1 -> unsafeChr8 m0 + 2 -> chr2 m0 m1 + 3 -> chr3 m0 m1 m2 + _ -> chr4 m0 m1 m2 m3 {-# INLINE iter #-} --- | /O(1)/ Iterate one step through a UTF-16 array, returning the +-- | /O(1)/ Iterate one step through a UTF-8 array, returning the -- delta to add to give the next offset to iterate at. iter_ :: Text -> Int -> Int -iter_ (Text arr off _len) i | m < 0xD800 || m > 0xDBFF = 1 - | otherwise = 2 +iter_ (Text arr off _len) i = utf8LengthByLeader m where m = A.unsafeIndex arr (off+i) {-# INLINE iter_ #-} --- | /O(1)/ Iterate one step backwards through a UTF-16 array, +-- | /O(1)/ Iterate one step backwards through a UTF-8 array, -- returning the current character and the delta to add (i.e. a -- negative number) to give the next offset to iterate at. reverseIter :: Text -> Int -> Iter reverseIter (Text arr off _len) i - | m < 0xDC00 || m > 0xDFFF = Iter (unsafeChr m) (-1) - | otherwise = Iter (chr2 n m) (-2) - where m = A.unsafeIndex arr j - n = A.unsafeIndex arr k + | m0 < 0x80 = Iter (unsafeChr8 m0) (-1) + | m1 >= 0xC0 = Iter (chr2 m1 m0) (-2) + | m2 >= 0xC0 = Iter (chr3 m2 m1 m0) (-3) + | otherwise = Iter (chr4 m3 m2 m1 m0) (-4) + where m0 = A.unsafeIndex arr j + m1 = A.unsafeIndex arr (j-1) + m2 = A.unsafeIndex arr (j-2) + m3 = A.unsafeIndex arr (j-3) j = off + i - k = j - 1 {-# INLINE reverseIter #-} --- | /O(1)/ Iterate one step backwards through a UTF-16 array, +-- | /O(1)/ Iterate one step backwards through a UTF-8 array, -- returning the delta to add (i.e. a negative number) to give the -- next offset to iterate at. -- -- @since 1.1.1.0 reverseIter_ :: Text -> Int -> Int reverseIter_ (Text arr off _len) i - | m < 0xDC00 || m > 0xDFFF = -1 - | otherwise = -2 - where m = A.unsafeIndex arr (off+i) + | m0 < 0x80 = -1 + | m1 >= 0xC0 = -2 + | m2 >= 0xC0 = -3 + | otherwise = -4 + where m0 = A.unsafeIndex arr j + m1 = A.unsafeIndex arr (j-1) + m2 = A.unsafeIndex arr (j-2) + j = off + i {-# INLINE reverseIter_ #-} --- | /O(1)/ Return the length of a 'Text' in units of 'Word16'. This +-- | /O(1)/ Return the length of a 'Text' in units of 'Word8'. This -- is useful for sizing a target array appropriately before using -- 'unsafeCopyToPtr'. -lengthWord16 :: Text -> Int -lengthWord16 (Text _arr _off len) = len -{-# INLINE lengthWord16 #-} +lengthWord8 :: Text -> Int +lengthWord8 (Text _arr _off len) = len +{-# INLINE lengthWord8 #-} --- | /O(1)/ Unchecked take of 'k' 'Word16's from the front of a 'Text'. -takeWord16 :: Int -> Text -> Text -takeWord16 k (Text arr off _len) = Text arr off k -{-# INLINE takeWord16 #-} +-- | /O(1)/ Unchecked take of 'k' 'Word8's from the front of a 'Text'. +takeWord8 :: Int -> Text -> Text +takeWord8 k (Text arr off _len) = Text arr off k +{-# INLINE takeWord8 #-} --- | /O(1)/ Unchecked drop of 'k' 'Word16's from the front of a 'Text'. -dropWord16 :: Int -> Text -> Text -dropWord16 k (Text arr off len) = Text arr (off+k) (len-k) -{-# INLINE dropWord16 #-} +-- | /O(1)/ Unchecked drop of 'k' 'Word8's from the front of a 'Text'. +dropWord8 :: Int -> Text -> Text +dropWord8 k (Text arr off len) = Text arr (off+k) (len-k) +{-# INLINE dropWord8 #-} diff --git a/tests/Tests/Properties/LowLevel.hs b/tests/Tests/Properties/LowLevel.hs index 1972fd36..79aecd05 100644 --- a/tests/Tests/Properties/LowLevel.hs +++ b/tests/Tests/Properties/LowLevel.hs @@ -10,7 +10,7 @@ import Control.Exception as E (SomeException, catch, evaluate) import Data.Int (Int32, Int64) import Data.Text.Foreign import Data.Text.Internal (mul, mul32, mul64) -import Data.Word (Word16, Word32) +import Data.Word (Word8, Word16, Word32) import System.IO.Unsafe (unsafePerformIO) import Test.Tasty (TestTree, testGroup) import Test.Tasty.QuickCheck (testProperty) @@ -46,9 +46,9 @@ t_mul a b = mulRef a b === eval mul a b -- Misc. -t_dropWord16 m t = dropWord16 m t `T.isSuffixOf` t -t_takeWord16 m t = takeWord16 m t `T.isPrefixOf` t -t_take_drop_16 (Small n) t = T.append (takeWord16 n t) (dropWord16 n t) === t +t_dropWord8 m t = dropWord8 m t `T.isSuffixOf` t +t_takeWord8 m t = takeWord8 m t `T.isPrefixOf` t +t_take_drop_8 (Small n) t = T.append (takeWord8 n t) (dropWord8 n t) === t t_use_from t = ioProperty $ (==t) <$> useAsPtr t fromPtr t_copy t = T.copy t === t @@ -80,9 +80,9 @@ testLowLevel = ], testGroup "misc" [ - testProperty "t_dropWord16" t_dropWord16, - testProperty "t_takeWord16" t_takeWord16, - testProperty "t_take_drop_16" t_take_drop_16, + testProperty "t_dropWord8" t_dropWord8, + testProperty "t_takeWord8" t_takeWord8, + testProperty "t_take_drop_8" t_take_drop_8, testProperty "t_use_from" t_use_from, testProperty "t_copy" t_copy ], diff --git a/tests/Tests/QuickCheckUtils.hs b/tests/Tests/QuickCheckUtils.hs index e6e1a60d..94790fa0 100644 --- a/tests/Tests/QuickCheckUtils.hs +++ b/tests/Tests/QuickCheckUtils.hs @@ -32,7 +32,7 @@ import Control.Arrow ((***)) import Control.DeepSeq (NFData (..), deepseq) import Control.Exception (bracket) import Data.Char (isSpace) -import Data.Text.Foreign (I16) +import Data.Text.Foreign (I8) import Data.Text.Lazy.Builder.RealFloat (FPFormat(..)) import Data.Word (Word8, Word16) import Test.QuickCheck hiding (Fixed(..), Small (..), (.&.)) @@ -51,7 +51,7 @@ import qualified System.IO as IO genWord8 :: Gen Word8 genWord8 = chooseAny -instance Arbitrary I16 where +instance Arbitrary I8 where arbitrary = arbitrarySizedIntegral shrink = shrinkIntegral diff --git a/tests/Tests/Regressions.hs b/tests/Tests/Regressions.hs index 157d0e89..90de08bf 100644 --- a/tests/Tests/Regressions.hs +++ b/tests/Tests/Regressions.hs @@ -80,10 +80,10 @@ mapAccumL_resize = do let f a _ = (a, '\65536') count = 5 val = T.mapAccumL f (0::Int) (T.replicate count "a") - assertEqual "mapAccumL should correctly fill buffers for two-word results" + assertEqual "mapAccumL should correctly fill buffers for four-byte results" (0, T.replicate count "\65536") val - assertEqual "mapAccumL should correctly size buffers for two-word results" - (count * 2) (T.lengthWord16 (snd val)) + assertEqual "mapAccumL should correctly size buffers for four-byte results" + (count * 4) (T.lengthWord8 (snd val)) -- See GitHub #197 t197 :: IO () diff --git a/text.cabal b/text.cabal index 2535129c..ccf86a6d 100644 --- a/text.cabal +++ b/text.cabal @@ -8,7 +8,7 @@ synopsis: An efficient packed Unicode text type. description: . An efficient packed, immutable Unicode text type (both strict and - lazy), with a powerful loop fusion optimization framework. + lazy). . The 'Text' type represents Unicode character strings, in a time and space-efficient manner. This package provides text processing @@ -37,23 +37,12 @@ description: the [text-icu package](https://hackage.haskell.org/package/text-icu) based on the well-respected and liberally licensed [ICU library](http://site.icu-project.org/). - . - == Internal Representation: UTF-16 vs. UTF-8 - . - Currently the @text@ library uses UTF-16 as its internal representation - which is [neither a fixed-width nor always the most dense representation](http://utf8everywhere.org/) - for Unicode text. We're currently investigating the feasibility - of [changing Text's internal representation to UTF-8](https://github.com/text-utf8) - and if you need such a 'Text' type right now you might be interested in using the spin-off - packages <https://hackage.haskell.org/package/text-utf8 text-utf8> and - <https://hackage.haskell.org/package/text-short text-short>. - license: BSD2 license-file: LICENSE author: Bryan O'Sullivan <bos@serpentine.com> maintainer: Haskell Text Team <andrew.lelechenko@gmail.com>, Core Libraries Committee -copyright: 2009-2011 Bryan O'Sullivan, 2008-2009 Tom Harper +copyright: 2009-2011 Bryan O'Sullivan, 2008-2009 Tom Harper, 2021 Andrew Lelechenko category: Data, Text build-type: Simple tested-with: GHC==9.0.1, @@ -76,6 +65,7 @@ flag developer library c-sources: cbits/cbits.c + cbits/utils.c include-dirs: include hs-source-dirs: src -- GitLab