Commit 4b3667e0 authored by Bodigrim's avatar Bodigrim
Browse files

Switch internal representation to UTF-8

parent d615c996
......@@ -29,4 +29,4 @@ based on the stream fusion framework developed by Roman Leshchinskiy,
Duncan Coutts, and Don Stewart.
The core library was fleshed out, debugged, and tested by Bryan
O'Sullivan <bos@serpentine.com>, and he is the current maintainer.
O'Sullivan. Transition from UTF-16 to UTF-8 is by Andrew Lelechenko.
......@@ -16,12 +16,6 @@
#include "text_cbits.h"
int _hs_text_memcmp(const void *a, size_t aoff, const void *b, size_t boff,
size_t n)
{
return memcmp(a + (aoff<<1), b + (boff<<1), n<<1);
}
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12
......@@ -61,60 +55,24 @@ decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
return *state = utf8d[256 + *state + type];
}
/*
* The ISO 8859-1 (aka latin-1) code points correspond exactly to the first 256 unicode
* code-points, therefore we can trivially convert from a latin-1 encoded bytestring to
* an UTF16 array
*/
void
_hs_text_decode_latin1(uint16_t *dest, const uint8_t *src,
size_t
_hs_text_decode_latin1(uint8_t *dest, const uint8_t *src,
const uint8_t *srcend)
{
const uint8_t *dest0 = dest;
const uint8_t *p = src;
#if defined(__i386__) || defined(__x86_64__)
/* This optimization works on a little-endian systems by using
(aligned) 32-bit loads instead of 8-bit loads
*/
/* consume unaligned prefix */
while (p != srcend && (uintptr_t)p & 0x3)
*dest++ = *p++;
#if defined(__x86_64__)
/* All the intrinsics used here are from SSE2,
* so every x86_64 CPU supports them.
*/
const __m128i zeros = _mm_set1_epi32(0);
while (p < srcend - 7) {
/* Load 8 bytes of ASCII data */
const __m128i ascii = _mm_cvtsi64_si128(*((const uint64_t *)p));
/* Interleave with zeros */
const __m128i utf16 = _mm_unpacklo_epi8(ascii, zeros);
/* Store the resulting 16 bytes into destination */
_mm_storeu_si128((__m128i *)dest, utf16);
dest += 8;
p += 8;
}
#else
/* iterate over 32-bit aligned loads */
while (p < srcend - 3) {
const uint32_t w = *((const uint32_t *)p);
*dest++ = w & 0xff;
*dest++ = (w >> 8) & 0xff;
*dest++ = (w >> 16) & 0xff;
*dest++ = (w >> 24) & 0xff;
p += 4;
while (p != srcend){
uint8_t codepoint = *p++;
if(codepoint < 0x80){
*dest++ = (uint8_t)codepoint;
} else {
*dest++ = (uint8_t) (0xC0 + (codepoint >> 6));
*dest++ = (uint8_t) (0x80 + (codepoint & 0x3F));
}
}
#endif
#endif
/* handle unaligned suffix */
while (p != srcend)
*dest++ = *p++;
return (dest - dest0);
}
/*
......@@ -146,82 +104,45 @@ _hs_text_decode_latin1(uint16_t *dest, const uint8_t *src,
*/
#if defined(__GNUC__) || defined(__clang__)
static inline uint8_t const *
_hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
_hs_text_decode_utf8_int(uint8_t *const dest, size_t *destoff,
const uint8_t **src, const uint8_t *srcend,
uint32_t *codepoint0, uint32_t *state0)
__attribute((always_inline));
#endif
static inline uint8_t const *
_hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
_hs_text_decode_utf8_int(uint8_t *const dest, size_t *destoff,
const uint8_t **src, const uint8_t *srcend,
uint32_t *codepoint0, uint32_t *state0)
{
uint16_t *d = dest + *destoff;
uint8_t *d = dest + *destoff;
const uint8_t *s = *src, *last = *src;
uint32_t state = *state0;
uint32_t codepoint = *codepoint0;
while (s < srcend) {
#if defined(__i386__) || defined(__x86_64__)
/*
* This code will only work on a little-endian system that
* supports unaligned loads.
*
* It gives a substantial speed win on data that is purely or
* partly ASCII (e.g. HTML), at only a slight cost on purely
* non-ASCII text.
*/
if (state == UTF8_ACCEPT) {
#if defined(__x86_64__)
const __m128i zeros = _mm_set1_epi32(0);
while (s < srcend - 8) {
const uint64_t hopefully_eight_ascii_chars = *((uint64_t *) s);
if ((hopefully_eight_ascii_chars & 0x8080808080808080LL) != 0LL)
break;
s += 8;
/* Load 8 bytes of ASCII data */
const __m128i eight_ascii_chars = _mm_cvtsi64_si128(hopefully_eight_ascii_chars);
/* Interleave with zeros */
const __m128i eight_utf16_chars = _mm_unpacklo_epi8(eight_ascii_chars, zeros);
/* Store the resulting 16 bytes into destination */
_mm_storeu_si128((__m128i *)d, eight_utf16_chars);
d += 8;
}
#else
while (s < srcend - 4) {
codepoint = *((uint32_t *) s);
if ((codepoint & 0x80808080) != 0)
break;
s += 4;
/*
* Tried 32-bit stores here, but the extra bit-twiddling
* slowed the code down.
*/
*d++ = (uint16_t) (codepoint & 0xff);
*d++ = (uint16_t) ((codepoint >> 8) & 0xff);
*d++ = (uint16_t) ((codepoint >> 16) & 0xff);
*d++ = (uint16_t) ((codepoint >> 24) & 0xff);
}
#endif
last = s;
} /* end if (state == UTF8_ACCEPT) */
#endif
if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
if (state != UTF8_REJECT)
continue;
continue;
break;
}
if (codepoint <= 0xffff)
*d++ = (uint16_t) codepoint;
else {
*d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
*d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
if(codepoint < 0x80){
*d++ = (uint8_t) codepoint;
} else if(codepoint < 0x800){
*d++ = (uint8_t) (0xC0 + (codepoint >> 6));
*d++ = (uint8_t) (0x80 + (codepoint & 0x3F));
} else if(codepoint < 0x10000){
*d++ = (uint8_t) (0xE0 + (codepoint >> 12));
*d++ = (uint8_t) (0x80 + ((codepoint >> 6) & 0x3F));
*d++ = (uint8_t) (0x80 + (codepoint & 0x3F));
} else {
*d++ = (uint8_t) (0xF0 + (codepoint >> 18));
*d++ = (uint8_t) (0x80 + ((codepoint >> 12) & 0x3F));
*d++ = (uint8_t) (0x80 + ((codepoint >> 6) & 0x3F));
*d++ = (uint8_t) (0x80 + (codepoint & 0x3F));
}
last = s;
}
......@@ -234,7 +155,7 @@ _hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
}
uint8_t const *
_hs_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
_hs_text_decode_utf8_state(uint8_t *const dest, size_t *destoff,
const uint8_t **src,
const uint8_t *srcend,
uint32_t *codepoint0, uint32_t *state0)
......@@ -248,7 +169,7 @@ _hs_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
* Helper to decode buffer and discard final decoder state
*/
const uint8_t *
_hs_text_decode_utf8(uint16_t *const dest, size_t *destoff,
_hs_text_decode_utf8(uint8_t *const dest, size_t *destoff,
const uint8_t *src, const uint8_t *const srcend)
{
uint32_t codepoint;
......@@ -257,90 +178,3 @@ _hs_text_decode_utf8(uint16_t *const dest, size_t *destoff,
&codepoint, &state);
return src;
}
void
_hs_text_encode_utf8(uint8_t **destp, const uint16_t *src, size_t srcoff,
size_t srclen)
{
const uint16_t *srcend;
uint8_t *dest = *destp;
src += srcoff;
srcend = src + srclen;
ascii:
#if defined(__x86_64__)
while (srcend - src >= 8) {
union { uint64_t halves[2]; __m128i whole; } eight_chars;
eight_chars.whole = _mm_loadu_si128((__m128i *) src);
const uint64_t w = eight_chars.halves[0];
if (w & 0xFF80FF80FF80FF80ULL) {
if (!(w & 0x000000000000FF80ULL)) {
*dest++ = w & 0xFFFF;
src++;
if (!(w & 0x00000000FF800000ULL)) {
*dest++ = (w >> 16) & 0xFFFF;
src++;
if (!(w & 0x0000FF8000000000ULL)) {
*dest++ = (w >> 32) & 0xFFFF;
src++;
}
}
}
break;
}
if (eight_chars.halves[1] & 0xFF80FF80FF80FF80ULL) {
break;
}
const __m128i eight_ascii_chars = _mm_packus_epi16(eight_chars.whole, eight_chars.whole);
_mm_storel_epi64((__m128i *)dest, eight_ascii_chars);
dest += 8;
src += 8;
}
#endif
#if defined(__i386__)
while (srcend - src >= 2) {
uint32_t w = *((uint32_t *) src);
if (w & 0xFF80FF80)
break;
*dest++ = w & 0xFFFF;
*dest++ = w >> 16;
src += 2;
}
#endif
while (src < srcend) {
uint16_t w = *src++;
if (w <= 0x7F) {
*dest++ = w;
/* An ASCII byte is likely to begin a run of ASCII bytes.
Falling back into the fast path really helps performance. */
goto ascii;
}
else if (w <= 0x7FF) {
*dest++ = (w >> 6) | 0xC0;
*dest++ = (w & 0x3f) | 0x80;
}
else if (w < 0xD800 || w > 0xDBFF) {
*dest++ = (w >> 12) | 0xE0;
*dest++ = ((w >> 6) & 0x3F) | 0x80;
*dest++ = (w & 0x3F) | 0x80;
} else {
uint32_t c = ((((uint32_t) w) - 0xD800) << 10) +
(((uint32_t) *src++) - 0xDC00) + 0x10000;
*dest++ = (c >> 18) | 0xF0;
*dest++ = ((c >> 12) & 0x3F) | 0x80;
*dest++ = ((c >> 6) & 0x3F) | 0x80;
*dest++ = (c & 0x3F) | 0x80;
}
}
*destp = dest;
}
/*
* Copyright (c) 2021 Andrew Lelechenko <andrew.lelechenko@gmail.com>
*/
#include <stdio.h>
#include <string.h>
int _hs_text_memcmp(const void *arr1, size_t off1, const void *arr2, size_t off2, size_t len)
{
return memcmp(arr1 + off1, arr2 + off2, len);
}
......@@ -9,6 +9,7 @@
-- Copyright : (c) 2009, 2010, 2011, 2012 Bryan O'Sullivan,
-- (c) 2009 Duncan Coutts,
-- (c) 2008, 2009 Tom Harper
-- (c) 2021 Andrew Lelechenko
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
......@@ -227,10 +228,8 @@ import Data.Text.Internal.Private (span_)
import Data.Text.Internal (Text(..), empty, firstf, mul, safe, text)
import Data.Text.Show (singleton, unpack, unpackCString#)
import qualified Prelude as P
import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord16, reverseIter,
import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord8, reverseIter,
reverseIter_, unsafeHead, unsafeTail)
import Data.Text.Internal.Unsafe.Char (unsafeChr)
import qualified Data.Text.Internal.Encoding.Utf16 as U16
import Data.Text.Internal.Search (indices)
#if defined(__HADDOCK__)
import Data.ByteString (ByteString)
......@@ -291,7 +290,8 @@ import Text.Printf (PrintfArg, formatArg, formatString)
-- points
-- (<http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#page=13 §3.4, definition D10 >)
-- as 'Char' values, including code points from this invalid range.
-- This means that there are some 'Char' values that are not valid
-- This means that there are some 'Char' values
-- (corresponding to 'Data.Char.Surrogate' category) that are not valid
-- Unicode scalar values, and the functions in this module must handle
-- those cases.
--
......@@ -300,12 +300,7 @@ import Text.Printf (PrintfArg, formatArg, formatString)
-- that are not valid Unicode scalar values with the replacement
-- character \"&#xfffd;\" (U+FFFD). Functions that perform this
-- inspection and replacement are documented with the phrase
-- \"Performs replacement on invalid scalar values\".
--
-- (One reason for this policy of replacement is that internally, a
-- 'Text' value is represented as packed UTF-16 data. Values in the
-- range U+D800 through U+DFFF are used by UTF-16 to denote surrogate
-- code points, and so cannot be represented. The functions replace
-- \"Performs replacement on invalid scalar values\". The functions replace
-- invalid scalar values, instead of dropping them, as a security
-- measure. For details, see
-- <http://unicode.org/reports/tr36/#Deletion_of_Noncharacters Unicode Technical Report 36, §3.5 >.)
......@@ -487,12 +482,9 @@ second f (a, b) = (a, f b)
-- | /O(1)/ Returns the last character of a 'Text', which must be
-- non-empty.
last :: Text -> Char
last (Text arr off len)
| len <= 0 = emptyError "last"
| n < 0xDC00 || n > 0xDFFF = unsafeChr n
| otherwise = U16.chr2 n0 n
where n = A.unsafeIndex arr (off+len-1)
n0 = A.unsafeIndex arr (off+len-2)
last t@(Text _ _ len)
| len <= 0 = emptyError "last"
| otherwise = let Iter c _ = reverseIter t (len - 1) in c
{-# INLINE [1] last #-}
-- | /O(1)/ Returns all characters after the head of a 'Text', which
......@@ -507,11 +499,9 @@ tail t@(Text arr off len)
-- | /O(1)/ Returns all but the last character of a 'Text', which must
-- be non-empty.
init :: Text -> Text
init (Text arr off len) | len <= 0 = emptyError "init"
| n >= 0xDC00 && n <= 0xDFFF = text arr off (len-2)
| otherwise = text arr off (len-1)
where
n = A.unsafeIndex arr (off+len-1)
init t@(Text arr off len)
| len <= 0 = emptyError "init"
| otherwise = text arr off (len + reverseIter_ t (len - 1))
{-# INLINE [1] init #-}
-- | /O(1)/ Returns all but the last character and the last character of a
......@@ -519,12 +509,11 @@ init (Text arr off len) | len <= 0 = emptyError "init"
--
-- @since 1.2.3.0
unsnoc :: Text -> Maybe (Text, Char)
unsnoc (Text arr off len)
| len <= 0 = Nothing
| n < 0xDC00 || n > 0xDFFF = Just (text arr off (len-1), unsafeChr n)
| otherwise = Just (text arr off (len-2), U16.chr2 n0 n)
where n = A.unsafeIndex arr (off+len-1)
n0 = A.unsafeIndex arr (off+len-2)
unsnoc t@(Text arr off len)
| len <= 0 = Nothing
| otherwise = Just (text arr off (len + d), c)
where
Iter c d = reverseIter t (len - 1)
{-# INLINE [1] unsnoc #-}
-- | /O(1)/ Tests whether a 'Text' is empty or not.
......@@ -911,7 +900,7 @@ concat ts = case ts' of
_ -> Text (A.run go) 0 len
where
ts' = L.filter (not . null) ts
len = sumP "concat" $ L.map lengthWord16 ts'
len = sumP "concat" $ L.map lengthWord8 ts'
go :: ST s (A.MArray s)
go = do
arr <- A.new len
......@@ -1263,7 +1252,7 @@ groupBy p = loop
where Iter c d = iter t 0
n = d + findAIndexOrEnd (not . p c) (Text arr (off+d) (len-d))
-- | Returns the /array/ index (in units of 'Word16') at which a
-- | Returns the /array/ index (in units of 'Word8') at which a
-- character may be found. This is /not/ the same as the logical
-- index returned by e.g. 'findIndex'.
findAIndexOrEnd :: (Char -> Bool) -> Text -> Int
......@@ -1569,9 +1558,10 @@ words t@(Text arr off len) = loop 0 0
| n >= len = if start == n
then []
else [Text arr (start+off) (n-start)]
-- Spaces in UTF-8 can take from 1 byte for 0x09 and up to 3 bytes for 0x3000.
| isSpace c =
if start == n
then loop (start+1) (start+1)
then loop (n+d) (n+d)
else Text arr (start+off) (n-start) : loop (n+d) (n+d)
| otherwise = loop start (n+d)
where Iter c d = iter t n
......
......@@ -44,14 +44,14 @@ module Data.Text.Array
import Control.Exception (assert)
import GHC.Stack (HasCallStack)
#endif
import Data.Bits ((.&.), xor, shiftL, shiftR)
import Data.Bits ((.&.), xor, shiftR)
#if !MIN_VERSION_base(4,11,0)
import Data.Text.Internal.Unsafe (inlinePerformIO)
#endif
import Foreign.C.Types (CInt(..))
#endif
import GHC.Exts hiding (toList)
import GHC.ST (ST(..), runST)
import GHC.Word (Word16(..))
import GHC.Word (Word8(..))
import Prelude hiding (length, read)
-- | Immutable array type.
......@@ -88,7 +88,7 @@ unsafeFreeze MArray{..} = ST $ \s1# ->
-- | Indicate how many bytes would be used for an array of the given
-- size.
bytesInArray :: Int -> Int
bytesInArray n = n `shiftL` 1
bytesInArray n = n
{-# INLINE bytesInArray #-}
-- | Unchecked read of an immutable array. May return garbage or
......@@ -97,15 +97,15 @@ unsafeIndex ::
#if defined(ASSERTS)
HasCallStack =>
#endif
Array -> Int -> Word16
Array -> Int -> Word8
unsafeIndex a@Array{..} i@(I# i#) =
#if defined(ASSERTS)
let word16len = I# (sizeofByteArray# aBA) `quot` 2 in
if i < 0 || i >= word16len
then error ("Data.Text.Array.unsafeIndex: bounds error, offset " ++ show i ++ ", length " ++ show word16len)
let word8len = I# (sizeofByteArray# aBA) in
if i < 0 || i >= word8len
then error ("Data.Text.Array.unsafeIndex: bounds error, offset " ++ show i ++ ", length " ++ show word8len)
else
#endif
case indexWord16Array# aBA i# of r# -> (W16# r#)
case indexWord8Array# aBA i# of r# -> (W8# r#)
{-# INLINE unsafeIndex #-}
#if defined(ASSERTS)
......@@ -130,17 +130,17 @@ unsafeWrite ::
#if defined(ASSERTS)
HasCallStack =>
#endif
MArray s -> Int -> Word16 -> ST s ()
unsafeWrite ma@MArray{..} i@(I# i#) (W16# e#) =
MArray s -> Int -> Word8 -> ST s ()
unsafeWrite ma@MArray{..} i@(I# i#) (W8# e#) =
#if defined(ASSERTS)
checkBoundsM ma (i * 2) 2 >>
checkBoundsM ma i 1 >>
#endif
(ST $ \s1# -> case writeWord16Array# maBA i# e# s1# of
(ST $ \s1# -> case writeWord8Array# maBA i# e# s1# of
s2# -> (# s2#, () #))
{-# INLINE unsafeWrite #-}
-- | Convert an immutable array to a list.
toList :: Array -> Int -> Int -> [Word16]
toList :: Array -> Int -> Int -> [Word8]
toList ary off len = loop 0
where loop i | i < len = unsafeIndex ary (off+i) : loop (i+1)
| otherwise = []
......@@ -176,10 +176,10 @@ copyM dst@(MArray dst#) dstOff@(I# dstOff#) src@(MArray src#) srcOff@(I# srcOff#
#if defined(ASSERTS)
srcLen <- getSizeofMArray src
dstLen <- getSizeofMArray dst
assert (srcOff + count <= srcLen `quot` 2) .
assert (dstOff + count <= dstLen `quot` 2) .
assert (srcOff + count <= srcLen) .
assert (dstOff + count <= dstLen) .
#endif
ST $ \s1# -> case copyMutableByteArray# src# (2# *# srcOff#) dst# (2# *# dstOff#) (2# *# count#) s1# of
ST $ \s1# -> case copyMutableByteArray# src# srcOff# dst# dstOff# count# s1# of
s2# -> (# s2#, () #)
{-# INLINE copyM #-}
......@@ -194,7 +194,7 @@ copyI :: MArray s -- ^ Destination
copyI (MArray dst#) dstOff@(I# dstOff#) (Array src#) (I# srcOff#) top@(I# top#)
| dstOff >= top = return ()
| otherwise = ST $ \s1# ->
case copyByteArray# src# (2# *# srcOff#) dst# (2# *# dstOff#) (2# *# (top# -# dstOff#)) s1# of
case copyByteArray# src# srcOff# dst# dstOff# (top# -# dstOff#) s1# of
s2# -> (# s2#, () #)
{-# INLINE copyI #-}
......@@ -209,7 +209,7 @@ equal :: Array -- ^ First
equal (Array src1#) (I# off1#) (Array src2#) (I# off2#) (I# count#) = i == 0
where
#if MIN_VERSION_base(4,11,0)
i = I# (compareByteArrays# src1# (2# *# off1#) src2# (2# *# off2#) (2# *# count#))
i = I# (compareByteArrays# src1# off1# src2# off2# count#)
#else
i = fromIntegral (inlinePerformIO (memcmp src1# off1# src2# off2# count#))
......
......@@ -7,6 +7,7 @@
-- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan,
-- (c) 2009 Duncan Coutts,
-- (c) 2008, 2009 Tom Harper
-- (c) 2021 Andrew Lelechenko
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
......@@ -63,30 +64,29 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
import Control.Exception (evaluate, try, throwIO, ErrorCall(ErrorCall))
import Control.Monad.ST (runST)
import Data.Bits ((.&.), shiftR)
import Data.ByteString as B
import qualified Data.ByteString.Internal as B
import qualified Data.ByteString.Short.Internal as SBS
import Data.Foldable (traverse_)
import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode)
import Data.Text.Internal (Text(..), safe, text)
import Data.Text.Internal.Private (runText)
import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
import Data.Text.Internal.Unsafe.Char (ord, unsafeWrite)
import Data.Text.Internal.Unsafe.Char (unsafeWrite)
import Data.Text.Show ()
import Data.Text.Unsafe (unsafeDupablePerformIO)
import Data.Word (Word8, Word16, Word32)
import Foreign.C.Types (CSize(CSize))
import Data.Word (Word8, Word32)
import Foreign.C.Types (CSize)
import Foreign.Marshal.Utils (with)
import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
import Foreign.Storable (Storable, peek, poke)
import GHC.Base (ByteArray#, MutableByteArray#)
import GHC.Base (MutableByteArray#)
import qualified Data.ByteString.Builder as B
import qualified Data.ByteString.Builder.Internal as B hiding (empty, append)
import qualified Data.ByteString.Builder.Prim as BP
import qualified Data.ByteString.Builder.Prim.Internal as BP
import Data.Text.Internal.Encoding.Utf8 (utf8LengthByLeader)
import qualified Data.Text.Array as A
import qualified Data.Text.Internal.Encoding.Fusion as E
import qualified Data.Text.Internal.Encoding.Utf16 as U16
import qualified Data.Text.Internal.Fusion as F
import Data.Text.Internal.ByteStringCompat
#if defined(ASSERTS)
......@@ -124,12 +124,12 @@ decodeLatin1 ::
#endif
ByteString -> Text
decodeLatin1 bs = withBS bs aux where
aux fp len = text a 0 len
aux fp len = text a 0 actualLen
where
a = A.run (A.new len >>= unsafeIOToST . go)
go dest = unsafeWithForeignPtr fp $ \ptr -> do
c_decode_latin1 (A.maBA dest) ptr (ptr `plusPtr` len)
return dest
(a, actualLen) = A.run2 (A.new (2 * len) >>= unsafeIOToST . go)
go dest = unsafeWithForeignPtr fp $ \src -> do
destLen <- c_decode_latin1 (A.maBA dest) src (src `plusPtr` len)
return (dest, destLen)
-- | Decode a 'ByteString' containing UTF-8 encoded text.
--
......@@ -161,6 +161,8 @@ decodeUtf8With onErr bs = withBS bs aux
case onErr desc (Just x) of
Nothing -> loop $ curPtr' `plusPtr` 1
Just c
-- TODO This is problematic, because even BMP replacement characters
-- can take longer than one UTF8 code unit (which is byte).
| c > '\xFFFF' -> throwUnsupportedReplChar
| otherwise -> do
destOff <- peek destOffPtr
......@@ -170,43 +172,14 @@ decodeUtf8With onErr bs = withBS bs aux
poke destOffPtr (destOff + intToCSize w)
loop $ curPtr' `plusPtr` 1
loop ptr
(unsafeIOToST . go) =<< A.new len
-- TODO (len * 2 + 100) assumes that invalid input is asymptotically rare.