Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Menu
Open sidebar
Glasgow Haskell Compiler
Packages
text
Commits
4b3667e0
Commit
4b3667e0
authored
May 21, 2021
by
Bodigrim
Browse files
Switch internal representation to UTF-8
parent
d615c996
Changes
28
Hide whitespace changes
Inline
Side-by-side
README.markdown
View file @
4b3667e0
...
...
@@ -29,4 +29,4 @@ based on the stream fusion framework developed by Roman Leshchinskiy,
Duncan Coutts, and Don Stewart.
The core library was fleshed out, debugged, and tested by Bryan
O'Sullivan
<bos@serpentine.com>
, and he is the current maintainer
.
O'Sullivan
. Transition from UTF-16 to UTF-8 is by Andrew Lelechenko
.
cbits/cbits.c
View file @
4b3667e0
...
...
@@ -16,12 +16,6 @@
#include "text_cbits.h"
int
_hs_text_memcmp
(
const
void
*
a
,
size_t
aoff
,
const
void
*
b
,
size_t
boff
,
size_t
n
)
{
return
memcmp
(
a
+
(
aoff
<<
1
),
b
+
(
boff
<<
1
),
n
<<
1
);
}
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12
...
...
@@ -61,60 +55,24 @@ decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
return
*
state
=
utf8d
[
256
+
*
state
+
type
];
}
/*
* The ISO 8859-1 (aka latin-1) code points correspond exactly to the first 256 unicode
* code-points, therefore we can trivially convert from a latin-1 encoded bytestring to
* an UTF16 array
*/
void
_hs_text_decode_latin1
(
uint16_t
*
dest
,
const
uint8_t
*
src
,
size_t
_hs_text_decode_latin1
(
uint8_t
*
dest
,
const
uint8_t
*
src
,
const
uint8_t
*
srcend
)
{
const
uint8_t
*
dest0
=
dest
;
const
uint8_t
*
p
=
src
;
#if defined(__i386__) || defined(__x86_64__)
/* This optimization works on a little-endian systems by using
(aligned) 32-bit loads instead of 8-bit loads
*/
/* consume unaligned prefix */
while
(
p
!=
srcend
&&
(
uintptr_t
)
p
&
0x3
)
*
dest
++
=
*
p
++
;
#if defined(__x86_64__)
/* All the intrinsics used here are from SSE2,
* so every x86_64 CPU supports them.
*/
const
__m128i
zeros
=
_mm_set1_epi32
(
0
);
while
(
p
<
srcend
-
7
)
{
/* Load 8 bytes of ASCII data */
const
__m128i
ascii
=
_mm_cvtsi64_si128
(
*
((
const
uint64_t
*
)
p
));
/* Interleave with zeros */
const
__m128i
utf16
=
_mm_unpacklo_epi8
(
ascii
,
zeros
);
/* Store the resulting 16 bytes into destination */
_mm_storeu_si128
((
__m128i
*
)
dest
,
utf16
);
dest
+=
8
;
p
+=
8
;
}
#else
/* iterate over 32-bit aligned loads */
while
(
p
<
srcend
-
3
)
{
const
uint32_t
w
=
*
((
const
uint32_t
*
)
p
);
*
dest
++
=
w
&
0xff
;
*
dest
++
=
(
w
>>
8
)
&
0xff
;
*
dest
++
=
(
w
>>
16
)
&
0xff
;
*
dest
++
=
(
w
>>
24
)
&
0xff
;
p
+=
4
;
while
(
p
!=
srcend
){
uint8_t
codepoint
=
*
p
++
;
if
(
codepoint
<
0x80
){
*
dest
++
=
(
uint8_t
)
codepoint
;
}
else
{
*
dest
++
=
(
uint8_t
)
(
0xC0
+
(
codepoint
>>
6
));
*
dest
++
=
(
uint8_t
)
(
0x80
+
(
codepoint
&
0x3F
));
}
}
#endif
#endif
/* handle unaligned suffix */
while
(
p
!=
srcend
)
*
dest
++
=
*
p
++
;
return
(
dest
-
dest0
);
}
/*
...
...
@@ -146,82 +104,45 @@ _hs_text_decode_latin1(uint16_t *dest, const uint8_t *src,
*/
#if defined(__GNUC__) || defined(__clang__)
static
inline
uint8_t
const
*
_hs_text_decode_utf8_int
(
uint
16
_t
*
const
dest
,
size_t
*
destoff
,
_hs_text_decode_utf8_int
(
uint
8
_t
*
const
dest
,
size_t
*
destoff
,
const
uint8_t
**
src
,
const
uint8_t
*
srcend
,
uint32_t
*
codepoint0
,
uint32_t
*
state0
)
__attribute
((
always_inline
));
#endif
static
inline
uint8_t
const
*
_hs_text_decode_utf8_int
(
uint
16
_t
*
const
dest
,
size_t
*
destoff
,
_hs_text_decode_utf8_int
(
uint
8
_t
*
const
dest
,
size_t
*
destoff
,
const
uint8_t
**
src
,
const
uint8_t
*
srcend
,
uint32_t
*
codepoint0
,
uint32_t
*
state0
)
{
uint
16
_t
*
d
=
dest
+
*
destoff
;
uint
8
_t
*
d
=
dest
+
*
destoff
;
const
uint8_t
*
s
=
*
src
,
*
last
=
*
src
;
uint32_t
state
=
*
state0
;
uint32_t
codepoint
=
*
codepoint0
;
while
(
s
<
srcend
)
{
#if defined(__i386__) || defined(__x86_64__)
/*
* This code will only work on a little-endian system that
* supports unaligned loads.
*
* It gives a substantial speed win on data that is purely or
* partly ASCII (e.g. HTML), at only a slight cost on purely
* non-ASCII text.
*/
if
(
state
==
UTF8_ACCEPT
)
{
#if defined(__x86_64__)
const
__m128i
zeros
=
_mm_set1_epi32
(
0
);
while
(
s
<
srcend
-
8
)
{
const
uint64_t
hopefully_eight_ascii_chars
=
*
((
uint64_t
*
)
s
);
if
((
hopefully_eight_ascii_chars
&
0x8080808080808080LL
)
!=
0LL
)
break
;
s
+=
8
;
/* Load 8 bytes of ASCII data */
const
__m128i
eight_ascii_chars
=
_mm_cvtsi64_si128
(
hopefully_eight_ascii_chars
);
/* Interleave with zeros */
const
__m128i
eight_utf16_chars
=
_mm_unpacklo_epi8
(
eight_ascii_chars
,
zeros
);
/* Store the resulting 16 bytes into destination */
_mm_storeu_si128
((
__m128i
*
)
d
,
eight_utf16_chars
);
d
+=
8
;
}
#else
while
(
s
<
srcend
-
4
)
{
codepoint
=
*
((
uint32_t
*
)
s
);
if
((
codepoint
&
0x80808080
)
!=
0
)
break
;
s
+=
4
;
/*
* Tried 32-bit stores here, but the extra bit-twiddling
* slowed the code down.
*/
*
d
++
=
(
uint16_t
)
(
codepoint
&
0xff
);
*
d
++
=
(
uint16_t
)
((
codepoint
>>
8
)
&
0xff
);
*
d
++
=
(
uint16_t
)
((
codepoint
>>
16
)
&
0xff
);
*
d
++
=
(
uint16_t
)
((
codepoint
>>
24
)
&
0xff
);
}
#endif
last
=
s
;
}
/* end if (state == UTF8_ACCEPT) */
#endif
if
(
decode
(
&
state
,
&
codepoint
,
*
s
++
)
!=
UTF8_ACCEPT
)
{
if
(
state
!=
UTF8_REJECT
)
continue
;
continue
;
break
;
}
if
(
codepoint
<=
0xffff
)
*
d
++
=
(
uint16_t
)
codepoint
;
else
{
*
d
++
=
(
uint16_t
)
(
0xD7C0
+
(
codepoint
>>
10
));
*
d
++
=
(
uint16_t
)
(
0xDC00
+
(
codepoint
&
0x3FF
));
if
(
codepoint
<
0x80
){
*
d
++
=
(
uint8_t
)
codepoint
;
}
else
if
(
codepoint
<
0x800
){
*
d
++
=
(
uint8_t
)
(
0xC0
+
(
codepoint
>>
6
));
*
d
++
=
(
uint8_t
)
(
0x80
+
(
codepoint
&
0x3F
));
}
else
if
(
codepoint
<
0x10000
){
*
d
++
=
(
uint8_t
)
(
0xE0
+
(
codepoint
>>
12
));
*
d
++
=
(
uint8_t
)
(
0x80
+
((
codepoint
>>
6
)
&
0x3F
));
*
d
++
=
(
uint8_t
)
(
0x80
+
(
codepoint
&
0x3F
));
}
else
{
*
d
++
=
(
uint8_t
)
(
0xF0
+
(
codepoint
>>
18
));
*
d
++
=
(
uint8_t
)
(
0x80
+
((
codepoint
>>
12
)
&
0x3F
));
*
d
++
=
(
uint8_t
)
(
0x80
+
((
codepoint
>>
6
)
&
0x3F
));
*
d
++
=
(
uint8_t
)
(
0x80
+
(
codepoint
&
0x3F
));
}
last
=
s
;
}
...
...
@@ -234,7 +155,7 @@ _hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
}
uint8_t
const
*
_hs_text_decode_utf8_state
(
uint
16
_t
*
const
dest
,
size_t
*
destoff
,
_hs_text_decode_utf8_state
(
uint
8
_t
*
const
dest
,
size_t
*
destoff
,
const
uint8_t
**
src
,
const
uint8_t
*
srcend
,
uint32_t
*
codepoint0
,
uint32_t
*
state0
)
...
...
@@ -248,7 +169,7 @@ _hs_text_decode_utf8_state(uint16_t *const dest, size_t *destoff,
* Helper to decode buffer and discard final decoder state
*/
const
uint8_t
*
_hs_text_decode_utf8
(
uint
16
_t
*
const
dest
,
size_t
*
destoff
,
_hs_text_decode_utf8
(
uint
8
_t
*
const
dest
,
size_t
*
destoff
,
const
uint8_t
*
src
,
const
uint8_t
*
const
srcend
)
{
uint32_t
codepoint
;
...
...
@@ -257,90 +178,3 @@ _hs_text_decode_utf8(uint16_t *const dest, size_t *destoff,
&
codepoint
,
&
state
);
return
src
;
}
void
_hs_text_encode_utf8
(
uint8_t
**
destp
,
const
uint16_t
*
src
,
size_t
srcoff
,
size_t
srclen
)
{
const
uint16_t
*
srcend
;
uint8_t
*
dest
=
*
destp
;
src
+=
srcoff
;
srcend
=
src
+
srclen
;
ascii:
#if defined(__x86_64__)
while
(
srcend
-
src
>=
8
)
{
union
{
uint64_t
halves
[
2
];
__m128i
whole
;
}
eight_chars
;
eight_chars
.
whole
=
_mm_loadu_si128
((
__m128i
*
)
src
);
const
uint64_t
w
=
eight_chars
.
halves
[
0
];
if
(
w
&
0xFF80FF80FF80FF80ULL
)
{
if
(
!
(
w
&
0x000000000000FF80ULL
))
{
*
dest
++
=
w
&
0xFFFF
;
src
++
;
if
(
!
(
w
&
0x00000000FF800000ULL
))
{
*
dest
++
=
(
w
>>
16
)
&
0xFFFF
;
src
++
;
if
(
!
(
w
&
0x0000FF8000000000ULL
))
{
*
dest
++
=
(
w
>>
32
)
&
0xFFFF
;
src
++
;
}
}
}
break
;
}
if
(
eight_chars
.
halves
[
1
]
&
0xFF80FF80FF80FF80ULL
)
{
break
;
}
const
__m128i
eight_ascii_chars
=
_mm_packus_epi16
(
eight_chars
.
whole
,
eight_chars
.
whole
);
_mm_storel_epi64
((
__m128i
*
)
dest
,
eight_ascii_chars
);
dest
+=
8
;
src
+=
8
;
}
#endif
#if defined(__i386__)
while
(
srcend
-
src
>=
2
)
{
uint32_t
w
=
*
((
uint32_t
*
)
src
);
if
(
w
&
0xFF80FF80
)
break
;
*
dest
++
=
w
&
0xFFFF
;
*
dest
++
=
w
>>
16
;
src
+=
2
;
}
#endif
while
(
src
<
srcend
)
{
uint16_t
w
=
*
src
++
;
if
(
w
<=
0x7F
)
{
*
dest
++
=
w
;
/* An ASCII byte is likely to begin a run of ASCII bytes.
Falling back into the fast path really helps performance. */
goto
ascii
;
}
else
if
(
w
<=
0x7FF
)
{
*
dest
++
=
(
w
>>
6
)
|
0xC0
;
*
dest
++
=
(
w
&
0x3f
)
|
0x80
;
}
else
if
(
w
<
0xD800
||
w
>
0xDBFF
)
{
*
dest
++
=
(
w
>>
12
)
|
0xE0
;
*
dest
++
=
((
w
>>
6
)
&
0x3F
)
|
0x80
;
*
dest
++
=
(
w
&
0x3F
)
|
0x80
;
}
else
{
uint32_t
c
=
((((
uint32_t
)
w
)
-
0xD800
)
<<
10
)
+
(((
uint32_t
)
*
src
++
)
-
0xDC00
)
+
0x10000
;
*
dest
++
=
(
c
>>
18
)
|
0xF0
;
*
dest
++
=
((
c
>>
12
)
&
0x3F
)
|
0x80
;
*
dest
++
=
((
c
>>
6
)
&
0x3F
)
|
0x80
;
*
dest
++
=
(
c
&
0x3F
)
|
0x80
;
}
}
*
destp
=
dest
;
}
cbits/utils.c
0 → 100644
View file @
4b3667e0
/*
* Copyright (c) 2021 Andrew Lelechenko <andrew.lelechenko@gmail.com>
*/
#include <stdio.h>
#include <string.h>
int
_hs_text_memcmp
(
const
void
*
arr1
,
size_t
off1
,
const
void
*
arr2
,
size_t
off2
,
size_t
len
)
{
return
memcmp
(
arr1
+
off1
,
arr2
+
off2
,
len
);
}
src/Data/Text.hs
View file @
4b3667e0
...
...
@@ -9,6 +9,7 @@
-- Copyright : (c) 2009, 2010, 2011, 2012 Bryan O'Sullivan,
-- (c) 2009 Duncan Coutts,
-- (c) 2008, 2009 Tom Harper
-- (c) 2021 Andrew Lelechenko
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
...
...
@@ -227,10 +228,8 @@ import Data.Text.Internal.Private (span_)
import
Data.Text.Internal
(
Text
(
..
),
empty
,
firstf
,
mul
,
safe
,
text
)
import
Data.Text.Show
(
singleton
,
unpack
,
unpackCString
#
)
import
qualified
Prelude
as
P
import
Data.Text.Unsafe
(
Iter
(
..
),
iter
,
iter_
,
lengthWord
16
,
reverseIter
,
import
Data.Text.Unsafe
(
Iter
(
..
),
iter
,
iter_
,
lengthWord
8
,
reverseIter
,
reverseIter_
,
unsafeHead
,
unsafeTail
)
import
Data.Text.Internal.Unsafe.Char
(
unsafeChr
)
import
qualified
Data.Text.Internal.Encoding.Utf16
as
U16
import
Data.Text.Internal.Search
(
indices
)
#
if
defined
(
__HADDOCK__
)
import
Data.ByteString
(
ByteString
)
...
...
@@ -291,7 +290,8 @@ import Text.Printf (PrintfArg, formatArg, formatString)
-- points
-- (<http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#page=13 §3.4, definition D10 >)
-- as 'Char' values, including code points from this invalid range.
-- This means that there are some 'Char' values that are not valid
-- This means that there are some 'Char' values
-- (corresponding to 'Data.Char.Surrogate' category) that are not valid
-- Unicode scalar values, and the functions in this module must handle
-- those cases.
--
...
...
@@ -300,12 +300,7 @@ import Text.Printf (PrintfArg, formatArg, formatString)
-- that are not valid Unicode scalar values with the replacement
-- character \"�\" (U+FFFD). Functions that perform this
-- inspection and replacement are documented with the phrase
-- \"Performs replacement on invalid scalar values\".
--
-- (One reason for this policy of replacement is that internally, a
-- 'Text' value is represented as packed UTF-16 data. Values in the
-- range U+D800 through U+DFFF are used by UTF-16 to denote surrogate
-- code points, and so cannot be represented. The functions replace
-- \"Performs replacement on invalid scalar values\". The functions replace
-- invalid scalar values, instead of dropping them, as a security
-- measure. For details, see
-- <http://unicode.org/reports/tr36/#Deletion_of_Noncharacters Unicode Technical Report 36, §3.5 >.)
...
...
@@ -487,12 +482,9 @@ second f (a, b) = (a, f b)
-- | /O(1)/ Returns the last character of a 'Text', which must be
-- non-empty.
last
::
Text
->
Char
last
(
Text
arr
off
len
)
|
len
<=
0
=
emptyError
"last"
|
n
<
0xDC00
||
n
>
0xDFFF
=
unsafeChr
n
|
otherwise
=
U16
.
chr2
n0
n
where
n
=
A
.
unsafeIndex
arr
(
off
+
len
-
1
)
n0
=
A
.
unsafeIndex
arr
(
off
+
len
-
2
)
last
t
@
(
Text
_
_
len
)
|
len
<=
0
=
emptyError
"last"
|
otherwise
=
let
Iter
c
_
=
reverseIter
t
(
len
-
1
)
in
c
{-# INLINE [1] last #-}
-- | /O(1)/ Returns all characters after the head of a 'Text', which
...
...
@@ -507,11 +499,9 @@ tail t@(Text arr off len)
-- | /O(1)/ Returns all but the last character of a 'Text', which must
-- be non-empty.
init
::
Text
->
Text
init
(
Text
arr
off
len
)
|
len
<=
0
=
emptyError
"init"
|
n
>=
0xDC00
&&
n
<=
0xDFFF
=
text
arr
off
(
len
-
2
)
|
otherwise
=
text
arr
off
(
len
-
1
)
where
n
=
A
.
unsafeIndex
arr
(
off
+
len
-
1
)
init
t
@
(
Text
arr
off
len
)
|
len
<=
0
=
emptyError
"init"
|
otherwise
=
text
arr
off
(
len
+
reverseIter_
t
(
len
-
1
))
{-# INLINE [1] init #-}
-- | /O(1)/ Returns all but the last character and the last character of a
...
...
@@ -519,12 +509,11 @@ init (Text arr off len) | len <= 0 = emptyError "init"
--
-- @since 1.2.3.0
unsnoc
::
Text
->
Maybe
(
Text
,
Char
)
unsnoc
(
Text
arr
off
len
)
|
len
<=
0
=
Nothing
|
n
<
0xDC00
||
n
>
0xDFFF
=
Just
(
text
arr
off
(
len
-
1
),
unsafeChr
n
)
|
otherwise
=
Just
(
text
arr
off
(
len
-
2
),
U16
.
chr2
n0
n
)
where
n
=
A
.
unsafeIndex
arr
(
off
+
len
-
1
)
n0
=
A
.
unsafeIndex
arr
(
off
+
len
-
2
)
unsnoc
t
@
(
Text
arr
off
len
)
|
len
<=
0
=
Nothing
|
otherwise
=
Just
(
text
arr
off
(
len
+
d
),
c
)
where
Iter
c
d
=
reverseIter
t
(
len
-
1
)
{-# INLINE [1] unsnoc #-}
-- | /O(1)/ Tests whether a 'Text' is empty or not.
...
...
@@ -911,7 +900,7 @@ concat ts = case ts' of
_
->
Text
(
A
.
run
go
)
0
len
where
ts'
=
L
.
filter
(
not
.
null
)
ts
len
=
sumP
"concat"
$
L
.
map
lengthWord
16
ts'
len
=
sumP
"concat"
$
L
.
map
lengthWord
8
ts'
go
::
ST
s
(
A
.
MArray
s
)
go
=
do
arr
<-
A
.
new
len
...
...
@@ -1263,7 +1252,7 @@ groupBy p = loop
where
Iter
c
d
=
iter
t
0
n
=
d
+
findAIndexOrEnd
(
not
.
p
c
)
(
Text
arr
(
off
+
d
)
(
len
-
d
))
-- | Returns the /array/ index (in units of 'Word
16
') at which a
-- | Returns the /array/ index (in units of 'Word
8
') at which a
-- character may be found. This is /not/ the same as the logical
-- index returned by e.g. 'findIndex'.
findAIndexOrEnd
::
(
Char
->
Bool
)
->
Text
->
Int
...
...
@@ -1569,9 +1558,10 @@ words t@(Text arr off len) = loop 0 0
|
n
>=
len
=
if
start
==
n
then
[]
else
[
Text
arr
(
start
+
off
)
(
n
-
start
)]
-- Spaces in UTF-8 can take from 1 byte for 0x09 and up to 3 bytes for 0x3000.
|
isSpace
c
=
if
start
==
n
then
loop
(
start
+
1
)
(
start
+
1
)
then
loop
(
n
+
d
)
(
n
+
d
)
else
Text
arr
(
start
+
off
)
(
n
-
start
)
:
loop
(
n
+
d
)
(
n
+
d
)
|
otherwise
=
loop
start
(
n
+
d
)
where
Iter
c
d
=
iter
t
n
...
...
src/Data/Text/Array.hs
View file @
4b3667e0
...
...
@@ -44,14 +44,14 @@ module Data.Text.Array
import
Control.Exception
(
assert
)
import
GHC.Stack
(
HasCallStack
)
#
endif
import
Data.Bits
((
.&.
),
xor
,
shiftL
,
shiftR
)
import
Data.Bits
((
.&.
),
xor
,
shiftR
)
#
if
!
MIN_VERSION_base
(
4
,
11
,
0
)
import
Data.Text.Internal.Unsafe
(
inlinePerformIO
)
#
endif
import
Foreign.C.Types
(
CInt
(
..
))
#
endif
import
GHC.Exts
hiding
(
toList
)
import
GHC.ST
(
ST
(
..
),
runST
)
import
GHC.Word
(
Word
16
(
..
))
import
GHC.Word
(
Word
8
(
..
))
import
Prelude
hiding
(
length
,
read
)
-- | Immutable array type.
...
...
@@ -88,7 +88,7 @@ unsafeFreeze MArray{..} = ST $ \s1# ->
-- | Indicate how many bytes would be used for an array of the given
-- size.
bytesInArray
::
Int
->
Int
bytesInArray
n
=
n
`
shiftL
`
1
bytesInArray
n
=
n
{-# INLINE bytesInArray #-}
-- | Unchecked read of an immutable array. May return garbage or
...
...
@@ -97,15 +97,15 @@ unsafeIndex ::
#
if
defined
(
ASSERTS
)
HasCallStack
=>
#
endif
Array
->
Int
->
Word
16
Array
->
Int
->
Word
8
unsafeIndex
a
@
Array
{
..
}
i
@
(
I
#
i
#
)
=
#
if
defined
(
ASSERTS
)
let
word
16
len
=
I
#
(
sizeofByteArray
#
aBA
)
`
quot
`
2
in
if
i
<
0
||
i
>=
word
16
len
then
error
(
"Data.Text.Array.unsafeIndex: bounds error, offset "
++
show
i
++
", length "
++
show
word
16
len
)
let
word
8
len
=
I
#
(
sizeofByteArray
#
aBA
)
in
if
i
<
0
||
i
>=
word
8
len
then
error
(
"Data.Text.Array.unsafeIndex: bounds error, offset "
++
show
i
++
", length "
++
show
word
8
len
)
else
#
endif
case
indexWord
16
Array
#
aBA
i
#
of
r
#
->
(
W
16
#
r
#
)
case
indexWord
8
Array
#
aBA
i
#
of
r
#
->
(
W
8
#
r
#
)
{-# INLINE unsafeIndex #-}
#
if
defined
(
ASSERTS
)
...
...
@@ -130,17 +130,17 @@ unsafeWrite ::
#
if
defined
(
ASSERTS
)
HasCallStack
=>
#
endif
MArray
s
->
Int
->
Word
16
->
ST
s
()
unsafeWrite
ma
@
MArray
{
..
}
i
@
(
I
#
i
#
)
(
W
16
#
e
#
)
=
MArray
s
->
Int
->
Word
8
->
ST
s
()
unsafeWrite
ma
@
MArray
{
..
}
i
@
(
I
#
i
#
)
(
W
8
#
e
#
)
=
#
if
defined
(
ASSERTS
)
checkBoundsM
ma
(
i
*
2
)
2
>>
checkBoundsM
ma
i
1
>>
#
endif
(
ST
$
\
s1
#
->
case
writeWord
16
Array
#
maBA
i
#
e
#
s1
#
of
(
ST
$
\
s1
#
->
case
writeWord
8
Array
#
maBA
i
#
e
#
s1
#
of
s2
#
->
(
#
s2
#
,
()
#
))
{-# INLINE unsafeWrite #-}
-- | Convert an immutable array to a list.
toList
::
Array
->
Int
->
Int
->
[
Word
16
]
toList
::
Array
->
Int
->
Int
->
[
Word
8
]
toList
ary
off
len
=
loop
0
where
loop
i
|
i
<
len
=
unsafeIndex
ary
(
off
+
i
)
:
loop
(
i
+
1
)
|
otherwise
=
[]
...
...
@@ -176,10 +176,10 @@ copyM dst@(MArray dst#) dstOff@(I# dstOff#) src@(MArray src#) srcOff@(I# srcOff#
#
if
defined
(
ASSERTS
)
srcLen
<-
getSizeofMArray
src
dstLen
<-
getSizeofMArray
dst
assert
(
srcOff
+
count
<=
srcLen
`
quot
`
2
)
.
assert
(
dstOff
+
count
<=
dstLen
`
quot
`
2
)
.
assert
(
srcOff
+
count
<=
srcLen
)
.
assert
(
dstOff
+
count
<=
dstLen
)
.
#
endif
ST
$
\
s1
#
->
case
copyMutableByteArray
#
src
#
(
2
#
*#
srcOff
#
)
dst
#
(
2
#
*#
dstOff
#
)
(
2
#
*
#
count
#
)
s1
#
of
ST
$
\
s1
#
->
case
copyMutableByteArray
#
src
#
srcOff
#
dst
#
dstOff
#
count
#
s1
#
of
s2
#
->
(
#
s2
#
,
()
#
)
{-# INLINE copyM #-}
...
...
@@ -194,7 +194,7 @@ copyI :: MArray s -- ^ Destination
copyI
(
MArray
dst
#
)
dstOff
@
(
I
#
dstOff
#
)
(
Array
src
#
)
(
I
#
srcOff
#
)
top
@
(
I
#
top
#
)
|
dstOff
>=
top
=
return
()
|
otherwise
=
ST
$
\
s1
#
->
case
copyByteArray
#
src
#
(
2
#
*#
srcOff
#
)
dst
#
(
2
#
*#
dstOff
#
)
(
2
#
*
#
(
top
#
-#
dstOff
#
)
)
s1
#
of
case
copyByteArray
#
src
#
srcOff
#
dst
#
dstOff
#
(
top
#
-#
dstOff
#
)
s1
#
of
s2
#
->
(
#
s2
#
,
()
#
)
{-# INLINE copyI #-}
...
...
@@ -209,7 +209,7 @@ equal :: Array -- ^ First
equal
(
Array
src1
#
)
(
I
#
off1
#
)
(
Array
src2
#
)
(
I
#
off2
#
)
(
I
#
count
#
)
=
i
==
0
where
#
if
MIN_VERSION_base
(
4
,
11
,
0
)
i
=
I
#
(
compareByteArrays
#
src1
#
(
2
#
*#
off1
#
)
src2
#
(
2
#
*#
off2
#
)
(
2
#
*
#
count
#
)
)
i
=
I
#
(
compareByteArrays
#
src1
#
off1
#
src2
#
off2
#
count
#
)
#
else
i
=
fromIntegral
(
inlinePerformIO
(
memcmp
src1
#
off1
#
src2
#
off2
#
count
#
))
...
...
src/Data/Text/Encoding.hs
View file @
4b3667e0
...
...
@@ -7,6 +7,7 @@
-- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan,
-- (c) 2009 Duncan Coutts,
-- (c) 2008, 2009 Tom Harper
-- (c) 2021 Andrew Lelechenko
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
...
...
@@ -63,30 +64,29 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
import
Control.Exception
(
evaluate
,
try
,
throwIO
,
ErrorCall
(
ErrorCall
))
import
Control.Monad.ST
(
runST
)
import
Data.Bits
((
.&.
),
shiftR
)
import
Data.ByteString
as
B
import
qualified
Data.ByteString.Internal
as
B
import
qualified
Data.ByteString.
Short.
Internal
as
SBS
import
Data.Foldable
(
traverse_
)
import
Data.Text.Encoding.Error
(
OnDecodeError
,
UnicodeException
,
strictDecode
,
lenientDecode
)
import
Data.Text.Internal
(
Text
(
..
),
safe
,
text
)
import
Data.Text.Internal.Private
(
runText
)
import
Data.Text.Internal.Unsafe
(
unsafeWithForeignPtr
)
import
Data.Text.Internal.Unsafe.Char
(
ord
,
unsafeWrite
)
import
Data.Text.Internal.Unsafe.Char
(
unsafeWrite
)
import
Data.Text.Show
()
import
Data.Text.Unsafe
(
unsafeDupablePerformIO
)
import
Data.Word
(
Word8
,
Word16
,
Word32
)
import
Foreign.C.Types
(
CSize
(
CSize
)
)
import
Data.Word
(
Word8
,
Word32
)
import
Foreign.C.Types
(
CSize
)
import
Foreign.Marshal.Utils
(
with
)
import
Foreign.Ptr
(
Ptr
,
minusPtr
,
nullPtr
,
plusPtr
)
import
Foreign.Storable
(
Storable
,
peek
,
poke
)
import
GHC.Base
(
ByteArray
#
,
MutableByteArray
#
)
import
GHC.Base
(
MutableByteArray
#
)
import
qualified
Data.ByteString.Builder
as
B
import
qualified
Data.ByteString.Builder.Internal
as
B
hiding
(
empty
,
append
)
import
qualified
Data.ByteString.Builder.Prim
as
BP
import
qualified
Data.ByteString.Builder.Prim.Internal
as
BP
import
Data.Text.Internal.Encoding.Utf8
(
utf8LengthByLeader
)
import
qualified
Data.Text.Array
as
A
import
qualified
Data.Text.Internal.Encoding.Fusion
as
E
import
qualified
Data.Text.Internal.Encoding.Utf16
as
U16
import
qualified
Data.Text.Internal.Fusion
as
F
import
Data.Text.Internal.ByteStringCompat
#
if
defined
(
ASSERTS
)
...
...
@@ -124,12 +124,12 @@ decodeLatin1 ::
#
endif
ByteString
->
Text
decodeLatin1
bs
=
withBS
bs
aux
where
aux
fp
len
=
text
a
0
l
en
aux
fp
len
=
text
a
0
actualL
en
where
a
=
A
.
run
(
A
.
new
len
>>=
unsafeIOToST
.
go
)
go
dest
=
unsafeWithForeignPtr
fp
$
\
ptr
->
do
c_decode_latin1
(
A
.
maBA
dest
)
ptr
(
ptr
`
plusPtr
`
len
)
return
dest
(
a
,
actualLen
)
=
A
.
run
2
(
A
.
new
(
2
*
len
)
>>=
unsafeIOToST
.
go
)
go
dest
=
unsafeWithForeignPtr
fp
$
\
src
->
do
destLen
<-
c_decode_latin1
(
A
.
maBA
dest
)
src
(
src
`
plusPtr
`
len
)
return
(
dest
,
destLen
)
-- | Decode a 'ByteString' containing UTF-8 encoded text.
--
...
...
@@ -161,6 +161,8 @@ decodeUtf8With onErr bs = withBS bs aux
case
onErr
desc
(
Just
x
)
of
Nothing
->
loop
$
curPtr'
`
plusPtr
`
1
Just
c
-- TODO This is problematic, because even BMP replacement characters
-- can take longer than one UTF8 code unit (which is byte).
|
c
>
'
\xFFFF
'
->
throwUnsupportedReplChar
|
otherwise
->
do
destOff
<-
peek
destOffPtr
...
...
@@ -170,43 +172,14 @@ decodeUtf8With onErr bs = withBS bs aux
poke
destOffPtr
(
destOff
+
intToCSize
w
)
loop
$
curPtr'
`
plusPtr
`
1
loop
ptr
(
unsafeIOToST
.
go
)
=<<
A
.
new
len
-- TODO (len * 2 + 100) assumes that invalid input is asymptotically rare.