Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Menu
Open sidebar
Glasgow Haskell Compiler
Packages
text
Commits
cdc80a84
Commit
cdc80a84
authored
Aug 21, 2021
by
Bodigrim
Browse files
Use native (and naive) UTF8 decoder
parent
14c6ae11
Changes
9
Hide whitespace changes
Inline
Side-by-side
cbits/cbits.c
deleted
100644 → 0
View file @
14c6ae11
/*
* Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>.
*
* Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>.
*
* See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
*/
#include <string.h>
#include <stdint.h>
#include <stdio.h>
#if defined(__x86_64__)
#include <emmintrin.h>
#include <xmmintrin.h>
#endif
#include "text_cbits.h"
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12
static
const
uint8_t
utf8d
[]
=
{
/*
* The first part of the table maps bytes to character classes that
* to reduce the size of the transition table and create bitmasks.
*/
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
9
,
9
,
9
,
9
,
9
,
9
,
9
,
9
,
9
,
9
,
9
,
9
,
9
,
9
,
9
,
9
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
7
,
8
,
8
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
2
,
10
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
3
,
4
,
3
,
3
,
11
,
6
,
6
,
6
,
5
,
8
,
8
,
8
,
8
,
8
,
8
,
8
,
8
,
8
,
8
,
8
,
/*
* The second part is a transition table that maps a combination of
* a state of the automaton and a character class to a state.
*/
0
,
12
,
24
,
36
,
60
,
96
,
84
,
12
,
12
,
12
,
48
,
72
,
12
,
12
,
12
,
12
,
12
,
12
,
12
,
12
,
12
,
12
,
12
,
12
,
12
,
0
,
12
,
12
,
12
,
12
,
12
,
0
,
12
,
0
,
12
,
12
,
12
,
24
,
12
,
12
,
12
,
12
,
12
,
24
,
12
,
24
,
12
,
12
,
12
,
12
,
12
,
12
,
12
,
12
,
12
,
24
,
12
,
12
,
12
,
12
,
12
,
24
,
12
,
12
,
12
,
12
,
12
,
12
,
12
,
24
,
12
,
12
,
12
,
12
,
12
,
12
,
12
,
12
,
12
,
36
,
12
,
36
,
12
,
12
,
12
,
36
,
12
,
12
,
12
,
12
,
12
,
36
,
12
,
36
,
12
,
12
,
12
,
36
,
12
,
12
,
12
,
12
,
12
,
12
,
12
,
12
,
12
,
12
,
};
static
inline
uint32_t
decode
(
uint32_t
*
state
,
uint32_t
*
codep
,
uint32_t
byte
)
{
uint32_t
type
=
utf8d
[
byte
];
*
codep
=
(
*
state
!=
UTF8_ACCEPT
)
?
(
byte
&
0x3fu
)
|
(
*
codep
<<
6
)
:
(
0xff
>>
type
)
&
(
byte
);
return
*
state
=
utf8d
[
256
+
*
state
+
type
];
}
/*
* A best-effort decoder. Runs until it hits either end of input or
* the start of an invalid byte sequence.
*
* At exit, we update *destoff with the next offset to write to, *src
* with the next source location past the last one successfully
* decoded, and return the next source location to read from.
*
* Moreover, we expose the internal decoder state (state0 and
* codepoint0), allowing one to restart the decoder after it
* terminates (say, due to a partial codepoint).
*
* In particular, there are a few possible outcomes,
*
* 1) We decoded the buffer entirely:
* In this case we return srcend
* state0 == UTF8_ACCEPT
*
* 2) We met an invalid encoding
* In this case we return the address of the first invalid byte
* state0 == UTF8_REJECT
*
* 3) We reached the end of the buffer while decoding a codepoint
* In this case we return a pointer to the first byte of the partial codepoint
* state0 != UTF8_ACCEPT, UTF8_REJECT
*
*/
#if defined(__GNUC__) || defined(__clang__)
static
inline
uint8_t
const
*
_hs_text_decode_utf8_int
(
uint8_t
*
const
dest
,
size_t
*
destoff
,
const
uint8_t
**
src
,
const
uint8_t
*
srcend
,
uint32_t
*
codepoint0
,
uint32_t
*
state0
)
__attribute
((
always_inline
));
#endif
static
inline
uint8_t
const
*
_hs_text_decode_utf8_int
(
uint8_t
*
const
dest
,
size_t
*
destoff
,
const
uint8_t
**
src
,
const
uint8_t
*
srcend
,
uint32_t
*
codepoint0
,
uint32_t
*
state0
)
{
uint8_t
*
d
=
dest
+
*
destoff
;
const
uint8_t
*
s
=
*
src
,
*
last
=
*
src
;
uint32_t
state
=
*
state0
;
uint32_t
codepoint
=
*
codepoint0
;
while
(
s
<
srcend
)
{
if
(
decode
(
&
state
,
&
codepoint
,
*
s
++
)
!=
UTF8_ACCEPT
)
{
if
(
state
!=
UTF8_REJECT
)
continue
;
break
;
}
if
(
codepoint
<
0x80
){
*
d
++
=
(
uint8_t
)
codepoint
;
}
else
if
(
codepoint
<
0x800
){
*
d
++
=
(
uint8_t
)
(
0xC0
+
(
codepoint
>>
6
));
*
d
++
=
(
uint8_t
)
(
0x80
+
(
codepoint
&
0x3F
));
}
else
if
(
codepoint
<
0x10000
){
*
d
++
=
(
uint8_t
)
(
0xE0
+
(
codepoint
>>
12
));
*
d
++
=
(
uint8_t
)
(
0x80
+
((
codepoint
>>
6
)
&
0x3F
));
*
d
++
=
(
uint8_t
)
(
0x80
+
(
codepoint
&
0x3F
));
}
else
{
*
d
++
=
(
uint8_t
)
(
0xF0
+
(
codepoint
>>
18
));
*
d
++
=
(
uint8_t
)
(
0x80
+
((
codepoint
>>
12
)
&
0x3F
));
*
d
++
=
(
uint8_t
)
(
0x80
+
((
codepoint
>>
6
)
&
0x3F
));
*
d
++
=
(
uint8_t
)
(
0x80
+
(
codepoint
&
0x3F
));
}
last
=
s
;
}
*
destoff
=
d
-
dest
;
*
codepoint0
=
codepoint
;
*
state0
=
state
;
*
src
=
last
;
return
s
;
}
uint8_t
const
*
_hs_text_decode_utf8_state
(
uint8_t
*
const
dest
,
size_t
*
destoff
,
const
uint8_t
**
src
,
const
uint8_t
*
srcend
,
uint32_t
*
codepoint0
,
uint32_t
*
state0
)
{
_hs_text_decode_utf8_int
(
dest
,
destoff
,
src
,
srcend
,
codepoint0
,
state0
);
return
*
src
;
}
/*
* Helper to decode buffer and discard final decoder state
*/
const
uint8_t
*
_hs_text_decode_utf8
(
uint8_t
*
const
dest
,
size_t
*
destoff
,
const
uint8_t
*
src
,
const
uint8_t
*
const
srcend
)
{
uint32_t
codepoint
;
uint32_t
state
=
UTF8_ACCEPT
;
_hs_text_decode_utf8_int
(
dest
,
destoff
,
&
src
,
srcend
,
&
codepoint
,
&
state
);
return
src
;
}
include/text_cbits.h
deleted
100644 → 0
View file @
14c6ae11
/*
* Copyright (c) 2013 Bryan O'Sullivan <bos@serpentine.com>.
*/
#ifndef _text_cbits_h
#define _text_cbits_h
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12
#endif
src/Data/Text.hs
View file @
cdc80a84
...
...
@@ -231,7 +231,7 @@ import qualified Data.Text.Internal.Fusion.Common as S
import
Data.Text.Encoding
(
decodeUtf8'
,
encodeUtf8
)
import
Data.Text.Internal.Fusion
(
stream
,
reverseStream
,
unstream
)
import
Data.Text.Internal.Private
(
span_
)
import
Data.Text.Internal
(
Text
(
..
),
empty
,
firstf
,
mul
,
safe
,
text
)
import
Data.Text.Internal
(
Text
(
..
),
empty
,
firstf
,
mul
,
safe
,
text
,
append
)
import
Data.Text.Internal.Unsafe.Char
(
unsafeWrite
,
unsafeChr8
)
import
Data.Text.Show
(
singleton
,
unpack
,
unpackCString
#
)
import
qualified
Prelude
as
P
...
...
@@ -446,24 +446,6 @@ snoc :: Text -> Char -> Text
snoc
t
c
=
unstream
(
S
.
snoc
(
stream
t
)
(
safe
c
))
{-# INLINE snoc #-}
-- | /O(n)/ Appends one 'Text' to the other by copying both of them
-- into a new 'Text'.
append
::
Text
->
Text
->
Text
append
a
@
(
Text
arr1
off1
len1
)
b
@
(
Text
arr2
off2
len2
)
|
len1
==
0
=
b
|
len2
==
0
=
a
|
len
>
0
=
Text
(
A
.
run
x
)
0
len
|
otherwise
=
overflowError
"append"
where
len
=
len1
+
len2
x
::
ST
s
(
A
.
MArray
s
)
x
=
do
arr
<-
A
.
new
len
A
.
copyI
len1
arr
0
arr1
off1
A
.
copyI
len2
arr
len1
arr2
off2
return
arr
{-# NOINLINE append #-}
-- | /O(1)/ Returns the first character of a 'Text', which must be
-- non-empty.
head
::
Text
->
Char
...
...
src/Data/Text/Encoding.hs
View file @
cdc80a84
...
...
@@ -2,6 +2,8 @@
UnliftedFFITypes #-}
{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE ViewPatterns #-}
-- |
-- Module : Data.Text.Encoding
-- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan,
...
...
@@ -62,33 +64,30 @@ module Data.Text.Encoding
import
Control.Monad.ST.Unsafe
(
unsafeIOToST
,
unsafeSTToIO
)
import
Control.Exception
(
evaluate
,
try
,
throwIO
,
ErrorCall
(
ErrorCall
)
)
import
Control.Monad.ST
(
runST
)
import
Control.Exception
(
evaluate
,
try
)
import
Control.Monad.ST
(
runST
,
ST
)
import
Data.Bits
(
shiftR
,
(
.&.
))
import
Data.ByteString
(
ByteString
)
import
qualified
Data.ByteString
as
B
import
qualified
Data.ByteString.Internal
as
B
import
qualified
Data.ByteString.Short.Internal
as
SBS
import
Data.Foldable
(
traverse_
)
import
Data.Text.Encoding.Error
(
OnDecodeError
,
UnicodeException
,
strictDecode
,
lenientDecode
)
import
Data.Text.Internal
(
Text
(
..
),
safe
,
empty
,
text
)
import
Data.Text.Internal.Private
(
runText
)
import
Data.Text.Internal
(
Text
(
..
),
safe
,
empty
,
append
)
import
Data.Text.Internal.Unsafe
(
unsafeWithForeignPtr
)
import
Data.Text.Internal.Unsafe.Char
(
unsafeWrite
)
import
Data.Text.Show
(
)
import
Data.Text.Show
as
T
(
singleton
)
import
Data.Text.Unsafe
(
unsafeDupablePerformIO
)
import
Data.Word
(
Word8
,
Word32
)
import
Data.Word
(
Word8
)
import
Foreign.C.Types
(
CSize
(
..
))
import
Foreign.Marshal.Utils
(
with
)
import
Foreign.Ptr
(
Ptr
,
minusPtr
,
nullPtr
,
plusPtr
)
import
Foreign.Storable
(
Storable
,
peek
,
poke
,
peekByteOff
)
import
GHC.Exts
(
MutableByteArray
#
,
byteArrayContents
#
,
unsafeCoerce
#
)
import
Foreign.Ptr
(
Ptr
,
minusPtr
,
plusPtr
)
import
Foreign.Storable
(
poke
,
peekByteOff
)
import
GHC.Exts
(
byteArrayContents
#
,
unsafeCoerce
#
)
import
GHC.ForeignPtr
(
ForeignPtr
(
..
),
ForeignPtrContents
(
PlainPtr
))
import
qualified
Data.ByteString.Builder
as
B
import
qualified
Data.ByteString.Builder.Internal
as
B
hiding
(
empty
,
append
)
import
qualified
Data.ByteString.Builder.Prim
as
BP
import
qualified
Data.ByteString.Builder.Prim.Internal
as
BP
import
Data.Text.Internal.Encoding.Utf8
(
utf8LengthByLeader
)
import
Data.Text.Internal.Encoding.Utf8
(
utf8LengthByLeader
,
utf8DecodeStart
,
utf8DecodeContinue
,
DecoderResult
(
..
)
)
import
qualified
Data.Text.Array
as
A
import
qualified
Data.Text.Internal.Encoding.Fusion
as
E
import
qualified
Data.Text.Internal.Fusion
as
F
...
...
@@ -97,8 +96,6 @@ import Data.Text.Internal.ByteStringCompat
import
GHC.Stack
(
HasCallStack
)
#
endif
#
include
"text_cbits.h"
-- $strict
--
-- All of the single-parameter functions for decoding bytestrings
...
...
@@ -159,53 +156,77 @@ foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii
-- | Decode a 'ByteString' containing UTF-8 encoded text.
--
-- __NOTE__: The replacement character returned by 'OnDecodeError'
-- MUST be within the BMP plane; surrogate code points will
-- automatically be remapped to the replacement char @U+FFFD@
-- (/since 0.11.3.0/), whereas code points beyond the BMP will throw an
-- 'error' (/since 1.2.3.1/); For earlier versions of @text@ using
-- those unsupported code points would result in undefined behavior.
-- Surrogate code points in replacement character returned by 'OnDecodeError'
-- will be automatically remapped to the replacement char @U+FFFD@.
decodeUtf8With
::
#
if
defined
(
ASSERTS
)
HasCallStack
=>
#
endif
OnDecodeError
->
ByteString
->
Text
decodeUtf8With
onErr
bs
=
withBS
bs
aux
where
aux
fp
len
=
runText
$
\
done
->
do
let
go
(
A
.
MutableByteArray
dest
)
=
unsafeWithForeignPtr
fp
$
\
ptr
->
with
(
0
::
CSize
)
$
\
destOffPtr
->
do
let
end
=
ptr
`
plusPtr
`
len
loop
curPtr
=
do
curPtr'
<-
c_decode_utf8
dest
destOffPtr
curPtr
end
if
curPtr'
==
end
then
do
n
<-
peek
destOffPtr
unsafeSTToIO
(
done
(
A
.
MutableByteArray
dest
)
(
cSizeToInt
n
))
else
do
x
<-
peek
curPtr'
case
onErr
desc
(
Just
x
)
of
Nothing
->
loop
$
curPtr'
`
plusPtr
`
1
Just
c
-- TODO This is problematic, because even BMP replacement characters
-- can take longer than one UTF8 code unit (which is byte).
|
c
>
'
\xFFFF
'
->
throwUnsupportedReplChar
|
otherwise
->
do
destOff
<-
peek
destOffPtr
w
<-
unsafeSTToIO
$
unsafeWrite
(
A
.
MutableByteArray
dest
)
(
cSizeToInt
destOff
)
(
safe
c
)
poke
destOffPtr
(
destOff
+
intToCSize
w
)
loop
$
curPtr'
`
plusPtr
`
1
loop
ptr
-- TODO (len * 2 + 100) assumes that invalid input is asymptotically rare.
-- This is incorrect in general, but for now we just want to pass tests.
(
unsafeIOToST
.
go
)
=<<
A
.
new
(
len
*
2
+
100
)
where
desc
=
"Data.Text.Internal.Encoding.decodeUtf8: Invalid UTF-8 stream"
throwUnsupportedReplChar
=
throwIO
$
ErrorCall
"decodeUtf8With: non-BMP replacement characters not supported"
decodeUtf8With
onErr
bs
|
B
.
null
undecoded
=
txt
|
otherwise
=
txt
`
append
`
(
case
onErr
desc
(
Just
(
B
.
head
undecoded
))
of
Nothing
->
txt'
Just
c
->
T
.
singleton
c
`
append
`
txt'
)
where
(
txt
,
undecoded
)
=
decodeUtf8With2
onErr
mempty
bs
txt'
=
decodeUtf8With
onErr
(
B
.
tail
undecoded
)
desc
=
"Data.Text.Internal.Encoding: Invalid UTF-8 stream"
-- | Decode two consecutive bytestrings, returning Text and undecoded remainder.
decodeUtf8With2
::
#
if
defined
(
ASSERTS
)
HasCallStack
=>
#
endif
OnDecodeError
->
ByteString
->
ByteString
->
(
Text
,
ByteString
)
decodeUtf8With2
onErr
bs1
@
(
B
.
length
->
len1
)
bs2
@
(
B
.
length
->
len2
)
=
runST
$
do
marr
<-
A
.
new
len'
outer
marr
len'
0
0
where
len
=
len1
+
len2
len'
=
len
+
4
index
i
|
i
<
len1
=
B
.
index
bs1
i
|
otherwise
=
B
.
index
bs2
(
i
-
len1
)
decodeFrom
::
Int
->
DecoderResult
decodeFrom
off
=
step
(
off
+
1
)
(
utf8DecodeStart
(
index
off
))
where
step
i
(
Incomplete
a
b
)
|
i
<
len
=
step
(
i
+
1
)
(
utf8DecodeContinue
(
index
i
)
a
b
)
step
_
st
=
st
outer
::
forall
s
.
A
.
MArray
s
->
Int
->
Int
->
Int
->
ST
s
(
Text
,
ByteString
)
outer
dst
dstLen
=
inner
where
inner
srcOff
dstOff
|
srcOff
>=
len
=
do
A
.
shrinkM
dst
dstOff
arr
<-
A
.
unsafeFreeze
dst
return
(
Text
arr
0
dstOff
,
mempty
)
|
dstOff
+
4
>
dstLen
=
do
let
dstLen'
=
dstLen
+
4
dst'
<-
A
.
resizeM
dst
dstLen'
outer
dst'
dstLen'
srcOff
dstOff
|
otherwise
=
case
decodeFrom
srcOff
of
Accept
c
->
do
d
<-
unsafeWrite
dst
dstOff
c
inner
(
srcOff
+
d
)
(
dstOff
+
d
)
Reject
->
case
onErr
desc
(
Just
(
index
srcOff
))
of
Nothing
->
inner
(
srcOff
+
1
)
dstOff
Just
c
->
do
d
<-
unsafeWrite
dst
dstOff
(
safe
c
)
inner
(
srcOff
+
1
)
(
dstOff
+
d
)
Incomplete
{}
->
do
A
.
shrinkM
dst
dstOff
arr
<-
A
.
unsafeFreeze
dst
let
bs
=
if
srcOff
>=
len1
then
B
.
drop
(
srcOff
-
len1
)
bs2
else
B
.
drop
srcOff
(
bs1
`
B
.
append
`
bs2
)
return
(
Text
arr
0
dstOff
,
bs
)
desc
=
"Data.Text.Internal.Encoding: Invalid UTF-8 stream"
-- $stream
--
...
...
@@ -272,9 +293,6 @@ instance Show Decoding where
showString
" _"
where
prec
=
10
;
prec'
=
prec
+
1
newtype
CodePoint
=
CodePoint
Word32
deriving
(
Eq
,
Show
,
Num
,
Storable
)
newtype
DecoderState
=
DecoderState
Word32
deriving
(
Eq
,
Show
,
Num
,
Storable
)
-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8
-- encoded text that is known to be valid.
--
...
...
@@ -300,72 +318,11 @@ streamDecodeUtf8With ::
HasCallStack
=>
#
endif
OnDecodeError
->
ByteString
->
Decoding
streamDecodeUtf8With
onErr
=
decodeChunk
B
.
empty
0
0
where
-- We create a slightly larger than necessary buffer to accommodate a
-- potential code point started in the last buffer (@undecoded0@), or
-- replacement characters for each byte in @undecoded0@ if the
-- sequence turns out to be invalid. There can be up to three bytes there,
-- hence we allocate @len+3@ bytes.
decodeChunk
::
ByteString
->
CodePoint
->
DecoderState
->
ByteString
->
Decoding
decodeChunk
undecoded0
codepoint0
state0
bs
=
withBS
bs
aux
where
-- TODO Replace (+100) with something sensible.
aux
fp
len
=
runST
$
(
unsafeIOToST
.
decodeChunkToBuffer
)
=<<
A
.
new
(
len
+
100
)
where
decodeChunkToBuffer
::
A
.
MArray
s
->
IO
Decoding
decodeChunkToBuffer
(
A
.
MutableByteArray
dest
)
=
unsafeWithForeignPtr
fp
$
\
ptr
->
with
(
0
::
CSize
)
$
\
destOffPtr
->
with
codepoint0
$
\
codepointPtr
->
with
state0
$
\
statePtr
->
with
nullPtr
$
\
curPtrPtr
->
let
end
=
ptr
`
plusPtr
`
len
loop
curPtr
=
do
prevState
<-
peek
statePtr
poke
curPtrPtr
curPtr
lastPtr
<-
c_decode_utf8_with_state
dest
destOffPtr
curPtrPtr
end
codepointPtr
statePtr
state
<-
peek
statePtr
case
state
of
UTF8_REJECT
->
do
-- We encountered an encoding error
poke
statePtr
0
let
skipByte
x
=
case
onErr
desc
(
Just
x
)
of
Nothing
->
return
()
Just
c
->
do
destOff
<-
peek
destOffPtr
w
<-
unsafeSTToIO
$
unsafeWrite
(
A
.
MutableByteArray
dest
)
(
cSizeToInt
destOff
)
(
safe
c
)
poke
destOffPtr
(
destOff
+
intToCSize
w
)
if
ptr
==
lastPtr
&&
prevState
/=
UTF8_ACCEPT
then
do
-- If we can't complete the sequence @undecoded0@ from
-- the previous chunk, we invalidate the bytes from
-- @undecoded0@ and retry decoding the current chunk from
-- the initial state.
traverse_
skipByte
(
B
.
unpack
undecoded0
)
loop
lastPtr
else
do
peek
lastPtr
>>=
skipByte
loop
(
lastPtr
`
plusPtr
`
1
)
_
->
do
-- We encountered the end of the buffer while decoding
n
<-
peek
destOffPtr
codepoint
<-
peek
codepointPtr
chunkText
<-
unsafeSTToIO
$
do
let
l
=
cSizeToInt
n
A
.
shrinkM
(
A
.
MutableByteArray
dest
)
l
arr
<-
A
.
unsafeFreeze
(
A
.
MutableByteArray
dest
)
return
$!
text
arr
0
l
let
left
=
lastPtr
`
minusPtr
`
ptr
!
undecoded
=
case
state
of
UTF8_ACCEPT
->
B
.
empty
_
|
left
==
0
&&
prevState
/=
UTF8_ACCEPT
->
B
.
append
undecoded0
bs
|
otherwise
->
B
.
drop
left
bs
return
$
Some
chunkText
undecoded
(
decodeChunk
undecoded
codepoint
state
)
in
loop
ptr
desc
=
"Data.Text.Internal.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream"
streamDecodeUtf8With
onErr
=
go
mempty
where
go
bs1
bs2
=
Some
txt
undecoded
(
go
undecoded
)
where
(
txt
,
undecoded
)
=
decodeUtf8With2
onErr
bs1
bs2
-- | Decode a 'ByteString' containing UTF-8 encoded text that is known
-- to be valid.
...
...
@@ -551,15 +508,3 @@ encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt))
cSizeToInt
::
CSize
->
Int
cSizeToInt
=
fromIntegral
intToCSize
::
Int
->
CSize
intToCSize
=
fromIntegral
foreign
import
ccall
unsafe
"_hs_text_decode_utf8"
c_decode_utf8
::
MutableByteArray
#
s
->
Ptr
CSize
->
Ptr
Word8
->
Ptr
Word8
->
IO
(
Ptr
Word8
)
foreign
import
ccall
unsafe
"_hs_text_decode_utf8_state"
c_decode_utf8_with_state
::
MutableByteArray
#
s
->
Ptr
CSize
->
Ptr
(
Ptr
Word8
)
->
Ptr
Word8
->
Ptr
CodePoint
->
Ptr
DecoderState
->
IO
(
Ptr
Word8
)
src/Data/Text/Internal.hs
View file @
cdc80a84
...
...
@@ -33,6 +33,7 @@ module Data.Text.Internal
-- * Code that must be here for accessibility
,
empty
,
empty_
,
append
-- * Utilities
,
firstf
-- * Checked multiplication
...
...
@@ -47,6 +48,7 @@ module Data.Text.Internal
import
Control.Exception
(
assert
)
import
GHC.Stack
(
HasCallStack
)
#
endif
import
Control.Monad.ST
(
ST
)
import
Data.Bits
import
Data.Int
(
Int32
,
Int64
)
import
Data.Text.Internal.Unsafe.Char
(
ord
)
...
...
@@ -89,6 +91,24 @@ empty_ :: Text
empty_
=
Text
A
.
empty
0
0
{-# NOINLINE empty_ #-}
-- | /O(n)/ Appends one 'Text' to the other by copying both of them
-- into a new 'Text'.
append
::
Text
->
Text
->
Text
append
a
@
(
Text
arr1
off1
len1
)
b
@
(
Text
arr2
off2
len2
)
|
len1
==
0
=
b
|
len2
==
0
=
a
|
len
>
0
=
Text
(
A
.
run
x
)
0
len
|
otherwise
=
error
$
"Data.Text.append: size overflow"
where
len
=
len1
+
len2
x
::
ST
s
(
A
.
MArray
s
)
x
=
do
arr
<-
A
.
new
len
A
.
copyI
len1
arr
0
arr1
off1
A
.
copyI
len2
arr
len1
arr2
off2
return
arr
{-# NOINLINE append #-}
-- | Construct a 'Text' without invisibly pinning its byte array in
-- memory if its length has dwindled to zero.
text
::
...
...
src/Data/Text/Internal/Encoding/Utf8.hs
View file @
cdc80a84
...
...
@@ -33,6 +33,10 @@ module Data.Text.Internal.Encoding.Utf8
,
validate2
,
validate3
,
validate4
-- * Naive decoding
,
DecoderResult
(
..
)
,
utf8DecodeStart
,
utf8DecodeContinue
)
where
#
if
defined
(
ASSERTS
)
...
...
@@ -40,7 +44,7 @@ import Control.Exception (assert)
import
GHC.Stack
(
HasCallStack
)
#
endif
import
Data.Bits
(
Bits
(
..
),
FiniteBits
(
..
))
import
Data.Char
(
ord
)
import
Data.Char
(
ord
,
chr
)
import
GHC.Exts
import
GHC.Word
(
Word8
(
..
))
...
...
@@ -213,3 +217,66 @@ validate4 x1 x2 x3 x4 = validate4_1 || validate4_2 || validate4_3
intToWord8
::
Int
->
Word8
intToWord8
=
fromIntegral
word8ToInt
::
Word8
->
Int
word8ToInt
=
fromIntegral
-------------------------------------------------------------------------------
-- Naive UTF8 decoder.
-- See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for the explanation of the state machine.
newtype
ByteClass
=
ByteClass
Word8
byteToClass
::
Word8
->
ByteClass
byteToClass
n
=
ByteClass
(
W8
#
el
#
)
where
!
(
I
#
n
#
)
=
word8ToInt
n
el
#
=
indexWord8OffAddr
#
table
#
n
#
table
#
::
Addr
#
table
#
=
"
\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\NUL\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\SOH\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\a\b\b\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\STX\n\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\ETX\EOT\ETX\ETX\v\ACK\ACK\ACK\ENQ\b\b\b\b\b\b\b\b\b\b\b
"
#
newtype
DecoderState
=
DecoderState
Word8
deriving
(
Eq
)
utf8AcceptState
::
DecoderState
utf8AcceptState
=
DecoderState
0
utf8RejectState
::
DecoderState
utf8RejectState
=
DecoderState
12
updateState
::
ByteClass
->
DecoderState
->
DecoderState
updateState
(
ByteClass
c
)
(
DecoderState
s
)
=
DecoderState
(
W8
#
el
#
)
where
!
(
I
#
n
#
)
=
word8ToInt
(
c
+
s
)
el
#
=
indexWord8OffAddr
#
table
#
n
#
table
#
::
Addr
#
table
#
=
"
\NUL\f\CAN
$<`T
\f\f\f
0H
\f\f\f\f\f\f\f\f\f\f\f\f\f\NUL\f\f\f\f\f\NUL\f\NUL\f\f\f\CAN\f\f\f\f\f\CAN\f\CAN\f\f\f\f\f\f\f\f\f\CAN\f\f\f\f\f\CAN\f\f\f\f\f\f\f\CAN\f\f\f\f\f\f\f\f\f
$
\f
$
\f\f\f
$
\f\f\f\f\f
$
\f
$
\f\f\f
$
\f\f\f\f\f\f\f\f\f\f
"
#
newtype
CodePoint
=
CodePoint
Int
data
DecoderResult
=
Accept
!
Char
|
Incomplete
!
DecoderState
!
CodePoint
|
Reject
utf8DecodeStart
::
Word8
->
DecoderResult
utf8DecodeStart
w
|
st
==
utf8AcceptState
=
Accept
(
chr
(
word8ToInt
w
))
|
st
==
utf8RejectState
=
Reject
|
otherwise
=
Incomplete
st
(
CodePoint
cp
)
where