Commit 4b3fdd4f authored by Bodigrim
Browse files

Use simdutf for UTF8 validation

parent 601a5730
This diff is collapsed.
#include "simdutf.h"
// C-linkage entry point used by the Haskell FFI import of the same name.
// Hands the `len` bytes starting at `str` to simdutf's UTF-8 validator and
// reports the verdict as an int: 1 when the validator accepts the input,
// 0 otherwise.
extern "C"
int _hs_text_is_valid_utf8(const char* str, size_t len){
  const bool is_valid = simdutf::validate_utf8(str, len);
  return is_valid ? 1 : 0;
}
This diff is collapsed.
......@@ -78,7 +78,7 @@ import Data.Text.Internal.Unsafe.Char (unsafeWrite)
import Data.Text.Show as T (singleton)
import Data.Text.Unsafe (unsafeDupablePerformIO)
import Data.Word (Word8)
import Foreign.C.Types (CSize(..))
import Foreign.C.Types (CSize(..), CInt(..))
import Foreign.Ptr (Ptr, minusPtr, plusPtr)
import Foreign.Storable (poke, peekByteOff)
import GHC.Exts (byteArrayContents#, unsafeCoerce#)
......@@ -154,6 +154,10 @@ decodeLatin1 bs = withBS bs $ \fp len -> runST $ do
-- | FFI binding to the C symbol @_hs_text_is_ascii@. It receives two byte
-- pointers and yields a 'CSize'.
--
-- NOTE(review): presumably the two pointers are the start and the end of the
-- buffer and the result is the length of its leading ASCII run -- confirm
-- against the C implementation.
foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii
:: Ptr Word8 -> Ptr Word8 -> IO CSize
-- | Report whether the entire 'ByteString' passes the C-side UTF-8 check.
--
-- The underlying buffer is handed to 'c_is_valid_utf8' together with its
-- length; any non-zero answer from the C side is interpreted as "valid".
isValidBS :: ByteString -> Bool
isValidBS bs = withBS bs $ \fp len ->
  unsafeDupablePerformIO $ unsafeWithForeignPtr fp $ \ptr -> do
    verdict <- c_is_valid_utf8 ptr (fromIntegral len)
    return (verdict /= 0)
-- | Decode a 'ByteString' containing UTF-8 encoded text.
--
-- Surrogate code points in replacement character returned by 'OnDecodeError'
......@@ -164,6 +168,9 @@ decodeUtf8With ::
#endif
OnDecodeError -> ByteString -> Text
decodeUtf8With onErr bs
| isValidBS bs =
let !(SBS.SBS arr) = SBS.toShort bs in
(Text (A.ByteArray arr) 0 (B.length bs))
| B.null undecoded = txt
| otherwise = txt `append` (case onErr desc (Just (B.head undecoded)) of
Nothing -> txt'
......@@ -190,6 +197,21 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
| i < len1 = B.index bs1 i
| otherwise = B.index bs2 (i - len1)
-- Length of a prefix of @bs2@ that is likely to end on a code point boundary.
-- Only the last four bytes of @bs2@ are inspected, scanning backwards:
--
--   * if the final byte is ASCII, the whole of @bs2@ is taken;
--   * otherwise the prefix stops just before the nearest byte >= 0xC0
--     (a byte that starts a code point);
--   * if neither is found, the guess is 0.
--
-- Data.ByteString.findIndexEnd would do this job, but it is unavailable
-- before bytestring-0.10.12.0, hence the hand-written scan.
guessUtf8Boundary :: Int
guessUtf8Boundary
  | endsWithAscii = len2     -- last char is ASCII
  | leadAt 1      = len2 - 1 -- last char starts a code point
  | leadAt 2      = len2 - 2 -- pre-last char starts a code point
  | leadAt 3      = len2 - 3
  | leadAt 4      = len2 - 4
  | otherwise     = 0
  where
    -- The k-th byte counted from the end of bs2 (k = 1 is the last byte).
    -- Callers check the length first, so no out-of-range index is forced.
    byteFromEnd :: Int -> Word8
    byteFromEnd k = B.index bs2 (len2 - k)

    endsWithAscii :: Bool
    endsWithAscii = len2 >= 1 && byteFromEnd 1 < 0x80

    -- Does bs2 have at least k bytes, with the k-th from the end >= 0xC0?
    leadAt :: Int -> Bool
    leadAt k = len2 >= k && byteFromEnd k >= 0xC0
decodeFrom :: Int -> DecoderResult
decodeFrom off = step (off + 1) (utf8DecodeStart (index off))
where
......@@ -205,10 +227,21 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
A.shrinkM dst dstOff
arr <- A.unsafeFreeze dst
return (Text arr 0 dstOff, mempty)
| srcOff >= len1
, srcOff < len1 + guessUtf8Boundary
, dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen
, bs <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2)
, isValidBS bs = do
withBS bs $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
unsafeSTToIO $ A.copyP dst dstOff src (len1 + guessUtf8Boundary - srcOff)
inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff))
| dstOff + 4 > dstLen = do
let dstLen' = dstLen + 4
dst' <- A.resizeM dst dstLen'
outer dst' dstLen' srcOff dstOff
| otherwise = case decodeFrom srcOff of
Accept c -> do
d <- unsafeWrite dst dstOff c
......@@ -508,3 +541,6 @@ encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt))
-- | Convert a C @size_t@ value, as returned by FFI calls, to a Haskell 'Int'.
cSizeToInt :: CSize -> Int
cSizeToInt n = fromIntegral n
-- | FFI binding to the C-linkage symbol @_hs_text_is_valid_utf8@. It takes a
-- pointer to the bytes to check and their count, and yields a 'CInt';
-- 'isValidBS' treats any non-zero result as "the input is valid UTF-8".
foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
:: Ptr Word8 -> CSize -> IO CInt
cabal-version: >= 1.10
cabal-version: 2.2
name: text
version: 1.2.5.0
......@@ -38,7 +38,7 @@ description:
based on the well-respected and liberally
licensed [ICU library](http://site.icu-project.org/).
license: BSD2
license: BSD-2-Clause
license-file: LICENSE
author: Bryan O'Sullivan <bos@serpentine.com>
maintainer: Haskell Text Team <andrew.lelechenko@gmail.com>, Core Libraries Committee
......@@ -54,6 +54,7 @@ extra-source-files:
README.markdown
changelog.md
scripts/*.hs
include/*.h
tests/literal-rule-test.sh
tests/LiteralRuleTest.hs
......@@ -67,7 +68,22 @@ library
cbits/measure_off.c
cbits/reverse.c
cbits/utils.c
cxx-sources: cbits/simdutf.cpp
cbits/validate_utf8.cpp
include-dirs: include
hs-source-dirs: src
cxx-options: -std=c++17
if os(windows)
if arch(x86_64)
extra-libraries: stdc++-6 gcc_s_seh-1
else
extra-libraries: stdc++-6 gcc_s_dw2-1
else
if os(darwin)
extra-libraries: c++
else
extra-libraries: stdc++
exposed-modules:
Data.Text
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment