Commit 14c6ae11 authored by Bodigrim
Browse files

Reimplement decodeASCII and decodeLatin1 to share C code

parent 5945cafa
......@@ -55,26 +55,6 @@ decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
return *state = utf8d[256 + *state + type];
}
/*
 * Transcode the Latin-1 bytes in [src, srcend) into UTF-8 at dest and
 * return the number of bytes written. Bytes below 0x80 are copied as-is;
 * every other byte becomes a two-byte sequence, so the destination needs
 * room for up to two output bytes per input byte (no bounds are checked).
 */
size_t
_hs_text_decode_latin1(uint8_t *dest, const uint8_t *src,
                       const uint8_t *srcend)
{
  uint8_t *out = dest;
  const uint8_t *in = src;

  for (; in != srcend; in++) {
    const uint8_t c = *in;
    if (c & 0x80) {
      /* Lead byte carries the top two bits, continuation byte the low six. */
      *out++ = (uint8_t) (0xC0 | (c >> 6));
      *out++ = (uint8_t) (0x80 | (c & 0x3F));
    } else {
      *out++ = c;
    }
  }
  return (size_t) (out - dest);
}
/*
* A best-effort decoder. Runs until it hits either end of input or
* the start of an invalid byte sequence.
......
/*
* Copyright (c) 2021 Andrew Lelechenko <andrew.lelechenko@gmail.com>
*/
#include <string.h>
#include <stdint.h>
#include <sys/types.h>
#ifdef __x86_64__
#include <emmintrin.h>
#include <xmmintrin.h>
#endif
#include <stdbool.h>
/*
  _hs_text_is_ascii takes a byte buffer [src0, srcend)
  and returns the length of its longest prefix consisting solely of
  bytes below 0x80 (i.e. the ASCII-compatible prefix).

  The buffer is scanned in progressively smaller strides: 16 bytes at a
  time (x86_64 only), then 8 bytes, then byte by byte. A wide stride that
  contains a non-ASCII byte is abandoned and re-examined by the narrower
  loops, so the result is exact.

  Loop bounds are expressed as "bytes remaining" (srcend - src) rather
  than by comparing against srcend - K: for inputs shorter than K bytes
  the latter would form a pointer before the start of the buffer, which
  is undefined behaviour.
*/
const size_t _hs_text_is_ascii(const uint8_t *src0, const uint8_t *srcend){
  const uint8_t *src = src0;

#ifdef __x86_64__
  // I experimented with larger vector registers,
  // but did not notice any measurable speed up, so let's keep it simple.
  while (srcend - src >= 16){
    __m128i w128 = _mm_loadu_si128((const __m128i *)src);
    // Gather the top bit of each byte; a set bit marks a byte >= 128.
    uint16_t mask = _mm_movemask_epi8(w128);
    if (mask) break;
    src+= 16;
  }
#endif

  while (srcend - src >= 8){
    uint64_t w64;
    // memcpy avoids an unaligned / aliasing-violating load.
    memcpy(&w64, src, sizeof(uint64_t));
    if (w64 & 0x8080808080808080ULL) break;
    src+= 8;
  }

  while (src < srcend){
    uint8_t leadByte = *src;
    if(leadByte >= 0x80) break;
    src++;
  }

  return src - src0;
}
......@@ -64,21 +64,24 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
import Control.Exception (evaluate, try, throwIO, ErrorCall(ErrorCall))
import Control.Monad.ST (runST)
import Data.ByteString as B
import Data.Bits (shiftR, (.&.))
import Data.ByteString (ByteString)
import qualified Data.ByteString as B
import qualified Data.ByteString.Internal as B
import qualified Data.ByteString.Short.Internal as SBS
import Data.Foldable (traverse_)
import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode)
import Data.Text.Internal (Text(..), safe, text)
import Data.Text.Internal (Text(..), safe, empty, text)
import Data.Text.Internal.Private (runText)
import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
import Data.Text.Internal.Unsafe.Char (unsafeWrite)
import Data.Text.Show ()
import Data.Text.Unsafe (unsafeDupablePerformIO)
import Data.Word (Word8, Word32)
import Foreign.C.Types (CSize)
import Foreign.C.Types (CSize(..))
import Foreign.Marshal.Utils (with)
import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
import Foreign.Storable (Storable, peek, poke)
import Foreign.Storable (Storable, peek, poke, peekByteOff)
import GHC.Exts (MutableByteArray#, byteArrayContents#, unsafeCoerce#)
import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr))
import qualified Data.ByteString.Builder as B
......@@ -112,7 +115,13 @@ import GHC.Stack (HasCallStack)
-- | /Deprecated/. Decode a 'ByteString' containing 7-bit ASCII
-- encoded text.
decodeASCII :: ByteString -> Text
decodeASCII = decodeUtf8
-- Validate that every byte is 7-bit ASCII and, if so, reuse the bytes
-- unchanged as the payload of the resulting 'Text'. Any byte >= 0x80
-- aborts with 'error', reporting the offset of the first offender.
decodeASCII bs = withBS bs $ \fp len ->
  if len == 0
    then empty
    else runST $ do
      -- Length of the leading all-ASCII run, as measured by the C helper.
      prefixLen <- unsafeIOToST $ unsafeWithForeignPtr fp $ \start ->
        fmap cSizeToInt (c_is_ascii start (start `plusPtr` len))
      if prefixLen /= len
        then error $ "decodeASCII: detected non-ASCII codepoint at " ++ show prefixLen
        else do
          -- Copy the bytes into an unpinned byte array via ShortByteString.
          let !(SBS.SBS arr) = SBS.toShort bs
          return (Text (A.ByteArray arr) 0 len)
{-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-}
-- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
......@@ -124,13 +133,29 @@ decodeLatin1 ::
HasCallStack =>
#endif
ByteString -> Text
-- Previous implementation: the whole conversion is delegated to the C
-- routine bound as 'c_decode_latin1'.
decodeLatin1 bs = withBS bs aux where
aux fp len = text a 0 actualLen
where
-- The output array is allocated with 2 * len bytes; 'actualLen' is the
-- count reported back by the C call.
(a, actualLen) = A.run2 (A.new (2 * len) >>= unsafeIOToST . go)
-- Hand the destination array plus the source range [src, src + len) to C.
go (A.MutableByteArray dest) = unsafeWithForeignPtr fp $ \src -> do
destLen <- c_decode_latin1 dest src (src `plusPtr` len)
return (A.MutableByteArray dest, destLen)
-- Convert Latin-1 to UTF-8 by alternating between two steps: bulk-copy
-- the next run of ASCII bytes, or expand a single non-ASCII byte into a
-- two-byte sequence.
decodeLatin1 bs = withBS bs $ \fp len -> runST $ do
  -- Each input byte yields at most two output bytes.
  dst <- A.new (2 * len)
  let go srcOff dstOff
        | srcOff >= len = return dstOff
        | otherwise = do
            -- How many bytes starting at srcOff are plain ASCII?
            run <- unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
              fmap cSizeToInt (c_is_ascii (src `plusPtr` srcOff) (src `plusPtr` len))
            if run /= 0
              then do
                -- ASCII bytes are identical in UTF-8: copy the run verbatim.
                unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
                  unsafeSTToIO $ A.copyFromPointer dst dstOff (src `plusPtr` srcOff) run
                go (srcOff + run) (dstOff + run)
              else do
                -- The byte at srcOff is >= 0x80: emit lead + continuation byte.
                byte <- unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> peekByteOff src srcOff
                A.unsafeWrite dst dstOff (0xC0 + (byte `shiftR` 6))
                A.unsafeWrite dst (dstOff + 1) (0x80 + (byte .&. 0x3F))
                go (srcOff + 1) (dstOff + 2)
  actualLen <- go 0 0
  -- Trim the over-allocated buffer down to what was actually written.
  dst' <- A.resizeM dst actualLen
  arr <- A.unsafeFreeze dst'
  return (Text arr 0 actualLen)
-- | Binding to the C routine @_hs_text_is_ascii@. Given a pointer to the
-- first byte of a buffer and a pointer one past its last byte, it yields
-- the length of the leading run of bytes below 0x80.
foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii
:: Ptr Word8 -> Ptr Word8 -> IO CSize
-- | Decode a 'ByteString' containing UTF-8 encoded text.
--
......@@ -538,6 +563,3 @@ foreign import ccall unsafe "_hs_text_decode_utf8_state" c_decode_utf8_with_stat
:: MutableByteArray# s -> Ptr CSize
-> Ptr (Ptr Word8) -> Ptr Word8
-> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)
-- | Binding to the C routine @_hs_text_decode_latin1@. Arguments are the
-- destination array, the start of the source bytes and the end of the
-- source bytes; the result is the number of bytes written to the
-- destination.
-- NOTE(review): the C function is declared to return @size_t@ while this
-- import uses 'Int' — confirm the mismatch is intentional.
foreign import ccall unsafe "_hs_text_decode_latin1" c_decode_latin1
:: MutableByteArray# s -> Ptr Word8 -> Ptr Word8 -> IO Int
{-# LANGUAGE BangPatterns,CPP #-}
{-# LANGUAGE Trustworthy #-}
{-# OPTIONS_GHC -fno-warn-deprecations #-}
-- |
-- Module : Data.Text.Lazy.Encoding
-- Copyright : (c) 2009, 2010 Bryan O'Sullivan
......@@ -80,7 +83,7 @@ import Data.Text.Unsafe (unsafeDupablePerformIO)
-- | /Deprecated/. Decode a 'ByteString' containing 7-bit ASCII
-- encoded text.
decodeASCII :: B.ByteString -> Text
decodeASCII = decodeUtf8
-- Decode every strict chunk independently with the strict decoder and
-- lazily link the results back together in order.
decodeASCII lbs = foldr prepend empty (B.toChunks lbs)
  where
    prepend strictChunk rest = chunk (TE.decodeASCII strictChunk) rest
{-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-}
-- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
......
......@@ -65,6 +65,7 @@ flag developer
library
c-sources: cbits/cbits.c
cbits/is_ascii.c
cbits/measure_off.c
cbits/reverse.c
cbits/utils.c
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment