diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..316009b8ee09fa710a8d76fcd42e8a6ec1e20714 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +dist +cabal-dev diff --git a/Data/Text/Encoding.hs b/Data/Text/Encoding.hs index 533f604a8baafbe1d5ce47910b77a93e6a9c774f..73645244dd0e6ef8828fbb64d3a6a09be6f17000 100644 --- a/Data/Text/Encoding.hs +++ b/Data/Text/Encoding.hs @@ -54,6 +54,14 @@ module Data.Text.Encoding , encodeUtf16BE , encodeUtf32LE , encodeUtf32BE + +#if MIN_VERSION_bytestring(0,10,4) + -- * Generic encoding of Text + -- , encodeStreamWithB + -- , encodeTextWithB + -- , encodeUtf8Builder + , encodeUtf8Escaped +#endif ) where import Control.Exception (evaluate, try) @@ -66,6 +74,13 @@ import Control.Monad.ST (runST) import Data.Bits ((.&.)) import Data.ByteString as B import Data.ByteString.Internal as B +#if MIN_VERSION_bytestring(0,10,4) +import qualified Data.ByteString.Builder as B +import qualified Data.ByteString.Builder.Internal as B +import qualified Data.ByteString.Builder.Prim as BP +import qualified Data.ByteString.Builder.Prim.Internal as BP +import qualified Data.ByteString.Lazy as BL +#endif import Data.Text () import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode) import Data.Text.Internal (Text(..), safe, textP) @@ -291,6 +306,61 @@ decodeUtf8' = unsafeDupablePerformIO . try . evaluate . decodeUtf8With strictDec -- | Encode text using UTF-8 encoding. encodeUtf8 :: Text -> ByteString +#if MIN_VERSION_bytestring(0,10,4) + +encodeUtf8 = + BL.toStrict . B.toLazyByteString + . encodeUtf8Escaped (BP.liftFixedToBounded BP.word8) + +-- | Encode text using UTF-8 encoding and escape the ASCII characters using +-- a 'BP.PrimBounded'. +encodeUtf8Escaped :: BP.BoundedPrim Word8 -> Text -> B.Builder +encodeUtf8Escaped be (Text arr off len) = + B.builder step + where + bound = max 4 $ BP.sizeBound be + iend = off + len + step !k = + outerLoop off + where + outerLoop !i0 !br@(B.BufferRange op0 ope) + | i0 >= iend = k br + | op0 `plusPtr` bound < ope = + goPartial (i0 + min outRemaining inpRemaining) + | otherwise = return $ B.bufferFull bound op0 (outerLoop i0) + where + outRemaining = (ope `minusPtr` op0) `div` bound + inpRemaining = iend - i0 + + goPartial !iendTmp = go i0 op0 + where + go !i !op + | i < iendTmp = case A.unsafeIndex arr i of + w | w <= 0x7F -> do + BP.runB be (fromIntegral w) op >>= go (i + 1) + | w <= 0x7FF -> do + poke8 0 $ (w `shiftR` 6) + 0xC0 + poke8 1 $ (w .&. 0x3f) + 0x80 + go (i + 1) (op `plusPtr` 2) + | 0xD800 <= w && w <= 0xDBFF -> do + let c = ord $ U16.chr2 w (A.unsafeIndex arr (i+1)) + poke8 0 $ (c `shiftR` 18) + 0xF0 + poke8 1 $ ((c `shiftR` 12) .&. 0x3F) + 0x80 + poke8 2 $ ((c `shiftR` 6) .&. 0x3F) + 0x80 + poke8 3 $ (c .&. 0x3F) + 0x80 + go (i + 2) (op `plusPtr` 4) + | otherwise -> do + poke8 0 $ (w `shiftR` 12) + 0xE0 + poke8 1 $ ((w `shiftR` 6) .&. 0x3F) + 0x80 + poke8 2 $ (w .&. 0x3F) + 0x80 + go (i + 1) (op `plusPtr` 3) + | otherwise = + outerLoop i (B.BufferRange op ope) + where + poke8 j v = poke (op `plusPtr` j) (fromIntegral v :: Word8) + +#else + encodeUtf8 (Text arr off len) = unsafeDupablePerformIO $ do let size0 = max len 4 mallocByteString size0 >>= start size0 off 0 @@ -344,6 +414,7 @@ encodeUtf8 (Text arr off len) = unsafeDupablePerformIO $ do poke8 (m+1) $ ((w `shiftR` 6) .&. 0x3F) + 0x80 poke8 (m+2) $ (w .&. 0x3F) + 0x80 go (n+1) (m+3) +#endif -- | Decode text from little endian UTF-16 encoding. decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text @@ -432,3 +503,36 @@ foreign import ccall unsafe "_hs_text_decode_utf8_state" c_decode_utf8_with_stat foreign import ccall unsafe "_hs_text_decode_latin1" c_decode_latin1 :: MutableByteArray# s -> Ptr Word8 -> Ptr Word8 -> IO () + + +{- +-- | Encode all elements of a 'F.Stream' using a 'B.BoundedEncoding'. +{-# INLINE encodeStreamWithB #-} +encodeStreamWithB :: B.BoundedEncoding a -> F.Stream a -> B.Builder +encodeStreamWithB be = + \(F.Stream next s0 _) -> B.builder $ step next s0 + where + bound = B.sizeBound be + step next s0 k (B.BufferRange op0 ope0) = + go s0 op0 + where + go s !op = case next s of + F.Done -> k (B.BufferRange op ope0) + F.Skip s' -> go s' op + F.Yield x s' + | op `plusPtr` bound <= ope0 -> B.runB be x op >>= go s' + | otherwise -> + return $ B.bufferFull bound op (step next s k) + + +-- | +-- | /Subject to fusion./ +-- Encode all 'Char's of a 'T.Text' using a 'B.BoundedEncoding'. +{-# INLINE encodeTextWithB #-} +encodeTextWithB :: B.BoundedEncoding Char -> Text -> B.Builder +encodeTextWithB be = encodeStreamWithB be . F.stream + +-- | Encode text using UTF-8 encoding. +encodeUtf8Builder :: Text -> B.Builder +encodeUtf8Builder = encodeUtf8Escaped (B.fromF B.word8) +-} diff --git a/Data/Text/Lazy/Encoding.hs b/Data/Text/Lazy/Encoding.hs index aaf50af14fd57e6987b7e2988ce2a9f907a2431e..d4f97e76f969c60495e0bd9202c07e4c2ee9a076 100644 --- a/Data/Text/Lazy/Encoding.hs +++ b/Data/Text/Lazy/Encoding.hs @@ -54,6 +54,11 @@ import qualified Data.ByteString as S import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy.Internal as B import qualified Data.ByteString.Unsafe as B +#if MIN_VERSION_bytestring(0,10,4) +import Data.Monoid (mempty, (<>)) +import qualified Data.ByteString.Builder as B +import qualified Data.ByteString.Builder.Prim as BP +#endif import qualified Data.Text.Encoding as TE import qualified Data.Text.Lazy as L import qualified Data.Text.Internal.Lazy.Encoding.Fusion as E @@ -135,8 +140,17 @@ decodeUtf8' bs = unsafeDupablePerformIO $ do {-# INLINE decodeUtf8' #-} encodeUtf8 :: Text -> B.ByteString +#if MIN_VERSION_bytestring(0,10,4) +encodeUtf8 = + B.toLazyByteString . go + where + go Empty = mempty + go (Chunk c cs) = + TE.encodeUtf8Escaped (BP.liftFixedToBounded BP.word8) c <> go cs +#else encodeUtf8 (Chunk c cs) = B.Chunk (TE.encodeUtf8 c) (encodeUtf8 cs) encodeUtf8 Empty = B.Empty +#endif -- | Decode text from little endian UTF-16 encoding. decodeUtf16LEWith :: OnDecodeError -> B.ByteString -> Text diff --git a/text.cabal b/text.cabal index 27ad3e10a1a575b09d7d88006017135a800b9720..62c8218e1544ebe81731db311439d2ffd8aa3d75 100644 --- a/text.cabal +++ b/text.cabal @@ -121,10 +121,14 @@ library build-depends: array >= 0.3, base >= 4.2 && < 5, - bytestring >= 0.9, deepseq >= 1.1.0.0, ghc-prim >= 0.2 + if impl(ghc >= 7.7) + build-depends: bytestring >= 0.10.4.0 + else + build-depends: bytestring >= 0.9 + cpp-options: -DHAVE_DEEPSEQ ghc-options: -Wall -fwarn-tabs -funbox-strict-fields -O2 if flag(developer)