diff --git a/.hgignore b/.hgignore index 4f96a314af80af422647add2909dd9709df76175..3550e5beb476c4a83a6570d16f0b4947488bd3f3 100644 --- a/.hgignore +++ b/.hgignore @@ -1,8 +1,12 @@ -^(?:dist|tests/benchmarks/dist|tests/tests/dist)$ -^tests/benchmarks/.*\.txt$ +^(?:dist|benchmarks/dist|tests/coverage|tests/dist)$ +^benchmarks/.*\.txt$ ^tests/text-testdata.tar.bz2$ ^tests/(?:\.hpc|bm|qc|qc-hpc|stdio-hpc|text/test)$ -\.(?:aux|eventlog|h[ip]|log|[oa]|orig|prof|ps|rej|swp)$ +\.(?:aux|eventlog|h[ip]|log|[oa]|orig|prof|ps|rej|swp|tix)$ ~$ + syntax: glob .\#* +cabal-dev +cabal.sandbox.config +\.cabal-sandbox diff --git a/.hgtags b/.hgtags index ba773dc170b71822804801a5c4d2e3e3d4907662..6790a54981b0f26a53ca5db0428cd15452bda380 100644 --- a/.hgtags +++ b/.hgtags @@ -36,3 +36,9 @@ b75d3041d275e8e76b26605b0d426a572ddf1249 0.11.1.3 407937739e9e764f1ae0f1f9ca454c42dca38772 0.11.1.10 8b981edd27befa4c2dd334fcb7db22ac67e22b67 0.11.1.11 204da16b5098531bdf858c388e2620238ef2aa5e 0.11.1.12 +6a3d847a56a69d0514a79cb212cb218271ad0917 0.11.1.13 +1d2c6fa9092c6a4000b2abdd9d01f3efcd477be5 0.11.2.0 +78219784cf3652cc662805bf2971bd62d80210a9 0.11.2.1 +4297307ebc11ad677cfba6b40319e7e5e2c0cfee 0.11.2.3 +7fa79662b66aade97fe49394977213fe6432942e 0.11.3.0 +d99cd091cdf71ce807a4255f6cc509c3154f51ea 0.11.3.1 diff --git a/Data/Text.hs b/Data/Text.hs index 805454a278cf2cc07055e13cedf971871ccd7776..f8d21ffbe537399d6675e422a4ac590d614fcf3e 100644 --- a/Data/Text.hs +++ b/Data/Text.hs @@ -1,9 +1,12 @@ -{-# LANGUAGE BangPatterns, CPP, Rank2Types, UnboxedTuples #-} +{-# LANGUAGE BangPatterns, CPP, MagicHash, Rank2Types, UnboxedTuples #-} {-# OPTIONS_GHC -fno-warn-orphans #-} +#if __GLASGOW_HASKELL__ >= 702 +{-# LANGUAGE Trustworthy #-} +#endif -- | -- Module : Data.Text --- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan, +-- Copyright : (c) 2009, 2010, 2011, 2012 Bryan O'Sullivan, -- (c) 2009 Duncan Coutts, -- (c) 2008, 2009 Tom Harper -- @@ -167,26 +170,29 @@ module Data.Text , partition -- , findSubstring - + -- * Indexing -- $index , index , findIndex , count - -- * Zipping and unzipping + -- * Zipping , zip , zipWith -- -* Ordered text -- , sort + + -- * Low level operations + , copy ) where import Prelude (Char, Bool(..), Int, Maybe(..), String, Eq(..), Ord(..), Ordering(..), (++), Read(..), Show(..), (&&), (||), (+), (-), (.), ($), ($!), (>>), (*), - div, maxBound, not, return, otherwise) + maxBound, not, return, otherwise, quot) #if defined(HAVE_DEEPSEQ) import Control.DeepSeq (NFData) #endif @@ -195,11 +201,7 @@ import Control.Exception (assert) #endif import Data.Char (isSpace) import Data.Data (Data(gfoldl, toConstr, gunfold, dataTypeOf)) -#if __GLASGOW_HASKELL__ >= 612 import Data.Data (mkNoRepType) -#else -import Data.Data (mkNorepType) -#endif import Control.Monad (foldM) import qualified Data.Text.Array as A import qualified Data.List as L @@ -222,6 +224,12 @@ import Data.ByteString (ByteString) import qualified Data.Text.Lazy as L import Data.Int (Int64) #endif +#if __GLASGOW_HASKELL__ >= 702 +import qualified GHC.CString as GHC +#else +import qualified GHC.Base as GHC +#endif +import GHC.Prim (Addr#) -- $strict -- @@ -324,7 +332,7 @@ instance NFData Text -- This instance preserves data abstraction at the cost of inefficiency. -- We omit reflection services for the sake of data abstraction. --- +-- -- This instance was created by copying the behavior of Data.Set and -- Data.Map. If you feel a mistake has been made, please feel free to -- submit improvements. @@ -338,11 +346,7 @@ instance Data Text where gfoldl f z txt = z pack `f` (unpack txt) toConstr _ = P.error "Data.Text.Text.toConstr" gunfold _ _ = P.error "Data.Text.Text.gunfold" -#if __GLASGOW_HASKELL__ >= 612 dataTypeOf _ = mkNoRepType "Data.Text.Text" -#else - dataTypeOf _ = mkNorepType "Data.Text.Text" -#endif -- | /O(n)/ Compare two 'Text' values lexicographically. compareText :: Text -> Text -> Ordering @@ -364,7 +368,7 @@ compareText ta@(Text _arrA _offA lenA) tb@(Text _arrB _offB lenB) -- | /O(n)/ Convert a 'String' into a 'Text'. Subject to -- fusion. Performs replacement on invalid scalar values. pack :: String -> Text -pack = unstream . S.streamList . L.map safe +pack = unstream . S.map safe . S.streamList {-# INLINE [1] pack #-} -- | /O(n)/ Convert a Text into a String. Subject to fusion. @@ -372,6 +376,19 @@ unpack :: Text -> String unpack = S.unstreamList . stream {-# INLINE [1] unpack #-} +-- | /O(n)/ Convert a literal string into a Text. Subject to fusion. +unpackCString# :: Addr# -> Text +unpackCString# addr# = unstream (S.streamCString# addr#) +{-# NOINLINE unpackCString# #-} + +{-# RULES "TEXT literal" forall a. + unstream (S.map safe (S.streamList (GHC.unpackCString# a))) + = unpackCString# a #-} + +{-# RULES "TEXT literal UTF8" forall a. + unstream (S.map safe (S.streamList (GHC.unpackCStringUtf8# a))) + = unpackCString# a #-} + -- | /O(1)/ Convert a character into a Text. Subject to fusion. -- Performs replacement on invalid scalar values. singleton :: Char -> Text @@ -712,7 +729,7 @@ center k c t | otherwise = replicateChar l c `append` t `append` replicateChar r c where len = length t d = k - len - r = d `div` 2 + r = d `quot` 2 l = d - r {-# INLINE center #-} @@ -884,11 +901,11 @@ mapAccumR f z0 = second reverse . S.mapAccumL g z0 . reverseStream -- @t@ repeated @n@ times. replicate :: Int -> Text -> Text replicate n t@(Text a o l) - | n <= 0 || l <= 0 = empty - | n == 1 = t - | isSingleton t = replicateChar n (unsafeHead t) - | n <= maxBound `div` l = Text (A.run x) 0 len - | otherwise = overflowError "replicate" + | n <= 0 || l <= 0 = empty + | n == 1 = t + | isSingleton t = replicateChar n (unsafeHead t) + | n <= maxBound `quot` l = Text (A.run x) 0 len + | otherwise = overflowError "replicate" where len = l * n x = do @@ -1110,7 +1127,7 @@ findAIndexOrEnd q t@(Text _arr _off len) = go 0 where go !i | i >= len || q c = i | otherwise = go (i+d) where Iter c d = iter t i - + -- | /O(n)/ Group characters in a string by equality. group :: Text -> [Text] group = groupBy (==) @@ -1143,7 +1160,7 @@ tails t | null t = [empty] -- > splitOn "\r\n" "a\r\nb\r\nd\r\ne" == ["a","b","d","e"] -- > splitOn "aaa" "aaaXaaaXaaaXaaa" == ["","X","X","X",""] -- > splitOn "x" "x" == ["",""] --- +-- -- and -- -- > intercalate s . splitOn s == id @@ -1439,8 +1456,6 @@ isPrefixOf a@(Text _ _ alen) b@(Text _ _ blen) = {-# RULES "TEXT isPrefixOf -> fused" [~1] forall s t. isPrefixOf s t = S.isPrefixOf (stream s) (stream t) -"TEXT isPrefixOf -> unfused" [1] forall s t. - S.isPrefixOf (stream s) (stream t) = isPrefixOf s t #-} -- | /O(n)/ The 'isSuffixOf' function takes two 'Text's and returns @@ -1557,3 +1572,19 @@ emptyError fun = P.error $ "Data.Text." ++ fun ++ ": empty input" overflowError :: String -> a overflowError fun = P.error $ "Data.Text." ++ fun ++ ": size overflow" + +-- | /O(n)/ Make a distinct copy of the given string, sharing no +-- storage with the original string. +-- +-- As an example, suppose you read a large string, of which you need +-- only a small portion. If you do not use 'copy', the entire original +-- array will be kept alive in memory by the smaller string. Making a +-- copy \"breaks the link\" to the original array, allowing it to be +-- garbage collected if there are no other live references to it. +copy :: Text -> Text +copy (Text arr off len) = Text (A.run go) 0 len + where + go = do + marr <- A.new len + A.copyI marr 0 arr off len + return marr diff --git a/Data/Text/Array.hs b/Data/Text/Array.hs index 046eb9dda10d281e36ee1a6ca42f9e2f079c59a7..e2c4274fcd7a3baa9cdd7966719f0d7a08fc8c0b 100644 --- a/Data/Text/Array.hs +++ b/Data/Text/Array.hs @@ -114,7 +114,7 @@ instance IArray (MArray s) where -- | Create an uninitialized mutable array. new :: forall s. Int -> ST s (MArray s) new n - | n < 0 || n .&. highBit /= 0 = error $ "Data.Text.Array.new: size overflow" + | n < 0 || n .&. highBit /= 0 = array_size_error | otherwise = ST $ \s1# -> case newByteArray# len# s1# of (# s2#, marr# #) -> (# s2#, MArray marr# @@ -126,6 +126,9 @@ new n highBit = maxBound `xor` (maxBound `shiftR` 1) {-# INLINE new #-} +array_size_error :: a +array_size_error = error "Data.Text.Array.new: size overflow" + -- | Freeze a mutable array. Do not mutate the 'MArray' afterwards! unsafeFreeze :: MArray s -> ST s Array unsafeFreeze MArray{..} = ST $ \s# -> @@ -181,6 +184,7 @@ run2 k = runST (do (marr,b) <- k arr <- unsafeFreeze marr return (arr,b)) +{-# INLINE run2 #-} -- | Copy some elements of a mutable array. copyM :: MArray s -- ^ Destination diff --git a/Data/Text/Encoding.hs b/Data/Text/Encoding.hs index 256b68724eb5bd3a8c629f5dffbbef54bd2ace92..4d7fc4cd427c2c5e83d63b5259fe491d0906a875 100644 --- a/Data/Text/Encoding.hs +++ b/Data/Text/Encoding.hs @@ -1,5 +1,8 @@ -{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, MagicHash, +{-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes #-} +#if __GLASGOW_HASKELL__ >= 702 +{-# LANGUAGE Trustworthy #-} +#endif -- | -- Module : Data.Text.Encoding -- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan, @@ -23,6 +26,7 @@ module Data.Text.Encoding -- * Decoding ByteStrings to Text -- $strict decodeASCII + , decodeLatin1 , decodeUtf8 , decodeUtf16LE , decodeUtf16BE @@ -39,17 +43,23 @@ module Data.Text.Encoding , decodeUtf32LEWith , decodeUtf32BEWith + -- ** Stream oriented decoding + -- $stream + , streamDecodeUtf8 + , streamDecodeUtf8With + , Decoding(..) + -- * Encoding Text to ByteStrings , encodeUtf8 , encodeUtf16LE , encodeUtf16BE , encodeUtf32LE , encodeUtf32BE - + -- * Generic encoding of Text - , encodeStreamWithB - , encodeTextWithB - , encodeUtf8Builder + -- , encodeStreamWithB + -- , encodeTextWithB + -- , encodeUtf8Builder , encodeUtf8Escaped ) where @@ -59,28 +69,35 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO) #else import Control.Monad.ST (unsafeIOToST, unsafeSTToIO) #endif +import Control.Monad.ST (runST) import Data.Bits ((.&.)) import Data.ByteString as B import Data.ByteString.Internal as B -import Data.ByteString.Lazy.Builder.Internal as B -import qualified Data.ByteString.Lazy.Builder.BasicEncoding.Internal as B -import qualified Data.ByteString.Lazy.Builder.BasicEncoding as B +import qualified Data.ByteString.Lazy as BL +import qualified Data.ByteString.Builder as B +import qualified Data.ByteString.Builder.Internal as B +import qualified Data.ByteString.Builder.Prim.Internal as BP +import qualified Data.ByteString.Builder.Prim as BP +import Data.Text () import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode) -import Data.Text.Internal (Text(..), textP) +import Data.Text.Internal (Text(..), safe, textP) +import Data.Text.Private (runText) import Data.Text.UnsafeChar (ord, unsafeWrite) import Data.Text.UnsafeShift (shiftL, shiftR) -import Data.Word (Word8) +import Data.Word (Word8, Word32) import Foreign.C.Types (CSize) import Foreign.ForeignPtr (withForeignPtr) import Foreign.Marshal.Utils (with) -import Foreign.Ptr (Ptr, minusPtr, plusPtr) -import Foreign.Storable (peek, poke) +import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr) +import Foreign.Storable (Storable, peek, poke) import GHC.Base (MutableByteArray#) -import System.IO.Unsafe (unsafePerformIO) import qualified Data.Text.Array as A import qualified Data.Text.Encoding.Fusion as E import qualified Data.Text.Encoding.Utf16 as U16 import qualified Data.Text.Fusion as F +import Data.Text.Unsafe (unsafeDupablePerformIO) + +#include "text_cbits.h" -- $strict -- @@ -98,39 +115,171 @@ import qualified Data.Text.Fusion as F -- | /Deprecated/. Decode a 'ByteString' containing 7-bit ASCII -- encoded text. -- --- This function is deprecated. Use 'decodeUtf8' instead. +-- This function is deprecated. Use 'decodeLatin1' instead. decodeASCII :: ByteString -> Text decodeASCII = decodeUtf8 {-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-} +-- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text. +-- +-- 'decodeLatin1' is semantically equivalent to +-- @Data.Text.pack . Data.ByteString.Char8.unpack@ +decodeLatin1 :: ByteString -> Text +decodeLatin1 (PS fp off len) = textP a 0 len + where + a = A.run (A.new len >>= unsafeIOToST . go) + go dest = withForeignPtr fp $ \ptr -> do + c_decode_latin1 (A.maBA dest) (ptr `plusPtr` off) (ptr `plusPtr` (off+len)) + return dest + -- | Decode a 'ByteString' containing UTF-8 encoded text. decodeUtf8With :: OnDecodeError -> ByteString -> Text -decodeUtf8With onErr (PS fp off len) = textP (fst a) 0 (snd a) +decodeUtf8With onErr (PS fp off len) = runText $ \done -> do + let go dest = withForeignPtr fp $ \ptr -> + with (0::CSize) $ \destOffPtr -> do + let end = ptr `plusPtr` (off + len) + loop curPtr = do + curPtr' <- c_decode_utf8 (A.maBA dest) destOffPtr curPtr end + if curPtr' == end + then do + n <- peek destOffPtr + unsafeSTToIO (done dest (fromIntegral n)) + else do + x <- peek curPtr' + case onErr desc (Just x) of + Nothing -> loop $ curPtr' `plusPtr` 1 + Just c -> do + destOff <- peek destOffPtr + w <- unsafeSTToIO $ + unsafeWrite dest (fromIntegral destOff) (safe c) + poke destOffPtr (destOff + fromIntegral w) + loop $ curPtr' `plusPtr` 1 + loop (ptr `plusPtr` off) + (unsafeIOToST . go) =<< A.new len where - a = A.run2 (A.new len >>= unsafeIOToST . go) desc = "Data.Text.Encoding.decodeUtf8: Invalid UTF-8 stream" - go dest = withForeignPtr fp $ \ptr -> - with (0::CSize) $ \destOffPtr -> do - let end = ptr `plusPtr` (off + len) - loop curPtr = do - curPtr' <- c_decode_utf8 (A.maBA dest) destOffPtr curPtr end - if curPtr' == end - then do - n <- peek destOffPtr - return (dest,fromIntegral n) - else do - x <- peek curPtr' - case onErr desc (Just x) of - Nothing -> loop $ curPtr' `plusPtr` 1 - Just c -> do - destOff <- peek destOffPtr - w <- unsafeSTToIO $ - unsafeWrite dest (fromIntegral destOff) c - poke destOffPtr (destOff + fromIntegral w) - loop $ curPtr' `plusPtr` 1 - loop (ptr `plusPtr` off) {- INLINE[0] decodeUtf8With #-} +-- $stream +-- +-- The 'streamDecodeUtf8' and 'streamDecodeUtf8With' functions accept +-- a 'ByteString' that represents a possibly incomplete input (e.g. a +-- packet from a network stream) that may not end on a UTF-8 boundary. +-- +-- The first element of the result is the maximal chunk of 'Text' that +-- can be decoded from the given input. The second is a function which +-- accepts another 'ByteString'. That string will be assumed to +-- directly follow the string that was passed as input to the original +-- function, and it will in turn be decoded. +-- +-- To help understand the use of these functions, consider the Unicode +-- string @\"hi ☃\"@. If encoded as UTF-8, this becomes @\"hi +-- \\xe2\\x98\\x83\"@; the final @\'☃\'@ is encoded as 3 bytes. +-- +-- Now suppose that we receive this encoded string as 3 packets that +-- are split up on untidy boundaries: @[\"hi \\xe2\", \"\\x98\", +-- \"\\x83\"]@. We cannot decode the entire Unicode string until we +-- have received all three packets, but we would like to make progress +-- as we receive each one. +-- +-- @ +-- let 'Some' t0 f0 = 'streamDecodeUtf8' \"hi \\xe2\" +-- t0 == \"hi \" :: 'Text' +-- @ +-- +-- We use the continuation @f0@ to decode our second packet. +-- +-- @ +-- let 'Some' t1 f1 = f0 \"\\x98\" +-- t1 == \"\" +-- @ +-- +-- We could not give @f0@ enough input to decode anything, so it +-- returned an empty string. Once we feed our second continuation @f1@ +-- the last byte of input, it will make progress. +-- +-- @ +-- let 'Some' t2 f2 = f1 \"\\x83\" +-- t2 == \"☃\" +-- @ +-- +-- If given invalid input, an exception will be thrown by the function +-- or continuation where it is encountered. + +-- | A stream oriented decoding result. +data Decoding = Some Text ByteString (ByteString -> Decoding) + +instance Show Decoding where + showsPrec d (Some t bs _) = showParen (d > prec) $ + showString "Some " . showsPrec prec' t . + showChar ' ' . showsPrec prec' bs . + showString " _" + where prec = 10; prec' = prec + 1 + +newtype CodePoint = CodePoint Word32 deriving (Eq, Show, Num, Storable) +newtype DecoderState = DecoderState Word32 deriving (Eq, Show, Num, Storable) + +-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8 +-- encoded text that is known to be valid. +-- +-- If the input contains any invalid UTF-8 data, an exception will be +-- thrown (either by this function or a continuation) that cannot be +-- caught in pure code. For more control over the handling of invalid +-- data, use 'streamDecodeUtf8With'. +streamDecodeUtf8 :: ByteString -> Decoding +streamDecodeUtf8 = streamDecodeUtf8With strictDecode + +-- | Decode, in a stream oriented way, a 'ByteString' containing UTF-8 +-- encoded text. +streamDecodeUtf8With :: OnDecodeError -> ByteString -> Decoding +streamDecodeUtf8With onErr = decodeChunk 0 0 + where + -- We create a slightly larger than necessary buffer to accommodate a + -- potential surrogate pair started in the last buffer + decodeChunk :: CodePoint -> DecoderState -> ByteString -> Decoding + decodeChunk codepoint0 state0 bs@(PS fp off len) = + runST $ (unsafeIOToST . decodeChunkToBuffer) =<< A.new (len+1) + where + decodeChunkToBuffer :: A.MArray s -> IO Decoding + decodeChunkToBuffer dest = withForeignPtr fp $ \ptr -> + with (0::CSize) $ \destOffPtr -> + with codepoint0 $ \codepointPtr -> + with state0 $ \statePtr -> + with nullPtr $ \curPtrPtr -> + let end = ptr `plusPtr` (off + len) + loop curPtr = do + poke curPtrPtr curPtr + curPtr' <- c_decode_utf8_with_state (A.maBA dest) destOffPtr + curPtrPtr end codepointPtr statePtr + state <- peek statePtr + case state of + UTF8_REJECT -> do + -- We encountered an encoding error + x <- peek curPtr' + case onErr desc (Just x) of + Nothing -> loop $ curPtr' `plusPtr` 1 + Just c -> do + destOff <- peek destOffPtr + w <- unsafeSTToIO $ + unsafeWrite dest (fromIntegral destOff) (safe c) + poke destOffPtr (destOff + fromIntegral w) + poke statePtr 0 + loop $ curPtr' `plusPtr` 1 + + _ -> do + -- We encountered the end of the buffer while decoding + n <- peek destOffPtr + codepoint <- peek codepointPtr + chunkText <- unsafeSTToIO $ do + arr <- A.unsafeFreeze dest + return $! textP arr 0 (fromIntegral n) + lastPtr <- peek curPtrPtr + let left = lastPtr `minusPtr` curPtr + return $ Some chunkText (B.drop left bs) + (decodeChunk codepoint state) + in loop (ptr `plusPtr` off) + desc = "Data.Text.Encoding.streamDecodeUtf8With: Invalid UTF-8 stream" + -- | Decode a 'ByteString' containing UTF-8 encoded text that is known -- to be valid. -- @@ -144,17 +293,22 @@ decodeUtf8 = decodeUtf8With strictDecode {-# RULES "STREAM stream/decodeUtf8 fusion" [1] forall bs. F.stream (decodeUtf8 bs) = E.streamUtf8 strictDecode bs #-} --- | Decode a 'ByteString' containing UTF-8 encoded text.. +-- | Decode a 'ByteString' containing UTF-8 encoded text. -- -- If the input contains any invalid UTF-8 data, the relevant -- exception will be returned, otherwise the decoded text. decodeUtf8' :: ByteString -> Either UnicodeException Text -decodeUtf8' = unsafePerformIO . try . evaluate . decodeUtf8With strictDecode +decodeUtf8' = unsafeDupablePerformIO . try . evaluate . decodeUtf8With strictDecode {-# INLINE decodeUtf8' #-} -- | Encode text using UTF-8 encoding. encodeUtf8 :: Text -> ByteString -encodeUtf8 (Text arr off len) = unsafePerformIO $ do +encodeUtf8 = + BL.toStrict . B.toLazyByteString + . encodeUtf8Escaped (BP.liftFixedToBounded BP.word8) + +{- +encodeUtf8 (Text arr off len) = unsafeDupablePerformIO $ do let size0 = max len 4 mallocByteString size0 >>= start size0 off 0 where @@ -207,6 +361,55 @@ encodeUtf8 (Text arr off len) = unsafePerformIO $ do poke8 (m+1) $ ((w `shiftR` 6) .&. 0x3F) + 0x80 poke8 (m+2) $ (w .&. 0x3F) + 0x80 go (n+1) (m+3) +-} + +-- | Encode text using UTF-8 encoding and escape the ASCII characters using +-- a 'BP.PrimBounded'. +encodeUtf8Escaped :: BP.BoundedPrim Word8 -> Text -> B.Builder +encodeUtf8Escaped be (Text arr off len) = + B.builder step + where + bound = max 4 $ BP.sizeBound be + iend = off + len + step !k = + outerLoop off + where + outerLoop !i0 !br@(B.BufferRange op0 ope) + | i0 >= iend = k br + | op0 `plusPtr` bound < ope = + goPartial (i0 + min outRemaining inpRemaining) + | otherwise = return $ B.bufferFull bound op0 (outerLoop i0) + where + outRemaining = (ope `minusPtr` op0) `div` bound + inpRemaining = iend - i0 + + goPartial !iendTmp = go i0 op0 + where + go !i !op + | i < iendTmp = case A.unsafeIndex arr i of + w | w <= 0x7F -> do + BP.runB be (fromIntegral w) op >>= go (i + 1) + | w <= 0x7FF -> do + poke8 0 $ (w `shiftR` 6) + 0xC0 + poke8 1 $ (w .&. 0x3f) + 0x80 + go (i + 1) (op `plusPtr` 2) + | 0xD800 <= w && w <= 0xDBFF -> do + let c = ord $ U16.chr2 w (A.unsafeIndex arr (i+1)) + poke8 0 $ (c `shiftR` 18) + 0xF0 + poke8 1 $ ((c `shiftR` 12) .&. 0x3F) + 0x80 + poke8 2 $ ((c `shiftR` 6) .&. 0x3F) + 0x80 + poke8 3 $ (c .&. 0x3F) + 0x80 + go (i + 2) (op `plusPtr` 4) + | otherwise -> do + poke8 0 $ (w `shiftR` 12) + 0xE0 + poke8 1 $ ((w `shiftR` 6) .&. 0x3F) + 0x80 + poke8 2 $ (w .&. 0x3F) + 0x80 + go (i + 1) (op `plusPtr` 3) + | otherwise = + outerLoop i (B.BufferRange op ope) + where + poke8 j v = poke (op `plusPtr` j) (fromIntegral v :: Word8) + -- | Decode text from little endian UTF-16 encoding. decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text @@ -288,14 +491,24 @@ foreign import ccall unsafe "_hs_text_decode_utf8" c_decode_utf8 :: MutableByteArray# s -> Ptr CSize -> Ptr Word8 -> Ptr Word8 -> IO (Ptr Word8) +foreign import ccall unsafe "_hs_text_decode_utf8_state" c_decode_utf8_with_state + :: MutableByteArray# s -> Ptr CSize + -> Ptr (Ptr Word8) -> Ptr Word8 + -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8) + +foreign import ccall unsafe "_hs_text_decode_latin1" c_decode_latin1 + :: MutableByteArray# s -> Ptr Word8 -> Ptr Word8 -> IO () + + +{- -- | Encode all elements of a 'F.Stream' using a 'B.BoundedEncoding'. {-# INLINE encodeStreamWithB #-} encodeStreamWithB :: B.BoundedEncoding a -> F.Stream a -> B.Builder -encodeStreamWithB be = +encodeStreamWithB be = \(F.Stream next s0 _) -> B.builder $ step next s0 where bound = B.sizeBound be - step next s0 k (B.BufferRange op0 ope0) = + step next s0 k (B.BufferRange op0 ope0) = go s0 op0 where go s !op = case next s of @@ -303,11 +516,11 @@ encodeStreamWithB be = F.Skip s' -> go s' op F.Yield x s' | op `plusPtr` bound <= ope0 -> B.runB be x op >>= go s' - | otherwise -> + | otherwise -> return $ B.bufferFull bound op (step next s k) --- | +-- | -- | /Subject to fusion./ -- Encode all 'Char's of a 'T.Text' using a 'B.BoundedEncoding'. {-# INLINE encodeTextWithB #-} @@ -317,51 +530,6 @@ encodeTextWithB be = encodeStreamWithB be . F.stream -- | Encode text using UTF-8 encoding. encodeUtf8Builder :: Text -> B.Builder encodeUtf8Builder = encodeUtf8Escaped (B.fromF B.word8) +-} --- | Encode text using UTF-8 encoding and escape the ASCII characters using --- a 'BoundedEncoding'. -encodeUtf8Escaped :: B.BoundedEncoding Word8 -> Text -> B.Builder -encodeUtf8Escaped be (Text arr off len) = - B.builder step - where - bound = max 4 $ B.sizeBound be - iend = off + len - step !k = - outerLoop off - where - outerLoop !i0 !br@(BufferRange op0 ope) - | i0 >= iend = k br - | op0 `plusPtr` bound < ope = - goPartial (i0 + min outRemaining inpRemaining) - | otherwise = return $ bufferFull bound op0 (outerLoop i0) - where - outRemaining = (ope `minusPtr` op0) `div` bound - inpRemaining = iend - i0 - - goPartial !iendTmp = go i0 op0 - where - go !i !op - | i < iendTmp = case A.unsafeIndex arr i of - w | w <= 0x7F -> do - B.runB be (fromIntegral w) op >>= go (i + 1) - | w <= 0x7FF -> do - poke8 0 $ (w `shiftR` 6) + 0xC0 - poke8 1 $ (w .&. 0x3f) + 0x80 - go (i + 1) (op `plusPtr` 2) - | 0xD800 <= w && w <= 0xDBFF -> do - let c = ord $ U16.chr2 w (A.unsafeIndex arr (i+1)) - poke8 0 $ (c `shiftR` 18) + 0xF0 - poke8 1 $ ((c `shiftR` 12) .&. 0x3F) + 0x80 - poke8 2 $ ((c `shiftR` 6) .&. 0x3F) + 0x80 - poke8 3 $ (c .&. 0x3F) + 0x80 - go (i + 2) (op `plusPtr` 4) - | otherwise -> do - poke8 0 $ (w `shiftR` 12) + 0xE0 - poke8 1 $ ((w `shiftR` 6) .&. 0x3F) + 0x80 - poke8 2 $ (w .&. 0x3F) + 0x80 - go (i + 1) (op `plusPtr` 3) - | otherwise = - outerLoop i (BufferRange op ope) - where - poke8 j v = poke (op `plusPtr` j) (fromIntegral v :: Word8) diff --git a/Data/Text/Encoding/Error.hs b/Data/Text/Encoding/Error.hs index 732fa7518f4660d02190e8b777f32c917ea3ed5f..d38d9a107c6b1ec58aca282e08770b6c31732862 100644 --- a/Data/Text/Encoding/Error.hs +++ b/Data/Text/Encoding/Error.hs @@ -1,4 +1,7 @@ {-# LANGUAGE CPP, DeriveDataTypeable #-} +#if __GLASGOW_HASKELL__ >= 702 +{-# LANGUAGE Trustworthy #-} +#endif -- | -- Module : Data.Text.Encoding.Error -- Copyright : (c) Bryan O'Sullivan 2009 @@ -35,11 +38,8 @@ module Data.Text.Encoding.Error , replace ) where -#if __GLASGOW_HASKELL__ >= 610 +import Control.DeepSeq (NFData (..)) import Control.Exception (Exception, throw) -#else -import Control.Exception.Extensible (Exception, throw) -#endif import Data.Typeable (Typeable) import Data.Word (Word8) import Numeric (showHex) @@ -88,12 +88,16 @@ showUnicodeException (EncodeError desc (Just c)) = "Cannot encode character '\\x" ++ showHex (fromEnum c) ("': " ++ desc) showUnicodeException (EncodeError desc Nothing) = "Cannot encode input: " ++ desc - + instance Show UnicodeException where show = showUnicodeException instance Exception UnicodeException +instance NFData UnicodeException where + rnf (DecodeError desc w) = rnf desc `seq` rnf w `seq` () + rnf (EncodeError desc c) = rnf desc `seq` rnf c `seq` () + -- | Throw a 'UnicodeException' if decoding fails. strictDecode :: OnDecodeError strictDecode desc c = throw (DecodeError desc c) diff --git a/Data/Text/Encoding/Fusion.hs b/Data/Text/Encoding/Fusion.hs index ee580507b3dd7cb282a6ed06b75f35cbd8a749bc..b9e5862dc7a0d52fa8c83f731f87706fd283cb21 100644 --- a/Data/Text/Encoding/Fusion.hs +++ b/Data/Text/Encoding/Fusion.hs @@ -44,12 +44,12 @@ import Data.Text.UnsafeShift (shiftL, shiftR) import Data.Word (Word8, Word16, Word32) import Foreign.ForeignPtr (withForeignPtr, ForeignPtr) import Foreign.Storable (pokeByteOff) -import System.IO.Unsafe (unsafePerformIO) import qualified Data.ByteString as B import qualified Data.ByteString.Unsafe as B import qualified Data.Text.Encoding.Utf8 as U8 import qualified Data.Text.Encoding.Utf16 as U16 import qualified Data.Text.Encoding.Utf32 as U32 +import Data.Text.Unsafe (unsafeDupablePerformIO) streamASCII :: ByteString -> Stream Char streamASCII bs = Stream next 0 (maxSize l) @@ -163,7 +163,7 @@ streamUtf32LE onErr bs = Stream next 0 (maxSize (l `shiftR` 2)) -- | /O(n)/ Convert a 'Stream' 'Word8' to a 'ByteString'. unstream :: Stream Word8 -> ByteString -unstream (Stream next s0 len) = unsafePerformIO $ do +unstream (Stream next s0 len) = unsafeDupablePerformIO $ do let mlen = upperBound 4 len mallocByteString mlen >>= loop mlen 0 s0 where diff --git a/Data/Text/Encoding/Fusion/Common.hs b/Data/Text/Encoding/Fusion/Common.hs index 06807a7fbbef2f33b84d3ae5ecd31f1b3d211b58..30e4b07ca5ff70972f32bc52b7ff1e8f1949f64d 100644 --- a/Data/Text/Encoding/Fusion/Common.hs +++ b/Data/Text/Encoding/Fusion/Common.hs @@ -21,8 +21,7 @@ module Data.Text.Encoding.Fusion.Common -- * Restreaming -- Restreaming is the act of converting from one 'Stream' -- representation to another. - restreamUtf8 - , restreamUtf16LE + restreamUtf16LE , restreamUtf16BE , restreamUtf32LE , restreamUtf32BE @@ -34,31 +33,6 @@ import Data.Text.Fusion.Internal (RS(..)) import Data.Text.UnsafeChar (ord) import Data.Text.UnsafeShift (shiftR) import Data.Word (Word8) -import qualified Data.Text.Encoding.Utf8 as U8 - --- | /O(n)/ Convert a Stream Char into a UTF-8 encoded Stream Word8. -restreamUtf8 :: Stream Char -> Stream Word8 -restreamUtf8 (Stream next0 s0 len) = Stream next (RS0 s0) (len * 2) - where - next (RS0 s) = case next0 s of - Done -> Done - Skip s' -> Skip (RS0 s') - Yield x s' - | n <= 0x7F -> Yield c (RS0 s') - | n <= 0x07FF -> Yield a2 (RS1 s' b2) - | n <= 0xFFFF -> Yield a3 (RS2 s' b3 c3) - | otherwise -> Yield a4 (RS3 s' b4 c4 d4) - where - n = ord x - c = fromIntegral n - (a2,b2) = U8.ord2 x - (a3,b3,c3) = U8.ord3 x - (a4,b4,c4,d4) = U8.ord4 x - next (RS1 s x2) = Yield x2 (RS0 s) - next (RS2 s x2 x3) = Yield x2 (RS1 s x3) - next (RS3 s x2 x3 x4) = Yield x2 (RS2 s x3 x4) - {-# INLINE next #-} -{-# INLINE restreamUtf8 #-} restreamUtf16BE :: Stream Char -> Stream Word8 restreamUtf16BE (Stream next0 s0 len) = Stream next (RS0 s0) (len * 2) diff --git a/Data/Text/Encoding/Utf8.hs b/Data/Text/Encoding/Utf8.hs index ba0d73664e12643841b4322fbfff390823ab0417..6a1bd53d0705b024e3a766dfac2933bbadd7ca9e 100644 --- a/Data/Text/Encoding/Utf8.hs +++ b/Data/Text/Encoding/Utf8.hs @@ -146,7 +146,7 @@ validate3 x1 x2 x3 = validate3_1 || validate3_2 || validate3_3 || validate3_4 validate4 :: Word8 -> Word8 -> Word8 -> Word8 -> Bool {-# INLINE validate4 #-} validate4 x1 x2 x3 x4 = validate4_1 || validate4_2 || validate4_3 - where + where validate4_1 = x1 == 0xF0 && between x2 0x90 0xBF && between x3 0x80 0xBF && diff --git a/Data/Text/Foreign.hs b/Data/Text/Foreign.hs index 58760e0d3d02ef6a93e676ab3630e42be05ca43d..9f006bb6aaa7eaeb5cc676b18fcc0c92307d5783 100644 --- a/Data/Text/Foreign.hs +++ b/Data/Text/Foreign.hs @@ -21,6 +21,9 @@ module Data.Text.Foreign , fromPtr , useAsPtr , asForeignPtr + -- ** Encoding as UTF-8 + , peekCStringLen + , withCStringLen -- * Unsafe conversion code , lengthWord16 , unsafeCopyToPtr @@ -38,14 +41,17 @@ import Control.Monad.ST.Unsafe (unsafeIOToST) #else import Control.Monad.ST (unsafeIOToST) #endif +import Data.ByteString.Unsafe (unsafePackCStringLen, unsafeUseAsCStringLen) +import Data.Text.Encoding (decodeUtf8, encodeUtf8) import Data.Text.Internal (Text(..), empty) import Data.Text.Unsafe (lengthWord16) -import qualified Data.Text.Array as A import Data.Word (Word16) +import Foreign.C.String (CStringLen) +import Foreign.ForeignPtr (ForeignPtr, mallocForeignPtrArray, withForeignPtr) import Foreign.Marshal.Alloc (allocaBytes) import Foreign.Ptr (Ptr, castPtr, plusPtr) -import Foreign.ForeignPtr (ForeignPtr, mallocForeignPtrArray, withForeignPtr) import Foreign.Storable (peek, poke) +import qualified Data.Text.Array as A -- $interop -- @@ -148,3 +154,21 @@ asForeignPtr t@(Text _arr _off len) = do fp <- mallocForeignPtrArray len withForeignPtr fp $ unsafeCopyToPtr t return (fp, I16 len) + +-- | /O(n)/ Decode a C string with explicit length, which is assumed +-- to have been encoded as UTF-8. If decoding fails, a +-- 'UnicodeException' is thrown. +peekCStringLen :: CStringLen -> IO Text +peekCStringLen cs = do + bs <- unsafePackCStringLen cs + return $! decodeUtf8 bs + +-- | Marshal a 'Text' into a C string encoded as UTF-8 in temporary +-- storage, with explicit length information. The encoded string may +-- contain NUL bytes, and is not followed by a trailing NUL byte. +-- +-- The temporary storage is freed when the subcomputation terminates +-- (either normally or via an exception), so the pointer to the +-- temporary storage must /not/ be used after this function returns. +withCStringLen :: Text -> (CStringLen -> IO a) -> IO a +withCStringLen t act = unsafeUseAsCStringLen (encodeUtf8 t) act diff --git a/Data/Text/Fusion.hs b/Data/Text/Fusion.hs index 6e1fe253189d0f17a87b29ea000d33846f2515c1..275530f4cb472f8b1d4baca181c2dc8676177aca 100644 --- a/Data/Text/Fusion.hs +++ b/Data/Text/Fusion.hs @@ -51,6 +51,7 @@ import Prelude (Bool(..), Char, Maybe(..), Monad(..), Int, fromIntegral, otherwise) import Data.Bits ((.&.)) import Data.Text.Internal (Text(..)) +import Data.Text.Private (runText) import Data.Text.UnsafeChar (ord, unsafeChr, unsafeWrite) import Data.Text.UnsafeShift (shiftL, shiftR) import qualified Data.Text.Array as A @@ -59,7 +60,6 @@ import Data.Text.Fusion.Internal import Data.Text.Fusion.Size import qualified Data.Text.Internal as I import qualified Data.Text.Encoding.Utf16 as U16 -import qualified Prelude as P default(Int) @@ -94,15 +94,14 @@ reverseStream (Text arr off len) = Stream next (off+len-1) (maxSize len) -- | /O(n)/ Convert a 'Stream Char' into a 'Text'. unstream :: Stream Char -> Text -unstream (Stream next0 s0 len) = I.textP (P.fst a) 0 (P.snd a) - where - a = A.run2 (A.new mlen >>= \arr -> outer arr mlen s0 0) - where mlen = upperBound 4 len - outer arr top = loop - where +unstream (Stream next0 s0 len) = runText $ \done -> do + let mlen = upperBound 4 len + arr0 <- A.new mlen + let outer arr top = loop + where loop !s !i = case next0 s of - Done -> return (arr, i) + Done -> done arr i Skip s' -> loop s' i Yield x s' | j >= top -> {-# SCC "unstream/resize" #-} do @@ -114,6 +113,7 @@ unstream (Stream next0 s0 len) = I.textP (P.fst a) 0 (P.snd a) loop s' (i+d) where j | ord x < 0x10000 = i | otherwise = i + 1 + outer arr0 mlen s0 0 {-# INLINE [0] unstream #-} {-# RULES "STREAM stream/unstream fusion" forall s. stream (unstream s) = s #-} @@ -223,9 +223,9 @@ mapAccumL f z0 (Stream next0 s0 len) = (nz,I.textP na 0 nl) arr' <- A.new top' A.copyM arr' 0 arr 0 top outer arr' top' z s i - | otherwise -> do let (z',c) = f z x - d <- unsafeWrite arr i c + | otherwise -> do d <- unsafeWrite arr i c loop z' s' (i+d) - where j | ord x < 0x10000 = i + where (z',c) = f z x + j | ord c < 0x10000 = i | otherwise = i + 1 {-# INLINE [0] mapAccumL #-} diff --git a/Data/Text/Fusion/Common.hs b/Data/Text/Fusion/Common.hs index 4ce008291c1fe4d9b7ffa4b7afa3f2ee5347d921..ade520783257111aa45e16783ba4c8fcc7feb866 100644 --- a/Data/Text/Fusion/Common.hs +++ b/Data/Text/Fusion/Common.hs @@ -1,7 +1,7 @@ -{-# LANGUAGE BangPatterns, Rank2Types #-} +{-# LANGUAGE BangPatterns, MagicHash, Rank2Types #-} -- | -- Module : Data.Text.Fusion.Common --- Copyright : (c) Bryan O'Sullivan 2009 +-- Copyright : (c) Bryan O'Sullivan 2009, 2012 -- -- License : BSD-style -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, @@ -17,6 +17,7 @@ module Data.Text.Fusion.Common singleton , streamList , unstreamList + , streamCString# -- * Basic interface , cons @@ -104,10 +105,13 @@ import Prelude (Bool(..), Char, Eq(..), Int, Integral, Maybe(..), (&&), fromIntegral, otherwise) import qualified Data.List as L import qualified Prelude as P +import Data.Bits (shiftL) import Data.Int (Int64) import Data.Text.Fusion.Internal import Data.Text.Fusion.CaseMapping (foldMapping, lowerMapping, upperMapping) import Data.Text.Fusion.Size +import GHC.Prim (Addr#, chr#, indexCharOffAddr#, ord#) +import GHC.Types (Char(..), Int(..)) singleton :: Char -> Stream Char singleton c = Stream next False 1 @@ -131,6 +135,35 @@ unstreamList (Stream next s0 _len) = unfold s0 {-# RULES "STREAM streamList/unstreamList fusion" forall s. streamList (unstreamList s) = s #-} +-- | Stream the UTF-8-like packed encoding used by GHC to represent +-- constant strings in generated code. +-- +-- This encoding uses the byte sequence "\xc0\x80" to represent NUL, +-- and the string is NUL-terminated. +streamCString# :: Addr# -> Stream Char +streamCString# addr = Stream step 0 unknownSize + where + step !i + | b == 0 = Done + | b <= 0x7f = Yield (C# b#) (i+1) + | b <= 0xdf = let !c = chr $ ((b-0xc0) `shiftL` 6) + next 1 + in Yield c (i+2) + | b <= 0xef = let !c = chr $ ((b-0xe0) `shiftL` 12) + + (next 1 `shiftL` 6) + + next 2 + in Yield c (i+3) + | otherwise = let !c = chr $ ((b-0xf0) `shiftL` 18) + + (next 1 `shiftL` 12) + + (next 2 `shiftL` 6) + + next 3 + in Yield c (i+4) + where b = I# (ord# b#) + next n = I# (ord# (at# (i+n))) - 0x80 + !b# = at# i + at# (I# i#) = indexCharOffAddr# addr i# + chr (I# i#) = C# (chr# i#) +{-# INLINE [0] streamCString# #-} + -- ---------------------------------------------------------------------------- -- * Basic stream functions @@ -139,7 +172,7 @@ data C s = C0 !s -- | /O(n)/ Adds a character to the front of a Stream Char. cons :: Char -> Stream Char -> Stream Char -cons w (Stream next0 s0 len) = Stream next (C1 s0) (len+1) +cons !w (Stream next0 s0 len) = Stream next (C1 s0) (len+1) where next (C1 s) = Yield w (C0 s) next (C0 s) = case next0 s of @@ -185,9 +218,13 @@ head (Stream next s0 _len) = loop_head s0 loop_head !s = case next s of Yield x _ -> x Skip s' -> loop_head s' - Done -> streamError "head" "Empty stream" + Done -> head_empty {-# INLINE [0] head #-} +head_empty :: a +head_empty = streamError "head" "Empty stream" +{-# NOINLINE head_empty #-} + -- | /O(1)/ Returns the first character and remainder of a 'Stream -- Char', or 'Nothing' if empty. Subject to array fusion. uncons :: Stream Char -> Maybe (Char, Stream Char) @@ -274,7 +311,7 @@ lengthI (Stream next s0 _len) = loop_length 0 s0 -- of 'lengthI', but can short circuit if the count of characters is -- greater than the number, and hence be more efficient. compareLengthI :: Integral a => Stream Char -> a -> Ordering -compareLengthI (Stream next s0 len) n = +compareLengthI (Stream next s0 len) n = case exactly len of Nothing -> loop_cmp 0 s0 Just i -> compare (fromIntegral i) n @@ -743,7 +780,6 @@ isPrefixOf (Stream next1 s1 _) (Stream next2 s2 _) = loop (next1 s1) (next2 s2) loop (Yield x1 s1') (Yield x2 s2') = x1 == x2 && loop (next1 s1') (next2 s2') {-# INLINE [0] isPrefixOf #-} -{-# SPECIALISE isPrefixOf :: Stream Char -> Stream Char -> Bool #-} -- ---------------------------------------------------------------------------- -- * Searching diff --git a/Data/Text/Fusion/Internal.hs b/Data/Text/Fusion/Internal.hs index e28d7a82ae302d9d82ad64e5785d1abf0d549ccf..dfb105a0265f800cc1695da67e14a1a236cd1492 100644 --- a/Data/Text/Fusion/Internal.hs +++ b/Data/Text/Fusion/Internal.hs @@ -101,7 +101,6 @@ eq (Stream next1 s1 _) (Stream next2 s2 _) = loop (next1 s1) (next2 s2) loop (Yield x1 s1') (Yield x2 s2') = x1 == x2 && loop (next1 s1') (next2 s2') {-# INLINE [0] eq #-} -{-# SPECIALISE eq :: Stream Char -> Stream Char -> Bool #-} cmp :: (Ord a) => Stream a -> Stream a -> Ordering cmp (Stream next1 s1 _) (Stream next2 s2 _) = loop (next1 s1) (next2 s2) @@ -117,7 +116,6 @@ cmp (Stream next1 s1 _) (Stream next2 s2 _) = loop (next1 s1) (next2 s2) EQ -> loop (next1 s1') (next2 s2') other -> other {-# INLINE [0] cmp #-} -{-# SPECIALISE cmp :: Stream Char -> Stream Char -> Ordering #-} -- | The empty stream. empty :: Stream a diff --git a/Data/Text/Fusion/Size.hs b/Data/Text/Fusion/Size.hs index 37e4a1694ddfc610f7fff42d98325cb22c71dcd3..0f9d79b2d824ad7be2f4be6671e3d2cb333079c8 100644 --- a/Data/Text/Fusion/Size.hs +++ b/Data/Text/Fusion/Size.hs @@ -93,8 +93,8 @@ subtractSize _ _ = Unknown mul :: Int -> Int -> Int mul m n - | m <= maxBound `div` n = m * n - | otherwise = overflowError + | m <= maxBound `quot` n = m * n + | otherwise = overflowError {-# INLINE mul #-} mulSize :: Size -> Size -> Size diff --git a/Data/Text/IO.hs b/Data/Text/IO.hs index 4cd4d710c377d4c92f58df56808afe2bed580948..815d4c47608e7f736e6a672ee342f053f364577c 100644 --- a/Data/Text/IO.hs +++ b/Data/Text/IO.hs @@ -1,4 +1,7 @@ {-# LANGUAGE BangPatterns, CPP, RecordWildCards, ScopedTypeVariables #-} +#if __GLASGOW_HASKELL__ >= 702 +{-# LANGUAGE Trustworthy #-} +#endif -- | -- Module : Data.Text.IO -- Copyright : (c) 2009, 2010 Bryan O'Sullivan, @@ -16,7 +19,7 @@ module Data.Text.IO ( -- * Performance - -- $performance + -- $performance -- * Locale support -- $locale @@ -26,6 +29,7 @@ module Data.Text.IO , appendFile -- * Operations on handles , hGetContents + , hGetChunk , hGetLine , hPutStr , hPutStrLn @@ -38,15 +42,11 @@ module Data.Text.IO ) where import Data.Text (Text) -import Prelude hiding (appendFile, catch, getContents, getLine, interact, +import Prelude hiding (appendFile, getContents, getLine, interact, putStr, putStrLn, readFile, writeFile) import System.IO (Handle, IOMode(..), hPutChar, openFile, stdin, stdout, withFile) -#if __GLASGOW_HASKELL__ <= 610 -import qualified Data.ByteString.Char8 as B -import Data.Text.Encoding (decodeUtf8, encodeUtf8) -#else -import Control.Exception (catch, throwIO) +import qualified Control.Exception as E import Control.Monad (liftM2, when) import Data.IORef (readIORef, writeIORef) import qualified Data.Text as T @@ -64,7 +64,6 @@ import GHC.IO.Handle.Types (BufferList(..), BufferMode(..), Handle__(..), HandleType(..), Newline(..)) import System.IO (hGetBuffering, hFileSize, hSetBuffering, hTell) import System.IO.Error (isEOFError) -#endif -- $performance -- #performance# @@ -97,6 +96,30 @@ writeFile p = withFile p WriteMode . flip hPutStr appendFile :: FilePath -> Text -> IO () appendFile p = withFile p AppendMode . flip hPutStr +catchError :: String -> Handle -> Handle__ -> IOError -> IO Text +catchError caller h Handle__{..} err + | isEOFError err = do + buf <- readIORef haCharBuffer + return $ if isEmptyBuffer buf + then T.empty + else T.singleton '\r' + | otherwise = E.throwIO (augmentIOError err caller h) + +-- | /Experimental./ Read a single chunk of strict text from a +-- 'Handle'. The size of the chunk depends on the amount of input +-- currently buffered. +-- +-- This function blocks only if there is no data available, and EOF +-- has not yet been reached. Once EOF is reached, this function +-- returns an empty string instead of throwing an exception. +hGetChunk :: Handle -> IO Text +hGetChunk h = wantReadableHandle "hGetChunk" h readSingleChunk + where + readSingleChunk hh@Handle__{..} = do + buf <- readIORef haCharBuffer + t <- readChunk hh buf `E.catch` catchError "hGetChunk" h hh + return (hh, t) + -- | Read the remaining contents of a 'Handle' as a string. The -- 'Handle' is closed once the contents have been read, or if an -- exception is thrown. @@ -109,31 +132,21 @@ appendFile p = withFile p AppendMode . flip hPutStr -- result to construct its result. For files more than a half of -- available RAM in size, this may result in memory exhaustion. hGetContents :: Handle -> IO Text -#if __GLASGOW_HASKELL__ <= 610 -hGetContents = fmap decodeUtf8 . B.hGetContents -#else hGetContents h = do chooseGoodBuffering h wantReadableHandle "hGetContents" h readAll where readAll hh@Handle__{..} = do - let catchError e - | isEOFError e = do - buf <- readIORef haCharBuffer - return $ if isEmptyBuffer buf - then T.empty - else T.singleton '\r' - | otherwise = throwIO (augmentIOError e "hGetContents" h) - readChunks = do + let readChunks = do buf <- readIORef haCharBuffer - t <- readChunk hh buf `catch` catchError + t <- readChunk hh buf `E.catch` catchError "hGetContents" h hh if T.null t then return [t] else (t:) `fmap` readChunks ts <- readChunks (hh', _) <- hClose_help hh return (hh'{haType=ClosedHandle}, T.concat ts) - + -- | Use a more efficient buffer size if we're reading in -- block-buffered mode with the default buffer size. When we can -- determine the size of the handle we're reading, set the buffer size @@ -144,30 +157,22 @@ chooseGoodBuffering h = do bufMode <- hGetBuffering h case bufMode of BlockBuffering Nothing -> do - d <- catch (liftM2 (-) (hFileSize h) (hTell h)) $ \(e::IOException) -> + d <- E.catch (liftM2 (-) (hFileSize h) (hTell h)) $ \(e::IOException) -> if ioe_type e == InappropriateType then return 16384 -- faster than the 2KB default - else throwIO e + else E.throwIO e when (d > 0) . hSetBuffering h . BlockBuffering . Just . fromIntegral $ d _ -> return () -#endif -- | Read a single line from a handle. hGetLine :: Handle -> IO Text -#if __GLASGOW_HASKELL__ <= 610 -hGetLine = fmap decodeUtf8 . B.hGetLine -#else hGetLine = hGetLineWith T.concat -#endif -- | Write a string to a handle. hPutStr :: Handle -> Text -> IO () -#if __GLASGOW_HASKELL__ <= 610 -hPutStr h = B.hPutStr h . encodeUtf8 -#else -- This function is lifted almost verbatim from GHC.IO.Handle.Text. hPutStr h t = do - (buffer_mode, nl) <- + (buffer_mode, nl) <- wantWritableHandle "hPutStr" h $ \h_ -> do bmode <- getSpareBuffer h_ return (bmode, haOutputNL h_) @@ -249,7 +254,7 @@ writeBlocksRaw h buf0 (Stream next0 s0 _len) = outer s0 buf0 -- This function is completely lifted from GHC.IO.Handle.Text. getSpareBuffer :: Handle__ -> IO (BufferMode, CharBuffer) -getSpareBuffer Handle__{haCharBuffer=ref, +getSpareBuffer Handle__{haCharBuffer=ref, haBuffers=spare_ref, haBufferMode=mode} = do @@ -270,11 +275,10 @@ getSpareBuffer Handle__{haCharBuffer=ref, -- This function is completely lifted from GHC.IO.Handle.Text. commitBuffer :: Handle -> RawCharBuffer -> Int -> Int -> Bool -> Bool -> IO CharBuffer -commitBuffer hdl !raw !sz !count flush release = +commitBuffer hdl !raw !sz !count flush release = wantWritableHandle "commitAndReleaseBuffer" hdl $ commitBuffer' raw sz count flush release {-# INLINE commitBuffer #-} -#endif -- | Write a string to a handle, followed by a newline. hPutStrLn :: Handle -> Text -> IO () diff --git a/Data/Text/IO/Internal.hs b/Data/Text/IO/Internal.hs index bc4e74b2fa4a17573abe1d9d7ca862a332592424..0f40368f12c8c03d21ca90f692f010857c4c54bb 100644 --- a/Data/Text/IO/Internal.hs +++ b/Data/Text/IO/Internal.hs @@ -12,14 +12,11 @@ module Data.Text.IO.Internal ( -#if __GLASGOW_HASKELL__ >= 612 hGetLineWith , readChunk -#endif ) where -#if __GLASGOW_HASKELL__ >= 612 -import Control.Exception (catch) +import qualified Control.Exception as E import Data.IORef (readIORef, writeIORef) import Data.Text (Text) import Data.Text.Fusion (unstream) @@ -32,7 +29,6 @@ import GHC.IO.Buffer (Buffer(..), CharBuffer, RawCharBuffer, bufferAdjustL, withRawBuffer, writeCharBuf) import GHC.IO.Handle.Internals (ioe_EOF, readTextDevice, wantReadableHandle_) import GHC.IO.Handle.Types (Handle__(..), Newline(..)) -import Prelude hiding (catch) import System.IO (Handle) import System.IO.Error (isEOFError) import qualified Data.Text as T @@ -84,9 +80,9 @@ hGetLineLoop hh@Handle__{..} = go where -- This function is lifted almost verbatim from GHC.IO.Handle.Text. maybeFillReadBuffer :: Handle__ -> CharBuffer -> IO (Maybe CharBuffer) maybeFillReadBuffer handle_ buf - = catch (Just `fmap` getSomeCharacters handle_ buf) $ \e -> - if isEOFError e - then return Nothing + = E.catch (Just `fmap` getSomeCharacters handle_ buf) $ \e -> + if isEOFError e + then return Nothing else ioError e unpack :: RawCharBuffer -> Int -> Int -> IO Text @@ -164,4 +160,3 @@ readChunk hh@Handle__{..} buf = do sizeError :: String -> a sizeError loc = error $ "Data.Text.IO." ++ loc ++ ": bad internal buffer size" -#endif diff --git a/Data/Text/Internal.hs b/Data/Text/Internal.hs index 474b0d2ba443610c0f470fcf5f586958edef2886..221d1e7ddabb643f5f11d69bfe64385cc480affb 100644 --- a/Data/Text/Internal.hs +++ b/Data/Text/Internal.hs @@ -23,6 +23,7 @@ module Data.Text.Internal ( -- * Types + -- $internals Text(..) -- * Construction , text @@ -47,9 +48,9 @@ import Data.Typeable (Typeable) -- | A space efficient, packed, unboxed Unicode text type. data Text = Text - {-# UNPACK #-} !A.Array -- payload - {-# UNPACK #-} !Int -- offset - {-# UNPACK #-} !Int -- length + {-# UNPACK #-} !A.Array -- payload (Word16 elements) + {-# UNPACK #-} !Int -- offset (units of Word16, not Char) + {-# UNPACK #-} !Int -- length (units of Word16, not Char) deriving (Typeable) -- | Smart constructor. @@ -90,7 +91,8 @@ showText (Text arr off len) = -- scalar values, but are unfortunately admitted as valid 'Char' -- values by Haskell. They cannot be represented in a 'Text'. This -- function remaps those code points to the Unicode replacement --- character \"�\", and leaves other code points unchanged. +-- character (U+FFFD, \'�\'), and leaves other code points +-- unchanged. safe :: Char -> Char safe c | ord c .&. 0x1ff800 /= 0xd800 = c @@ -101,3 +103,22 @@ safe c firstf :: (a -> c) -> Maybe (a,b) -> Maybe (c,b) firstf f (Just (a, b)) = Just (f a, b) firstf _ Nothing = Nothing + +-- $internals +-- +-- Internally, the 'Text' type is represented as an array of 'Word16' +-- UTF-16 code units. The offset and length fields in the constructor +-- are in these units, /not/ units of 'Char'. +-- +-- Invariants that all functions must maintain: +-- +-- * Since the 'Text' type uses UTF-16 internally, it cannot represent +-- characters in the reserved surrogate code point range U+D800 to +-- U+DFFF. To maintain this invariant, the 'safe' function maps +-- 'Char' values in this range to the replacement character (U+FFFD, +-- \'�\'). +-- +-- * A leading (or \"high\") surrogate code unit (0xD800–0xDBFF) must +-- always be followed by a trailing (or \"low\") surrogate code unit +-- (0xDC00-0xDFFF). A trailing surrogate code unit must always be +-- preceded by a leading surrogate code unit. diff --git a/Data/Text/Lazy.hs b/Data/Text/Lazy.hs index a4b432e0fd5fb2e30035caf5c94c88b7e890cfc5..5a976cb9ee942609e25dc672cdcad94ca33fc9ec 100644 --- a/Data/Text/Lazy.hs +++ b/Data/Text/Lazy.hs @@ -1,8 +1,11 @@ {-# OPTIONS_GHC -fno-warn-orphans #-} -{-# LANGUAGE BangPatterns, CPP #-} +{-# LANGUAGE BangPatterns, MagicHash, CPP #-} +#if __GLASGOW_HASKELL__ >= 702 +{-# LANGUAGE Trustworthy #-} +#endif -- | -- Module : Data.Text.Lazy --- Copyright : (c) 2009, 2010 Bryan O'Sullivan +-- Copyright : (c) 2009, 2010, 2012 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, @@ -174,7 +177,7 @@ module Data.Text.Lazy , partition -- , findSubstring - + -- * Indexing , index , count @@ -190,7 +193,7 @@ module Data.Text.Lazy import Prelude (Char, Bool(..), Maybe(..), String, Eq(..), Ord(..), Ordering(..), Read(..), Show(..), (&&), (||), (+), (-), (.), ($), (++), - div, error, flip, fmap, fromIntegral, not, otherwise) + error, flip, fmap, fromIntegral, not, otherwise, quot) import qualified Prelude as P #if defined(HAVE_DEEPSEQ) import Control.DeepSeq (NFData(..)) @@ -199,11 +202,7 @@ import Data.Int (Int64) import qualified Data.List as L import Data.Char (isSpace) import Data.Data (Data(gfoldl, toConstr, gunfold, dataTypeOf)) -#if __GLASGOW_HASKELL__ >= 612 import Data.Data (mkNoRepType) -#else -import Data.Data (mkNorepType) -#endif import Data.Monoid (Monoid(..)) import Data.String (IsString(..)) import qualified Data.Text as T @@ -217,6 +216,12 @@ import Data.Text.Lazy.Internal (Text(..), chunk, empty, foldlChunks, foldrChunks import Data.Text.Internal (firstf, safe, textP) import qualified Data.Text.Util as U import Data.Text.Lazy.Search (indices) +#if __GLASGOW_HASKELL__ >= 702 +import qualified GHC.CString as GHC +#else +import qualified GHC.Base as GHC +#endif +import GHC.Prim (Addr#) -- $fusion -- @@ -332,11 +337,7 @@ instance Data Text where gfoldl f z txt = z pack `f` (unpack txt) toConstr _ = error "Data.Text.Lazy.Text.toConstr" gunfold _ _ = error "Data.Text.Lazy.Text.gunfold" -#if __GLASGOW_HASKELL__ >= 612 dataTypeOf _ = mkNoRepType "Data.Text.Lazy.Text" -#else - dataTypeOf _ = mkNorepType "Data.Text.Lazy.Text" -#endif -- | /O(n)/ Convert a 'String' into a 'Text'. -- @@ -351,6 +352,19 @@ unpack :: Text -> String unpack t = S.unstreamList (stream t) {-# INLINE [1] unpack #-} +-- | /O(n)/ Convert a literal string into a Text. +unpackCString# :: Addr# -> Text +unpackCString# addr# = unstream (S.streamCString# addr#) +{-# NOINLINE unpackCString# #-} + +{-# RULES "TEXT literal" forall a. + unstream (S.streamList (L.map safe (GHC.unpackCString# a))) + = unpackCString# a #-} + +{-# RULES "TEXT literal UTF8" forall a. + unstream (S.streamList (L.map safe (GHC.unpackCStringUtf8# a))) + = unpackCString# a #-} + -- | /O(1)/ Convert a character into a Text. Subject to fusion. -- Performs replacement on invalid scalar values. singleton :: Char -> Text @@ -606,7 +620,7 @@ center k c t | otherwise = replicateChar l c `append` t `append` replicateChar r c where len = length t d = k - len - r = d `div` 2 + r = d `quot` 2 l = d - r {-# INLINE center #-} @@ -902,7 +916,7 @@ drop i t0 | otherwise = drop' i t0 where drop' 0 ts = ts drop' _ Empty = Empty - drop' n (Chunk t ts) + drop' n (Chunk t ts) | n < len = Chunk (T.drop (fromIntegral n) t) ts | otherwise = drop' (n - len) ts where len = fromIntegral (T.length t) @@ -1190,7 +1204,7 @@ tails ts@(Chunk t ts') -- > splitOn "\r\n" "a\r\nb\r\nd\r\ne" == ["a","b","d","e"] -- > splitOn "aaa" "aaaXaaaXaaaXaaa" == ["","X","X","X",""] -- > splitOn "x" "x" == ["",""] --- +-- -- and -- -- > intercalate s . splitOn s == id diff --git a/Data/Text/Lazy/Builder.hs b/Data/Text/Lazy/Builder.hs index ddf05546b7b23abdcf3a060feec9097006833c37..fc6eb6a51700d75d745e14bb2ed5bf88050d9e9c 100644 --- a/Data/Text/Lazy/Builder.hs +++ b/Data/Text/Lazy/Builder.hs @@ -1,11 +1,15 @@ {-# LANGUAGE BangPatterns, CPP, Rank2Types #-} +#if __GLASGOW_HASKELL__ >= 702 +{-# LANGUAGE Trustworthy #-} +#endif ----------------------------------------------------------------------------- -- | -- Module : Data.Text.Lazy.Builder --- Copyright : (c) 2010 Johan Tibell +-- Copyright : (c) 2013 Bryan O'Sullivan +-- (c) 2010 Johan Tibell -- License : BSD3-style (see LICENSE) --- +-- -- Maintainer : Johan Tibell <johan.tibell@gmail.com> -- Stability : experimental -- Portability : portable to Hugs and GHC @@ -15,7 +19,9 @@ -- @fromLazyText@, which construct new builders, and 'mappend', which -- concatenates two builders. -- --- To get maximum performance when building lazy @Text@ values using a builder, associate @mappend@ calls to the right. For example, prefer +-- To get maximum performance when building lazy @Text@ values using a +-- builder, associate @mappend@ calls to the right. For example, +-- prefer -- -- > singleton 'a' `mappend` (singleton 'b' `mappend` singleton 'c') -- @@ -23,8 +29,14 @@ -- -- > singleton 'a' `mappend` singleton 'b' `mappend` singleton 'c' -- --- as the latter associates @mappend@ to the left. +-- as the latter associates @mappend@ to the left. Or, equivalently, +-- prefer +-- +-- > singleton 'a' <> singleton 'b' <> singleton 'c' -- +-- since the '<>' from recent versions of 'Data.Monoid' associates +-- to the right. + ----------------------------------------------------------------------------- module Data.Text.Lazy.Builder @@ -43,270 +55,4 @@ module Data.Text.Lazy.Builder , flush ) where -import Control.Monad.ST (ST, runST) -import Data.Bits ((.&.)) -import Data.Monoid (Monoid(..)) -import Data.Text.Internal (Text(..)) -import Data.Text.Lazy.Internal (smallChunkSize) -import Data.Text.Unsafe (inlineInterleaveST) -import Data.Text.UnsafeChar (ord, unsafeWrite) -import Data.Text.UnsafeShift (shiftR) -import Prelude hiding (map, putChar) - -import qualified Data.String as String -import qualified Data.Text as S -import qualified Data.Text.Array as A -import qualified Data.Text.Lazy as L - ------------------------------------------------------------------------- - --- | A @Builder@ is an efficient way to build lazy @Text@ values. --- There are several functions for constructing builders, but only one --- to inspect them: to extract any data, you have to turn them into --- lazy @Text@ values using @toLazyText@. --- --- Internally, a builder constructs a lazy @Text@ by filling arrays --- piece by piece. As each buffer is filled, it is \'popped\' off, to --- become a new chunk of the resulting lazy @Text@. All this is --- hidden from the user of the @Builder@. -newtype Builder = Builder { - -- Invariant (from Data.Text.Lazy): - -- The lists include no null Texts. - runBuilder :: forall s. (Buffer s -> ST s [S.Text]) - -> Buffer s - -> ST s [S.Text] - } - -instance Monoid Builder where - mempty = empty - {-# INLINE mempty #-} - mappend = append - {-# INLINE mappend #-} - mconcat = foldr mappend mempty - {-# INLINE mconcat #-} - -instance String.IsString Builder where - fromString = fromString - {-# INLINE fromString #-} - -instance Show Builder where - show = show . toLazyText - -instance Eq Builder where - a == b = toLazyText a == toLazyText b - -instance Ord Builder where - a <= b = toLazyText a <= toLazyText b - ------------------------------------------------------------------------- - --- | /O(1)./ The empty @Builder@, satisfying --- --- * @'toLazyText' 'empty' = 'L.empty'@ --- -empty :: Builder -empty = Builder (\ k buf -> k buf) -{-# INLINE empty #-} - --- | /O(1)./ A @Builder@ taking a single character, satisfying --- --- * @'toLazyText' ('singleton' c) = 'L.singleton' c@ --- -singleton :: Char -> Builder -singleton c = writeAtMost 2 $ \ marr o -> - if n < 0x10000 - then A.unsafeWrite marr o (fromIntegral n) >> return 1 - else do - A.unsafeWrite marr o lo - A.unsafeWrite marr (o+1) hi - return 2 - where n = ord c - m = n - 0x10000 - lo = fromIntegral $ (m `shiftR` 10) + 0xD800 - hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 -{-# INLINE singleton #-} - ------------------------------------------------------------------------- - --- | /O(1)./ The concatenation of two builders, an associative --- operation with identity 'empty', satisfying --- --- * @'toLazyText' ('append' x y) = 'L.append' ('toLazyText' x) ('toLazyText' y)@ --- -append :: Builder -> Builder -> Builder -append (Builder f) (Builder g) = Builder (f . g) -{-# INLINE [0] append #-} - --- TODO: Experiment to find the right threshold. -copyLimit :: Int -copyLimit = 128 - --- This function attempts to merge small @Text@ values instead of --- treating each value as its own chunk. We may not always want this. - --- | /O(1)./ A @Builder@ taking a 'S.Text', satisfying --- --- * @'toLazyText' ('fromText' t) = 'L.fromChunks' [t]@ --- -fromText :: S.Text -> Builder -fromText t@(Text arr off l) - | S.null t = empty - | l <= copyLimit = writeN l $ \marr o -> A.copyI marr o arr off (l+o) - | otherwise = flush `append` mapBuilder (t :) -{-# INLINE [1] fromText #-} - -{-# RULES -"fromText/pack" forall s . - fromText (S.pack s) = fromString s - #-} - --- | /O(1)./ A Builder taking a @String@, satisfying --- --- * @'toLazyText' ('fromString' s) = 'L.fromChunks' [S.pack s]@ --- -fromString :: String -> Builder -fromString str = Builder $ \k (Buffer p0 o0 u0 l0) -> - let loop !marr !o !u !l [] = k (Buffer marr o u l) - loop marr o u l s@(c:cs) - | l <= 1 = do - arr <- A.unsafeFreeze marr - let !t = Text arr o u - marr' <- A.new chunkSize - ts <- inlineInterleaveST (loop marr' 0 0 chunkSize s) - return $ t : ts - | otherwise = do - n <- unsafeWrite marr (o+u) c - loop marr o (u+n) (l-n) cs - in loop p0 o0 u0 l0 str - where - chunkSize = smallChunkSize -{-# INLINE fromString #-} - --- | /O(1)./ A @Builder@ taking a lazy @Text@, satisfying --- --- * @'toLazyText' ('fromLazyText' t) = t@ --- -fromLazyText :: L.Text -> Builder -fromLazyText ts = flush `append` mapBuilder (L.toChunks ts ++) -{-# INLINE fromLazyText #-} - ------------------------------------------------------------------------- - --- Our internal buffer type -data Buffer s = Buffer {-# UNPACK #-} !(A.MArray s) - {-# UNPACK #-} !Int -- offset - {-# UNPACK #-} !Int -- used units - {-# UNPACK #-} !Int -- length left - ------------------------------------------------------------------------- - --- | /O(n)./ Extract a lazy @Text@ from a @Builder@ with a default --- buffer size. The construction work takes place if and when the --- relevant part of the lazy @Text@ is demanded. -toLazyText :: Builder -> L.Text -toLazyText = toLazyTextWith smallChunkSize - --- | /O(n)./ Extract a lazy @Text@ from a @Builder@, using the given --- size for the initial buffer. The construction work takes place if --- and when the relevant part of the lazy @Text@ is demanded. --- --- If the initial buffer is too small to hold all data, subsequent --- buffers will be the default buffer size. -toLazyTextWith :: Int -> Builder -> L.Text -toLazyTextWith chunkSize m = L.fromChunks (runST $ - newBuffer chunkSize >>= runBuilder (m `append` flush) (const (return []))) - --- | /O(1)./ Pop the strict @Text@ we have constructed so far, if any, --- yielding a new chunk in the result lazy @Text@. -flush :: Builder -flush = Builder $ \ k buf@(Buffer p o u l) -> - if u == 0 - then k buf - else do arr <- A.unsafeFreeze p - let !b = Buffer p (o+u) 0 l - !t = Text arr o u - ts <- inlineInterleaveST (k b) - return $! t : ts - ------------------------------------------------------------------------- - --- | Sequence an ST operation on the buffer -withBuffer :: (forall s. Buffer s -> ST s (Buffer s)) -> Builder -withBuffer f = Builder $ \k buf -> f buf >>= k -{-# INLINE withBuffer #-} - --- | Get the size of the buffer -withSize :: (Int -> Builder) -> Builder -withSize f = Builder $ \ k buf@(Buffer _ _ _ l) -> - runBuilder (f l) k buf -{-# INLINE withSize #-} - --- | Map the resulting list of texts. -mapBuilder :: ([S.Text] -> [S.Text]) -> Builder -mapBuilder f = Builder (fmap f .) - ------------------------------------------------------------------------- - --- | Ensure that there are at least @n@ many elements available. -ensureFree :: Int -> Builder -ensureFree !n = withSize $ \ l -> - if n <= l - then empty - else flush `append'` withBuffer (const (newBuffer (max n smallChunkSize))) -{-# INLINE [0] ensureFree #-} - -writeAtMost :: Int -> (forall s. A.MArray s -> Int -> ST s Int) -> Builder -writeAtMost n f = ensureFree n `append'` withBuffer (writeBuffer f) -{-# INLINE [0] writeAtMost #-} - --- | Ensure that @n@ many elements are available, and then use @f@ to --- write some elements into the memory. -writeN :: Int -> (forall s. A.MArray s -> Int -> ST s ()) -> Builder -writeN n f = writeAtMost n (\ p o -> f p o >> return n) -{-# INLINE writeN #-} - -writeBuffer :: (A.MArray s -> Int -> ST s Int) -> Buffer s -> ST s (Buffer s) -writeBuffer f (Buffer p o u l) = do - n <- f p (o+u) - return $! Buffer p o (u+n) (l-n) -{-# INLINE writeBuffer #-} - -newBuffer :: Int -> ST s (Buffer s) -newBuffer size = do - arr <- A.new size - return $! Buffer arr 0 0 size -{-# INLINE newBuffer #-} - ------------------------------------------------------------------------- --- Some nice rules for Builder - --- This function makes GHC understand that 'writeN' and 'ensureFree' --- are *not* recursive in the precense of the rewrite rules below. --- This is not needed with GHC 7+. -append' :: Builder -> Builder -> Builder -append' (Builder f) (Builder g) = Builder (f . g) -{-# INLINE append' #-} - -{-# RULES - -"append/writeAtMost" forall a b (f::forall s. A.MArray s -> Int -> ST s Int) - (g::forall s. A.MArray s -> Int -> ST s Int) ws. - append (writeAtMost a f) (append (writeAtMost b g) ws) = - append (writeAtMost (a+b) (\marr o -> f marr o >>= \ n -> - g marr (o+n) >>= \ m -> - let s = n+m in s `seq` return s)) ws - -"writeAtMost/writeAtMost" forall a b (f::forall s. A.MArray s -> Int -> ST s Int) - (g::forall s. A.MArray s -> Int -> ST s Int). - append (writeAtMost a f) (writeAtMost b g) = - writeAtMost (a+b) (\marr o -> f marr o >>= \ n -> - g marr (o+n) >>= \ m -> - let s = n+m in s `seq` return s) - -"ensureFree/ensureFree" forall a b . - append (ensureFree a) (ensureFree b) = ensureFree (max a b) - -"flush/flush" - append flush flush = flush - - #-} +import Data.Text.Lazy.Builder.Internal diff --git a/Data/Text/Lazy/Builder/Int.hs b/Data/Text/Lazy/Builder/Int.hs index 9ec4c836853be2bfe8ae2ae6c45394b92ee3bfe2..c0f405c9be0efaadbbf185b6a2ac18b1bce33176 100644 --- a/Data/Text/Lazy/Builder/Int.hs +++ b/Data/Text/Lazy/Builder/Int.hs @@ -1,7 +1,11 @@ -{-# LANGUAGE BangPatterns, CPP, MagicHash, UnboxedTuples #-} +{-# LANGUAGE BangPatterns, CPP, MagicHash, RankNTypes, UnboxedTuples #-} +#if __GLASGOW_HASKELL__ >= 702 +{-# LANGUAGE Trustworthy #-} +#endif -- Module: Data.Text.Lazy.Builder.Int --- Copyright: (c) 2011 MailRank, Inc. +-- Copyright: (c) 2013 Bryan O'Sullivan +-- (c) 2011 MailRank, Inc. -- License: BSD3 -- Maintainer: Bryan O'Sullivan <bos@serpentine.com> -- Stability: experimental @@ -17,45 +21,134 @@ module Data.Text.Lazy.Builder.Int import Data.Int (Int8, Int16, Int32, Int64) import Data.Monoid (mempty) +import qualified Data.ByteString.Unsafe as B import Data.Text.Lazy.Builder.Functions ((<>), i2d) -import Data.Text.Lazy.Builder +import Data.Text.Lazy.Builder.Internal +import Data.Text.Lazy.Builder.Int.Digits (digits) +import Data.Text.Array import Data.Word (Word, Word8, Word16, Word32, Word64) import GHC.Base (quotInt, remInt) import GHC.Num (quotRemInteger) import GHC.Types (Int(..)) +import Control.Monad.ST #ifdef __GLASGOW_HASKELL__ -# if __GLASGOW_HASKELL__ < 611 -import GHC.Integer.Internals -# else +# if defined(INTEGER_GMP) import GHC.Integer.GMP.Internals +# elif defined(INTEGER_SIMPLE) +import GHC.Integer +# else +# error "You need to use either GMP or integer-simple." # endif #endif -#ifdef INTEGER_GMP +#if defined(INTEGER_GMP) || defined(INTEGER_SIMPLE) # define PAIR(a,b) (# a,b #) #else # define PAIR(a,b) (a,b) #endif decimal :: Integral a => a -> Builder -{-# SPECIALIZE decimal :: Int -> Builder #-} {-# SPECIALIZE decimal :: Int8 -> Builder #-} -{-# SPECIALIZE decimal :: Int16 -> Builder #-} -{-# SPECIALIZE decimal :: Int32 -> Builder #-} -{-# SPECIALIZE decimal :: Int64 -> Builder #-} -{-# SPECIALIZE decimal :: Word -> Builder #-} -{-# SPECIALIZE decimal :: Word8 -> Builder #-} -{-# SPECIALIZE decimal :: Word16 -> Builder #-} -{-# SPECIALIZE decimal :: Word32 -> Builder #-} -{-# SPECIALIZE decimal :: Word64 -> Builder #-} +{-# RULES "decimal/Int" decimal = boundedDecimal :: Int -> Builder #-} +{-# RULES "decimal/Int16" decimal = boundedDecimal :: Int16 -> Builder #-} +{-# RULES "decimal/Int32" decimal = boundedDecimal :: Int32 -> Builder #-} +{-# RULES "decimal/Int64" decimal = boundedDecimal :: Int64 -> Builder #-} +{-# RULES "decimal/Word" decimal = positive :: Word -> Builder #-} +{-# RULES "decimal/Word8" decimal = positive :: Word8 -> Builder #-} +{-# RULES "decimal/Word16" decimal = positive :: Word16 -> Builder #-} +{-# RULES "decimal/Word32" decimal = positive :: Word32 -> Builder #-} +{-# RULES "decimal/Word64" decimal = positive :: Word64 -> Builder #-} {-# RULES "decimal/Integer" decimal = integer 10 :: Integer -> Builder #-} -decimal i - | i < 0 = singleton '-' <> go (-i) - | otherwise = go i - where - go n | n < 10 = digit n - | otherwise = go (n `quot` 10) <> digit (n `rem` 10) +decimal i = decimal' (<= -128) i + +boundedDecimal :: (Integral a, Bounded a) => a -> Builder +{-# SPECIALIZE boundedDecimal :: Int -> Builder #-} +{-# SPECIALIZE boundedDecimal :: Int8 -> Builder #-} +{-# SPECIALIZE boundedDecimal :: Int16 -> Builder #-} +{-# SPECIALIZE boundedDecimal :: Int32 -> Builder #-} +{-# SPECIALIZE boundedDecimal :: Int64 -> Builder #-} +boundedDecimal i = decimal' (== minBound) i + +decimal' :: (Integral a) => (a -> Bool) -> a -> Builder +{-# INLINE decimal' #-} +decimal' p i + | i < 0 = if p i + then let (q, r) = i `quotRem` 10 + qq = -q + !n = countDigits qq + in writeN (n + 2) $ \marr off -> do + unsafeWrite marr off minus + posDecimal marr (off+1) n qq + unsafeWrite marr (off+n+1) (i2w (-r)) + else let j = -i + !n = countDigits j + in writeN (n + 1) $ \marr off -> + unsafeWrite marr off minus >> posDecimal marr (off+1) n j + | otherwise = positive i + +positive :: (Integral a) => a -> Builder +{-# SPECIALIZE positive :: Int -> Builder #-} +{-# SPECIALIZE positive :: Int8 -> Builder #-} +{-# SPECIALIZE positive :: Int16 -> Builder #-} +{-# SPECIALIZE positive :: Int32 -> Builder #-} +{-# SPECIALIZE positive :: Int64 -> Builder #-} +{-# SPECIALIZE positive :: Word -> Builder #-} +{-# SPECIALIZE positive :: Word8 -> Builder #-} +{-# SPECIALIZE positive :: Word16 -> Builder #-} +{-# SPECIALIZE positive :: Word32 -> Builder #-} +{-# SPECIALIZE positive :: Word64 -> Builder #-} +positive i + | i < 10 = writeN 1 $ \marr off -> unsafeWrite marr off (i2w i) + | otherwise = let !n = countDigits i + in writeN n $ \marr off -> posDecimal marr off n i + +posDecimal :: (Integral a) => + forall s. MArray s -> Int -> Int -> a -> ST s () +{-# INLINE posDecimal #-} +posDecimal marr off0 ds v0 = go (off0 + ds - 1) v0 + where go off v + | v >= 100 = do + let (q, r) = v `quotRem` 100 + write2 off r + go (off - 2) q + | v < 10 = unsafeWrite marr off (i2w v) + | otherwise = write2 off v + write2 off i0 = do + let i = fromIntegral i0; j = i + i + unsafeWrite marr off $ get (j + 1) + unsafeWrite marr (off - 1) $ get j + get = fromIntegral . B.unsafeIndex digits + +minus, zero :: Word16 +{-# INLINE minus #-} +{-# INLINE zero #-} +minus = 45 +zero = 48 + +i2w :: (Integral a) => a -> Word16 +{-# INLINE i2w #-} +i2w v = zero + fromIntegral v + +countDigits :: (Integral a) => a -> Int +{-# INLINE countDigits #-} +countDigits v0 = go 1 (fromIntegral v0 :: Word64) + where go !k v + | v < 10 = k + | v < 100 = k + 1 + | v < 1000 = k + 2 + | v < 1000000000000 = + k + if v < 100000000 + then if v < 1000000 + then if v < 10000 + then 3 + else 4 + fin v 100000 + else 6 + fin v 10000000 + else if v < 10000000000 + then 8 + fin v 1000000000 + else 10 + fin v 100000000000 + | otherwise = go (k + 12) (v `quot` 1000000000000) + fin v n = if v >= n then 1 else 0 hexadecimal :: Integral a => a -> Builder {-# SPECIALIZE hexadecimal :: Int -> Builder #-} @@ -68,17 +161,22 @@ hexadecimal :: Integral a => a -> Builder {-# SPECIALIZE hexadecimal :: Word16 -> Builder #-} {-# SPECIALIZE hexadecimal :: Word32 -> Builder #-} {-# SPECIALIZE hexadecimal :: Word64 -> Builder #-} -{-# RULES "hexadecimal/Integer" hexadecimal = integer 16 :: Integer -> Builder #-} +{-# RULES "hexadecimal/Integer" + hexadecimal = hexInteger :: Integer -> Builder #-} hexadecimal i - | i < 0 = singleton '-' <> go (-i) + | i < 0 = error hexErrMsg | otherwise = go i where go n | n < 16 = hexDigit n | otherwise = go (n `quot` 16) <> hexDigit (n `rem` 16) -digit :: Integral a => a -> Builder -digit n = singleton $! i2d (fromIntegral n) -{-# INLINE digit #-} +hexInteger :: Integer -> Builder +hexInteger i + | i < 0 = error hexErrMsg + | otherwise = integer 16 i + +hexErrMsg :: String +hexErrMsg = "Data.Text.Lazy.Builder.Int.hexadecimal: applied to negative number" hexDigit :: Integral a => a -> Builder hexDigit n @@ -93,8 +191,13 @@ int = decimal data T = T !Integer !Int integer :: Int -> Integer -> Builder +#ifdef INTEGER_GMP integer 10 (S# i#) = decimal (I# i#) integer 16 (S# i#) = hexadecimal (I# i#) +#else +integer 10 i = decimal i +integer 16 i = hexadecimal i +#endif integer base i | i < 0 = singleton '-' <> go (-i) | otherwise = go i @@ -146,7 +249,7 @@ integer base i pblock = loop maxDigits where loop !d !n - | d == 1 = digit n - | otherwise = loop (d-1) q <> digit r + | d == 1 = hexDigit n + | otherwise = loop (d-1) q <> hexDigit r where q = n `quotInt` base r = n `remInt` base diff --git a/Data/Text/Lazy/Builder/Int/Digits.hs b/Data/Text/Lazy/Builder/Int/Digits.hs new file mode 100644 index 0000000000000000000000000000000000000000..69fe26e8fd73a3af6cabbfa2c4436ab0c87104fa --- /dev/null +++ b/Data/Text/Lazy/Builder/Int/Digits.hs @@ -0,0 +1,22 @@ +{-# LANGUAGE OverloadedStrings #-} + +-- Module: Data.Text.Lazy.Builder.Int.Digits +-- Copyright: (c) 2013 Bryan O'Sullivan +-- License: BSD3 +-- Maintainer: Bryan O'Sullivan <bos@serpentine.com> +-- Stability: experimental +-- Portability: portable +-- +-- This module exists because the C preprocessor does things that we +-- shall not speak of when confronted with Haskell multiline strings. + +module Data.Text.Lazy.Builder.Int.Digits (digits) where + +import Data.ByteString.Char8 (ByteString) + +digits :: ByteString +digits = "0001020304050607080910111213141516171819\ + \2021222324252627282930313233343536373839\ + \4041424344454647484950515253545556575859\ + \6061626364656667686970717273747576777879\ + \8081828384858687888990919293949596979899" diff --git a/Data/Text/Lazy/Builder/Internal.hs b/Data/Text/Lazy/Builder/Internal.hs new file mode 100644 index 0000000000000000000000000000000000000000..14af638165737fa5ecb5f9c0310788f3386a9ce3 --- /dev/null +++ b/Data/Text/Lazy/Builder/Internal.hs @@ -0,0 +1,321 @@ +{-# LANGUAGE BangPatterns, CPP, Rank2Types #-} + +----------------------------------------------------------------------------- +-- | +-- Module : Data.Text.Lazy.Builder.Internal +-- Copyright : (c) 2013 Bryan O'Sullivan +-- (c) 2010 Johan Tibell +-- License : BSD3-style (see LICENSE) +-- +-- Maintainer : Johan Tibell <johan.tibell@gmail.com> +-- Stability : experimental +-- Portability : portable to Hugs and GHC +-- +-- Efficient construction of lazy @Text@ values. The principal +-- operations on a @Builder@ are @singleton@, @fromText@, and +-- @fromLazyText@, which construct new builders, and 'mappend', which +-- concatenates two builders. +-- +-- To get maximum performance when building lazy @Text@ values using a +-- builder, associate @mappend@ calls to the right. For example, +-- prefer +-- +-- > singleton 'a' `mappend` (singleton 'b' `mappend` singleton 'c') +-- +-- to +-- +-- > singleton 'a' `mappend` singleton 'b' `mappend` singleton 'c' +-- +-- as the latter associates @mappend@ to the left. +-- +----------------------------------------------------------------------------- + +module Data.Text.Lazy.Builder.Internal + ( -- * Public API + -- ** The Builder type + Builder + , toLazyText + , toLazyTextWith + + -- ** Constructing Builders + , singleton + , fromText + , fromLazyText + , fromString + + -- ** Flushing the buffer state + , flush + + -- * Internal functions + , append' + , ensureFree + , writeN + ) where + +import Control.Monad.ST (ST, runST) +import Data.Bits ((.&.)) +import Data.Monoid (Monoid(..)) +import Data.Text.Internal (Text(..)) +import Data.Text.Lazy.Internal (smallChunkSize) +import Data.Text.Unsafe (inlineInterleaveST) +import Data.Text.UnsafeChar (ord, unsafeWrite) +import Data.Text.UnsafeShift (shiftR) +import Prelude hiding (map, putChar) + +import qualified Data.String as String +import qualified Data.Text as S +import qualified Data.Text.Array as A +import qualified Data.Text.Lazy as L + +------------------------------------------------------------------------ + +-- | A @Builder@ is an efficient way to build lazy @Text@ values. +-- There are several functions for constructing builders, but only one +-- to inspect them: to extract any data, you have to turn them into +-- lazy @Text@ values using @toLazyText@. +-- +-- Internally, a builder constructs a lazy @Text@ by filling arrays +-- piece by piece. As each buffer is filled, it is \'popped\' off, to +-- become a new chunk of the resulting lazy @Text@. All this is +-- hidden from the user of the @Builder@. +newtype Builder = Builder { + -- Invariant (from Data.Text.Lazy): + -- The lists include no null Texts. + runBuilder :: forall s. (Buffer s -> ST s [S.Text]) + -> Buffer s + -> ST s [S.Text] + } + +instance Monoid Builder where + mempty = empty + {-# INLINE mempty #-} + mappend = append + {-# INLINE mappend #-} + mconcat = foldr mappend mempty + {-# INLINE mconcat #-} + +instance String.IsString Builder where + fromString = fromString + {-# INLINE fromString #-} + +instance Show Builder where + show = show . toLazyText + +instance Eq Builder where + a == b = toLazyText a == toLazyText b + +instance Ord Builder where + a <= b = toLazyText a <= toLazyText b + +------------------------------------------------------------------------ + +-- | /O(1)./ The empty @Builder@, satisfying +-- +-- * @'toLazyText' 'empty' = 'L.empty'@ +-- +empty :: Builder +empty = Builder (\ k buf -> k buf) +{-# INLINE empty #-} + +-- | /O(1)./ A @Builder@ taking a single character, satisfying +-- +-- * @'toLazyText' ('singleton' c) = 'L.singleton' c@ +-- +singleton :: Char -> Builder +singleton c = writeAtMost 2 $ \ marr o -> + if n < 0x10000 + then A.unsafeWrite marr o (fromIntegral n) >> return 1 + else do + A.unsafeWrite marr o lo + A.unsafeWrite marr (o+1) hi + return 2 + where n = ord c + m = n - 0x10000 + lo = fromIntegral $ (m `shiftR` 10) + 0xD800 + hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 +{-# INLINE singleton #-} + +------------------------------------------------------------------------ + +-- | /O(1)./ The concatenation of two builders, an associative +-- operation with identity 'empty', satisfying +-- +-- * @'toLazyText' ('append' x y) = 'L.append' ('toLazyText' x) ('toLazyText' y)@ +-- +append :: Builder -> Builder -> Builder +append (Builder f) (Builder g) = Builder (f . g) +{-# INLINE [0] append #-} + +-- TODO: Experiment to find the right threshold. +copyLimit :: Int +copyLimit = 128 + +-- This function attempts to merge small @Text@ values instead of +-- treating each value as its own chunk. We may not always want this. + +-- | /O(1)./ A @Builder@ taking a 'S.Text', satisfying +-- +-- * @'toLazyText' ('fromText' t) = 'L.fromChunks' [t]@ +-- +fromText :: S.Text -> Builder +fromText t@(Text arr off l) + | S.null t = empty + | l <= copyLimit = writeN l $ \marr o -> A.copyI marr o arr off (l+o) + | otherwise = flush `append` mapBuilder (t :) +{-# INLINE [1] fromText #-} + +{-# RULES +"fromText/pack" forall s . + fromText (S.pack s) = fromString s + #-} + +-- | /O(1)./ A Builder taking a @String@, satisfying +-- +-- * @'toLazyText' ('fromString' s) = 'L.fromChunks' [S.pack s]@ +-- +fromString :: String -> Builder +fromString str = Builder $ \k (Buffer p0 o0 u0 l0) -> + let loop !marr !o !u !l [] = k (Buffer marr o u l) + loop marr o u l s@(c:cs) + | l <= 1 = do + arr <- A.unsafeFreeze marr + let !t = Text arr o u + marr' <- A.new chunkSize + ts <- inlineInterleaveST (loop marr' 0 0 chunkSize s) + return $ t : ts + | otherwise = do + n <- unsafeWrite marr (o+u) c + loop marr o (u+n) (l-n) cs + in loop p0 o0 u0 l0 str + where + chunkSize = smallChunkSize +{-# INLINE fromString #-} + +-- | /O(1)./ A @Builder@ taking a lazy @Text@, satisfying +-- +-- * @'toLazyText' ('fromLazyText' t) = t@ +-- +fromLazyText :: L.Text -> Builder +fromLazyText ts = flush `append` mapBuilder (L.toChunks ts ++) +{-# INLINE fromLazyText #-} + +------------------------------------------------------------------------ + +-- Our internal buffer type +data Buffer s = Buffer {-# UNPACK #-} !(A.MArray s) + {-# UNPACK #-} !Int -- offset + {-# UNPACK #-} !Int -- used units + {-# UNPACK #-} !Int -- length left + +------------------------------------------------------------------------ + +-- | /O(n)./ Extract a lazy @Text@ from a @Builder@ with a default +-- buffer size. The construction work takes place if and when the +-- relevant part of the lazy @Text@ is demanded. +toLazyText :: Builder -> L.Text +toLazyText = toLazyTextWith smallChunkSize + +-- | /O(n)./ Extract a lazy @Text@ from a @Builder@, using the given +-- size for the initial buffer. The construction work takes place if +-- and when the relevant part of the lazy @Text@ is demanded. +-- +-- If the initial buffer is too small to hold all data, subsequent +-- buffers will be the default buffer size. +toLazyTextWith :: Int -> Builder -> L.Text +toLazyTextWith chunkSize m = L.fromChunks (runST $ + newBuffer chunkSize >>= runBuilder (m `append` flush) (const (return []))) + +-- | /O(1)./ Pop the strict @Text@ we have constructed so far, if any, +-- yielding a new chunk in the result lazy @Text@. +flush :: Builder +flush = Builder $ \ k buf@(Buffer p o u l) -> + if u == 0 + then k buf + else do arr <- A.unsafeFreeze p + let !b = Buffer p (o+u) 0 l + !t = Text arr o u + ts <- inlineInterleaveST (k b) + return $! t : ts + +------------------------------------------------------------------------ + +-- | Sequence an ST operation on the buffer +withBuffer :: (forall s. Buffer s -> ST s (Buffer s)) -> Builder +withBuffer f = Builder $ \k buf -> f buf >>= k +{-# INLINE withBuffer #-} + +-- | Get the size of the buffer +withSize :: (Int -> Builder) -> Builder +withSize f = Builder $ \ k buf@(Buffer _ _ _ l) -> + runBuilder (f l) k buf +{-# INLINE withSize #-} + +-- | Map the resulting list of texts. +mapBuilder :: ([S.Text] -> [S.Text]) -> Builder +mapBuilder f = Builder (fmap f .) + +------------------------------------------------------------------------ + +-- | Ensure that there are at least @n@ many elements available. +ensureFree :: Int -> Builder +ensureFree !n = withSize $ \ l -> + if n <= l + then empty + else flush `append'` withBuffer (const (newBuffer (max n smallChunkSize))) +{-# INLINE [0] ensureFree #-} + +writeAtMost :: Int -> (forall s. A.MArray s -> Int -> ST s Int) -> Builder +writeAtMost n f = ensureFree n `append'` withBuffer (writeBuffer f) +{-# INLINE [0] writeAtMost #-} + +-- | Ensure that @n@ many elements are available, and then use @f@ to +-- write some elements into the memory. +writeN :: Int -> (forall s. A.MArray s -> Int -> ST s ()) -> Builder +writeN n f = writeAtMost n (\ p o -> f p o >> return n) +{-# INLINE writeN #-} + +writeBuffer :: (A.MArray s -> Int -> ST s Int) -> Buffer s -> ST s (Buffer s) +writeBuffer f (Buffer p o u l) = do + n <- f p (o+u) + return $! Buffer p o (u+n) (l-n) +{-# INLINE writeBuffer #-} + +newBuffer :: Int -> ST s (Buffer s) +newBuffer size = do + arr <- A.new size + return $! Buffer arr 0 0 size +{-# INLINE newBuffer #-} + +------------------------------------------------------------------------ +-- Some nice rules for Builder + +-- This function makes GHC understand that 'writeN' and 'ensureFree' +-- are *not* recursive in the precense of the rewrite rules below. +-- This is not needed with GHC 7+. +append' :: Builder -> Builder -> Builder +append' (Builder f) (Builder g) = Builder (f . g) +{-# INLINE append' #-} + +{-# RULES + +"append/writeAtMost" forall a b (f::forall s. A.MArray s -> Int -> ST s Int) + (g::forall s. A.MArray s -> Int -> ST s Int) ws. + append (writeAtMost a f) (append (writeAtMost b g) ws) = + append (writeAtMost (a+b) (\marr o -> f marr o >>= \ n -> + g marr (o+n) >>= \ m -> + let s = n+m in s `seq` return s)) ws + +"writeAtMost/writeAtMost" forall a b (f::forall s. A.MArray s -> Int -> ST s Int) + (g::forall s. A.MArray s -> Int -> ST s Int). + append (writeAtMost a f) (writeAtMost b g) = + writeAtMost (a+b) (\marr o -> f marr o >>= \ n -> + g marr (o+n) >>= \ m -> + let s = n+m in s `seq` return s) + +"ensureFree/ensureFree" forall a b . + append (ensureFree a) (ensureFree b) = ensureFree (max a b) + +"flush/flush" + append flush flush = flush + + #-} diff --git a/Data/Text/Lazy/Builder/RealFloat.hs b/Data/Text/Lazy/Builder/RealFloat.hs index fcb9e19d04b37a4add0b7ca775e90b1c4d13a518..169e6f453362b3efb7cbf9944419fee004cf39ce 100644 --- a/Data/Text/Lazy/Builder/RealFloat.hs +++ b/Data/Text/Lazy/Builder/RealFloat.hs @@ -1,4 +1,7 @@ -{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE CPP, OverloadedStrings #-} +#if __GLASGOW_HASKELL__ >= 702 +{-# LANGUAGE Trustworthy #-} +#endif -- | -- Module: Data.Text.Lazy.Builder.RealFloat diff --git a/Data/Text/Lazy/Encoding.hs b/Data/Text/Lazy/Encoding.hs index 2b361acc3d3668b5fbe41bda1e0681affe422adb..1498f8d962b9dd34fb65e79476fa3ed2e685a8ed 100644 --- a/Data/Text/Lazy/Encoding.hs +++ b/Data/Text/Lazy/Encoding.hs @@ -1,4 +1,7 @@ -{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE BangPatterns,CPP #-} +#if __GLASGOW_HASKELL__ >= 702 +{-# LANGUAGE Trustworthy #-} +#endif -- | -- Module : Data.Text.Lazy.Encoding -- Copyright : (c) 2009, 2010 Bryan O'Sullivan @@ -20,6 +23,7 @@ module Data.Text.Lazy.Encoding -- * Decoding ByteStrings to Text -- $strict decodeASCII + , decodeLatin1 , decodeUtf8 , decodeUtf16LE , decodeUtf16BE @@ -46,17 +50,20 @@ module Data.Text.Lazy.Encoding import Control.Exception (evaluate, try) import Data.Bits ((.&.)) +import Data.Monoid (mempty, (<>)) import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode) import Data.Text.Lazy.Internal (Text(..), chunk, empty, foldrChunks) -import System.IO.Unsafe (unsafePerformIO) import qualified Data.ByteString as S import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy.Internal as B import qualified Data.ByteString.Unsafe as S +import qualified Data.ByteString.Builder as B +import qualified Data.ByteString.Builder.Prim as BP import qualified Data.Text as T import qualified Data.Text.Encoding as TE import qualified Data.Text.Lazy.Encoding.Fusion as E import qualified Data.Text.Lazy.Fusion as F +import Data.Text.Unsafe (unsafeDupablePerformIO) -- $strict -- @@ -74,11 +81,15 @@ import qualified Data.Text.Lazy.Fusion as F -- | /Deprecated/. Decode a 'ByteString' containing 7-bit ASCII -- encoded text. -- --- This function is deprecated. Use 'decodeUtf8' instead. +-- This function is deprecated. Use 'decodeLatin1' instead. decodeASCII :: B.ByteString -> Text decodeASCII = decodeUtf8 {-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-} +-- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text. +decodeLatin1 :: B.ByteString -> Text +decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks + -- | Decode a 'ByteString' containing UTF-8 encoded text. decodeUtf8With :: OnDecodeError -> B.ByteString -> Text decodeUtf8With onErr bs0 = fast bs0 @@ -141,7 +152,7 @@ decodeUtf8 = decodeUtf8With strictDecode -- input before it can return a result. If you need lazy (streaming) -- decoding, use 'decodeUtf8With' in lenient mode. decodeUtf8' :: B.ByteString -> Either UnicodeException Text -decodeUtf8' bs = unsafePerformIO $ do +decodeUtf8' bs = unsafeDupablePerformIO $ do let t = decodeUtf8 bs try (evaluate (rnf t `seq` t)) where @@ -150,8 +161,13 @@ decodeUtf8' bs = unsafePerformIO $ do {-# INLINE decodeUtf8' #-} encodeUtf8 :: Text -> B.ByteString -encodeUtf8 (Chunk c cs) = B.Chunk (TE.encodeUtf8 c) (encodeUtf8 cs) -encodeUtf8 Empty = B.Empty +encodeUtf8 = + B.toLazyByteString . go + where + go Empty = mempty + go (Chunk c cs) = + TE.encodeUtf8Escaped (BP.liftFixedToBounded BP.word8) c <> go cs + -- | Decode text from little endian UTF-16 encoding. decodeUtf16LEWith :: OnDecodeError -> B.ByteString -> Text diff --git a/Data/Text/Lazy/Encoding/Fusion.hs b/Data/Text/Lazy/Encoding/Fusion.hs index 28c2d48b17c1a040e3399a174b7c4f0fefd1acbd..0d0c724bed1a718529f67f34aeb69c67760f1ec4 100644 --- a/Data/Text/Lazy/Encoding/Fusion.hs +++ b/Data/Text/Lazy/Encoding/Fusion.hs @@ -5,7 +5,7 @@ -- Copyright : (c) 2009, 2010 Bryan O'Sullivan -- -- License : BSD-style --- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, +-- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, -- duncan@haskell.org -- Stability : experimental -- Portability : portable @@ -42,7 +42,7 @@ import Data.Word (Word8, Word16, Word32) import qualified Data.Text.Encoding.Utf8 as U8 import qualified Data.Text.Encoding.Utf16 as U16 import qualified Data.Text.Encoding.Utf32 as U32 -import System.IO.Unsafe (unsafePerformIO) +import Data.Text.Unsafe (unsafeDupablePerformIO) import Foreign.ForeignPtr (withForeignPtr, ForeignPtr) import Foreign.Storable (pokeByteOff) import Data.ByteString.Internal (mallocByteString, memcpy) @@ -272,7 +272,7 @@ streamUtf32LE onErr bs0 = Stream next (T bs0 S0 0) unknownSize -- | /O(n)/ Convert a 'Stream' 'Word8' to a lazy 'ByteString'. unstreamChunks :: Int -> Stream Word8 -> ByteString unstreamChunks chunkSize (Stream next s0 len0) = chunk s0 (upperBound 4 len0) - where chunk s1 len1 = unsafePerformIO $ do + where chunk s1 len1 = unsafeDupablePerformIO $ do let len = max 4 (min len1 chunkSize) mallocByteString len >>= loop len 0 s1 where diff --git a/Data/Text/Lazy/Fusion.hs b/Data/Text/Lazy/Fusion.hs index 6a2291dea9febbdce7aa76081743880b9f414cd1..d96fb183771fa5f18ff35fc4ab9338c819abe468 100644 --- a/Data/Text/Lazy/Fusion.hs +++ b/Data/Text/Lazy/Fusion.hs @@ -24,6 +24,7 @@ module Data.Text.Lazy.Fusion import Prelude hiding (length) import qualified Data.Text.Fusion.Common as S +import Control.Monad.ST (runST) import Data.Text.Fusion.Internal import Data.Text.Fusion.Size (isEmpty, unknownSize) import Data.Text.Lazy.Internal @@ -50,33 +51,36 @@ stream text = Stream next (text :*: 0) unknownSize -- | /O(n)/ Convert a 'Stream Char' into a 'Text', using the given -- chunk size. unstreamChunks :: Int -> Stream Char -> Text -unstreamChunks chunkSize (Stream next s0 len0) +unstreamChunks !chunkSize (Stream next s0 len0) | isEmpty len0 = Empty | otherwise = outer s0 where - outer s = {-# SCC "unstreamChunks/outer" #-} - case next s of + outer so = {-# SCC "unstreamChunks/outer" #-} + case next so of Done -> Empty Skip s' -> outer s' - Yield x s' -> I.Text arr 0 len `chunk` outer s'' - where (arr,(s'',len)) = A.run2 fill - fill = do a <- A.new unknownLength - unsafeWrite a 0 x >>= inner a unknownLength s' - unknownLength = 4 - inner marr len s !i - | i + 1 >= chunkSize = return (marr, (s,i)) - | i + 1 >= len = {-# SCC "unstreamChunks/resize" #-} do - let newLen = min (len `shiftL` 1) chunkSize - marr' <- A.new newLen - A.copyM marr' 0 marr 0 len - inner marr' newLen s i - | otherwise = - {-# SCC "unstreamChunks/inner" #-} - case next s of - Done -> return (marr,(s,i)) - Skip s' -> inner marr len s' i - Yield x s' -> do d <- unsafeWrite marr i x - inner marr len s' (i+d) + Yield x s' -> runST $ do + a <- A.new unknownLength + unsafeWrite a 0 x >>= inner a unknownLength s' + where unknownLength = 4 + where + inner marr !len s !i + | i + 1 >= chunkSize = finish marr i s + | i + 1 >= len = {-# SCC "unstreamChunks/resize" #-} do + let newLen = min (len `shiftL` 1) chunkSize + marr' <- A.new newLen + A.copyM marr' 0 marr 0 len + inner marr' newLen s i + | otherwise = + {-# SCC "unstreamChunks/inner" #-} + case next s of + Done -> finish marr i s + Skip s' -> inner marr len s' i + Yield x s' -> do d <- unsafeWrite marr i x + inner marr len s' (i+d) + finish marr len s' = do + arr <- A.unsafeFreeze marr + return (I.Text arr 0 len `Chunk` outer s') {-# INLINE [0] unstreamChunks #-} -- | /O(n)/ Convert a 'Stream Char' into a 'Text', using diff --git a/Data/Text/Lazy/IO.hs b/Data/Text/Lazy/IO.hs index 366107850c5472cb69bb9f6dae6795eb406d7cea..318bc60e7ca301022e25cdab567d7d7a044036c2 100644 --- a/Data/Text/Lazy/IO.hs +++ b/Data/Text/Lazy/IO.hs @@ -1,4 +1,7 @@ {-# LANGUAGE BangPatterns, CPP, RecordWildCards #-} +#if __GLASGOW_HASKELL__ >= 702 +{-# LANGUAGE Trustworthy #-} +#endif -- | -- Module : Data.Text.Lazy.IO -- Copyright : (c) 2009, 2010 Bryan O'Sullivan, @@ -16,7 +19,7 @@ module Data.Text.Lazy.IO ( -- * Performance - -- $performance + -- $performance -- * Locale support -- $locale @@ -38,18 +41,13 @@ module Data.Text.Lazy.IO ) where import Data.Text.Lazy (Text) -import Prelude hiding (appendFile, catch, getContents, getLine, interact, +import Prelude hiding (appendFile, getContents, getLine, interact, putStr, putStrLn, readFile, writeFile) import System.IO (Handle, IOMode(..), hPutChar, openFile, stdin, stdout, withFile) import qualified Data.Text.IO as T import qualified Data.Text.Lazy as L -#if __GLASGOW_HASKELL__ <= 610 -import Data.Text.Lazy.Encoding (decodeUtf8) -import qualified Data.ByteString.Char8 as S8 -import qualified Data.ByteString.Lazy.Char8 as L8 -#else -import Control.Exception (catch, throwIO) +import qualified Control.Exception as E import Control.Monad (when) import Data.IORef (readIORef) import Data.Text.IO.Internal (hGetLineWith, readChunk) @@ -62,7 +60,6 @@ import GHC.IO.Handle.Types (Handle__(..), HandleType(..)) import System.IO (BufferMode(..), hGetBuffering, hSetBuffering) import System.IO.Error (isEOFError) import System.IO.Unsafe (unsafeInterleaveIO) -#endif -- $performance -- @@ -96,9 +93,6 @@ appendFile p = withFile p AppendMode . flip hPutStr -- | Lazily read the remaining contents of a 'Handle'. The 'Handle' -- will be closed after the read completes, or on error. hGetContents :: Handle -> IO Text -#if __GLASGOW_HASKELL__ <= 610 -hGetContents = fmap decodeUtf8 . L8.hGetContents -#else hGetContents h = do chooseGoodBuffering h wantReadableHandle "hGetContents" h $ \hh -> do @@ -119,7 +113,7 @@ lazyRead h = unsafeInterleaveIO $ case haType hh of ClosedHandle -> return (hh, L.empty) SemiClosedHandle -> lazyReadBuffered h hh - _ -> ioException + _ -> ioException (IOError (Just h) IllegalOperation "hGetContents" "illegal handle type" Nothing Nothing) @@ -128,22 +122,17 @@ lazyReadBuffered h hh@Handle__{..} = do buf <- readIORef haCharBuffer (do t <- readChunk hh buf ts <- lazyRead h - return (hh, chunk t ts)) `catch` \e -> do + return (hh, chunk t ts)) `E.catch` \e -> do (hh', _) <- hClose_help hh if isEOFError e then return $ if isEmptyBuffer buf then (hh', empty) else (hh', L.singleton '\r') - else throwIO (augmentIOError e "hGetContents" h) -#endif + else E.throwIO (augmentIOError e "hGetContents" h) -- | Read a single line from a handle. hGetLine :: Handle -> IO Text -#if __GLASGOW_HASKELL__ <= 610 -hGetLine = fmap (decodeUtf8 . L8.fromChunks . (:[])) . S8.hGetLine -#else hGetLine = hGetLineWith L.fromChunks -#endif -- | Write a string to a handle. hPutStr :: Handle -> Text -> IO () diff --git a/Data/Text/Lazy/Internal.hs b/Data/Text/Lazy/Internal.hs index ae66530e164f0a86827a05752cbc237730105d45..8d75b3d98359c6d1dc4778afe403456011939f97 100644 --- a/Data/Text/Lazy/Internal.hs +++ b/Data/Text/Lazy/Internal.hs @@ -8,7 +8,7 @@ -- duncan@haskell.org -- Stability : experimental -- Portability : GHC --- +-- -- A module containing private 'Text' internals. This exposes the -- 'Text' representation and low level construction functions. -- Modules which extend the 'Text' system may need to use this module. diff --git a/Data/Text/Lazy/Read.hs b/Data/Text/Lazy/Read.hs index 153741993e02de830aadd431b63a7ddacd8e6381..85673384e12f5a330295e00e8dd0cb36afa0d9fe 100644 --- a/Data/Text/Lazy/Read.hs +++ b/Data/Text/Lazy/Read.hs @@ -1,4 +1,7 @@ -{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE OverloadedStrings, CPP #-} +#if __GLASGOW_HASKELL__ >= 702 +{-# LANGUAGE Trustworthy #-} +#endif -- | -- Module : Data.Text.Lazy.Read diff --git a/Data/Text/Private.hs b/Data/Text/Private.hs index 313afde5ab68e6b0abd130851171eb72a3b5a85a..5c70bf28842da9be8b6f2a64105f719105c86541 100644 --- a/Data/Text/Private.hs +++ b/Data/Text/Private.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE BangPatterns, UnboxedTuples #-} +{-# LANGUAGE BangPatterns, Rank2Types, UnboxedTuples #-} -- | -- Module : Data.Text.Private @@ -11,11 +11,14 @@ module Data.Text.Private ( - span_ + runText + , span_ ) where +import Control.Monad.ST (ST, runST) import Data.Text.Internal (Text(..), textP) import Data.Text.Unsafe (Iter(..), iter) +import qualified Data.Text.Array as A span_ :: (Char -> Bool) -> Text -> (# Text, Text #) span_ p t@(Text arr off len) = (# hd,tl #) @@ -26,3 +29,9 @@ span_ p t@(Text arr off len) = (# hd,tl #) | otherwise = i where Iter c d = iter t i {-# INLINE span_ #-} + +runText :: (forall s. (A.MArray s -> Int -> ST s Text) -> ST s Text) -> Text +runText act = runST (act $ \ !marr !len -> do + arr <- A.unsafeFreeze marr + return $! textP arr 0 len) +{-# INLINE runText #-} diff --git a/Data/Text/Read.hs b/Data/Text/Read.hs index 709e8c5d948ab5175c85f30e0f8d1532682186f9..f7d721cbf2729a737b61c79bdaa8ad5082822965 100644 --- a/Data/Text/Read.hs +++ b/Data/Text/Read.hs @@ -1,4 +1,7 @@ -{-# LANGUAGE OverloadedStrings, UnboxedTuples #-} +{-# LANGUAGE OverloadedStrings, UnboxedTuples, CPP #-} +#if __GLASGOW_HASKELL__ >= 702 +{-# LANGUAGE Trustworthy #-} +#endif -- | -- Module : Data.Text.Read diff --git a/Data/Text/Search.hs b/Data/Text/Search.hs index 85ce550b1d59ffd137ea3ce145795ed5fd920772..875227678badcd86a2d9ef2b68d81c01c79e8144 100644 --- a/Data/Text/Search.hs +++ b/Data/Text/Search.hs @@ -14,7 +14,7 @@ -- Horspool, Sunday, and Lundh. -- -- References: --- +-- -- * R. S. Boyer, J. S. Moore: A Fast String Searching Algorithm. -- Communications of the ACM, 20, 10, 762-772 (1977) -- @@ -35,10 +35,11 @@ module Data.Text.Search import qualified Data.Text.Array as A import Data.Word (Word64) import Data.Text.Internal (Text(..)) -import Data.Text.Fusion.Internal (PairS(..)) import Data.Bits ((.|.), (.&.)) import Data.Text.UnsafeShift (shiftL) +data T = {-# UNPACK #-} !Word64 :* {-# UNPACK #-} !Int + -- | /O(n+m)/ Find the offsets of all non-overlapping indices of -- @needle@ within @haystack@. The offsets returned represent -- locations in the low-level array. @@ -60,9 +61,8 @@ indices _needle@(Text narr noff nlen) _haystack@(Text harr hoff hlen) hindex k = A.unsafeIndex harr (hoff+k) hindex' k | k == hlen = 0 | otherwise = A.unsafeIndex harr (hoff+k) - (mask :: Word64) :*: skip = buildTable 0 0 (nlen-2) buildTable !i !msk !skp - | i >= nlast = (msk .|. swizzle z) :*: skp + | i >= nlast = (msk .|. swizzle z) :* skp | otherwise = buildTable (i+1) (msk .|. swizzle c) skp' where c = nindex i skp' | c == z = nlen - i - 2 @@ -80,7 +80,8 @@ indices _needle@(Text narr noff nlen) _haystack@(Text harr hoff hlen) delta | nextInPattern = nlen + 1 | c == z = skip + 1 | otherwise = 1 - nextInPattern = mask .&. swizzle (hindex' (i+nlen)) == 0 + where nextInPattern = mask .&. swizzle (hindex' (i+nlen)) == 0 + !(mask :* skip) = buildTable 0 0 (nlen-2) scanOne c = loop 0 where loop !i | i >= hlen = [] | hindex i == c = i : loop (i+1) diff --git a/Data/Text/Unsafe.hs b/Data/Text/Unsafe.hs index 754fd4f848d0a75c9d040fc97062472c3ae084e3..e0b66d06bd659e1fe0273808502fe59bc518a96d 100644 --- a/Data/Text/Unsafe.hs +++ b/Data/Text/Unsafe.hs @@ -14,6 +14,7 @@ module Data.Text.Unsafe ( inlineInterleaveST , inlinePerformIO + , unsafeDupablePerformIO , Iter(..) , iter , iter_ @@ -24,7 +25,7 @@ module Data.Text.Unsafe , takeWord16 , dropWord16 ) where - + #if defined(ASSERTS) import Control.Exception (assert) #endif @@ -33,6 +34,7 @@ import Data.Text.Internal (Text(..)) import Data.Text.Unsafe.Base (inlineInterleaveST, inlinePerformIO) import Data.Text.UnsafeChar (unsafeChr) import qualified Data.Text.Array as A +import GHC.IO (unsafeDupablePerformIO) -- | /O(1)/ A variant of 'head' for non-empty 'Text'. 'unsafeHead' -- omits the check for the empty case, so there is an obligation on @@ -45,7 +47,7 @@ unsafeHead (Text arr off _len) n = A.unsafeIndex arr (off+1) {-# INLINE unsafeHead #-} --- | /O(1)/ A variant of 'tail' for non-empty 'Text'. 'unsafeHead' +-- | /O(1)/ A variant of 'tail' for non-empty 'Text'. 'unsafeTail' -- omits the check for the empty case, so there is an obligation on -- the programmer to provide a proof that the 'Text' is non-empty. unsafeTail :: Text -> Text diff --git a/Data/Text/Unsafe/Base.hs b/Data/Text/Unsafe/Base.hs index e34c83af5da8a7f1488f5a2126e4be2000af90ab..7907baf0425dec73c9ea2f50a2ecac52745eaf0d 100644 --- a/Data/Text/Unsafe/Base.hs +++ b/Data/Text/Unsafe/Base.hs @@ -15,14 +15,10 @@ module Data.Text.Unsafe.Base inlineInterleaveST , inlinePerformIO ) where - + import GHC.ST (ST(..)) #if defined(__GLASGOW_HASKELL__) -# if __GLASGOW_HASKELL__ >= 611 import GHC.IO (IO(IO)) -# else -import GHC.IOBase (IO(IO)) -# endif import GHC.Base (realWorld#) #endif diff --git a/tests/benchmarks/.gitignore b/benchmarks/.gitignore similarity index 100% rename from tests/benchmarks/.gitignore rename to benchmarks/.gitignore diff --git a/tests/benchmarks/Setup.hs b/benchmarks/Setup.hs similarity index 100% rename from tests/benchmarks/Setup.hs rename to benchmarks/Setup.hs diff --git a/tests/benchmarks/cbits/time_iconv.c b/benchmarks/cbits/time_iconv.c similarity index 100% rename from tests/benchmarks/cbits/time_iconv.c rename to benchmarks/cbits/time_iconv.c diff --git a/tests/benchmarks/src/Data/Text/Benchmarks.hs b/benchmarks/haskell/Benchmarks.hs similarity index 59% rename from tests/benchmarks/src/Data/Text/Benchmarks.hs rename to benchmarks/haskell/Benchmarks.hs index b9911bf8969c7e4f3768a5427a8f83119a0836eb..cbf1fa72b496b3388ebec01099d31a99808596c8 100644 --- a/tests/benchmarks/src/Data/Text/Benchmarks.hs +++ b/benchmarks/haskell/Benchmarks.hs @@ -9,25 +9,25 @@ import Criterion.Main (Benchmark, defaultMain, bgroup) import System.FilePath ((</>)) import System.IO (IOMode (WriteMode), openFile, hSetEncoding, utf8) -import qualified Data.Text.Benchmarks.Builder as Builder -import qualified Data.Text.Benchmarks.DecodeUtf8 as DecodeUtf8 -import qualified Data.Text.Benchmarks.EncodeUtf8 as EncodeUtf8 -import qualified Data.Text.Benchmarks.Equality as Equality -import qualified Data.Text.Benchmarks.FileRead as FileRead -import qualified Data.Text.Benchmarks.FoldLines as FoldLines -import qualified Data.Text.Benchmarks.Pure as Pure -import qualified Data.Text.Benchmarks.ReadNumbers as ReadNumbers -import qualified Data.Text.Benchmarks.Replace as Replace -import qualified Data.Text.Benchmarks.Search as Search -import qualified Data.Text.Benchmarks.Stream as Stream -import qualified Data.Text.Benchmarks.WordFrequencies as WordFrequencies +import qualified Benchmarks.Builder as Builder +import qualified Benchmarks.DecodeUtf8 as DecodeUtf8 +import qualified Benchmarks.EncodeUtf8 as EncodeUtf8 +import qualified Benchmarks.Equality as Equality +import qualified Benchmarks.FileRead as FileRead +import qualified Benchmarks.FoldLines as FoldLines +import qualified Benchmarks.Pure as Pure +import qualified Benchmarks.ReadNumbers as ReadNumbers +import qualified Benchmarks.Replace as Replace +import qualified Benchmarks.Search as Search +import qualified Benchmarks.Stream as Stream +import qualified Benchmarks.WordFrequencies as WordFrequencies -import qualified Data.Text.Benchmarks.Programs.BigTable as Programs.BigTable -import qualified Data.Text.Benchmarks.Programs.Cut as Programs.Cut -import qualified Data.Text.Benchmarks.Programs.Fold as Programs.Fold -import qualified Data.Text.Benchmarks.Programs.Sort as Programs.Sort -import qualified Data.Text.Benchmarks.Programs.StripTags as Programs.StripTags -import qualified Data.Text.Benchmarks.Programs.Throughput as Programs.Throughput +import qualified Benchmarks.Programs.BigTable as Programs.BigTable +import qualified Benchmarks.Programs.Cut as Programs.Cut +import qualified Benchmarks.Programs.Fold as Programs.Fold +import qualified Benchmarks.Programs.Sort as Programs.Sort +import qualified Benchmarks.Programs.StripTags as Programs.StripTags +import qualified Benchmarks.Programs.Throughput as Programs.Throughput main :: IO () main = benchmarks >>= defaultMain @@ -49,7 +49,8 @@ benchmarks = do , Equality.benchmark (tf "japanese.txt") , FileRead.benchmark (tf "russian.txt") , FoldLines.benchmark (tf "russian.txt") - , Pure.benchmark (tf "japanese.txt") + , Pure.benchmark "tiny "(tf "tiny.txt") + , Pure.benchmark "japanese" (tf "japanese.txt") , ReadNumbers.benchmark (tf "numbers.txt") , Replace.benchmark (tf "russian.txt") "принимаÑ" "Ñвоем" , Search.benchmark (tf "russian.txt") "принимаÑ" @@ -70,4 +71,4 @@ benchmarks = do return $ bs ++ [ps] where -- Location of a test file - tf = ("../text-test-data" </>) + tf = ("../tests/text-test-data" </>) diff --git a/benchmarks/haskell/Benchmarks/Builder.hs b/benchmarks/haskell/Benchmarks/Builder.hs new file mode 100644 index 0000000000000000000000000000000000000000..b6e7bd70e92011f1484e026f4e2f5ca08df1fecd --- /dev/null +++ b/benchmarks/haskell/Benchmarks/Builder.hs @@ -0,0 +1,75 @@ +-- | Testing the internal builder monoid +-- +-- Tested in this benchmark: +-- +-- * Concatenating many small strings using a builder +-- +{-# LANGUAGE OverloadedStrings #-} +module Benchmarks.Builder + ( benchmark + ) where + +import Criterion (Benchmark, bgroup, bench, nf) +import Data.Binary.Builder as B +import Data.ByteString.Char8 () +import Data.Monoid (mconcat, mempty) +import qualified Blaze.ByteString.Builder as Blaze +import qualified Blaze.ByteString.Builder.Char.Utf8 as Blaze +import qualified Data.ByteString as SB +import qualified Data.ByteString.Lazy as LB +import qualified Data.Text as T +import qualified Data.Text.Lazy as LT +import qualified Data.Text.Lazy.Builder as LTB +import qualified Data.Text.Lazy.Builder.Int as Int +import Data.Int (Int64) + +benchmark :: IO Benchmark +benchmark = return $ bgroup "Builder" + [ bgroup "Comparison" + [ bench "LazyText" $ nf + (LT.length . LTB.toLazyText . mconcat . map LTB.fromText) texts + , bench "Binary" $ nf + (LB.length . B.toLazyByteString . mconcat . map B.fromByteString) + byteStrings + , bench "Blaze" $ nf + (LB.length . Blaze.toLazyByteString . mconcat . map Blaze.fromString) + strings + ] + , bgroup "Int" + [ bgroup "Decimal" + [ bgroup "Positive" . + flip map numbers $ \n -> + (bench (show (length (show n))) $ nf (LTB.toLazyText . Int.decimal) n) + , bgroup "Negative" . + flip map numbers $ \m -> + let n = negate m in + (bench (show (length (show n))) $ nf (LTB.toLazyText . Int.decimal) n) + , bench "Empty" $ nf LTB.toLazyText mempty + , bgroup "Show" . + flip map numbers $ \n -> + (bench (show (length (show n))) $ nf show n) + ] + ] + ] + where + numbers :: [Int64] + numbers = [ + 6, 14, 500, 9688, 10654, 620735, 5608880, 37010612, + 731223504, 5061580596, 24596952933, 711732309084, 2845910093839, + 54601756118340, 735159434806159, 3619097625502435, 95777227510267124, + 414944309510675693, 8986407456998704019 + ] + +texts :: [T.Text] +texts = take 200000 $ cycle ["foo", "λx", "ç”±ã®"] +{-# NOINLINE texts #-} + +-- Note that the non-ascii characters will be chopped +byteStrings :: [SB.ByteString] +byteStrings = take 200000 $ cycle ["foo", "λx", "ç”±ã®"] +{-# NOINLINE byteStrings #-} + +-- Note that the non-ascii characters will be chopped +strings :: [String] +strings = take 200000 $ cycle ["foo", "λx", "ç”±ã®"] +{-# NOINLINE strings #-} diff --git a/tests/benchmarks/src/Data/Text/Benchmarks/DecodeUtf8.hs b/benchmarks/haskell/Benchmarks/DecodeUtf8.hs similarity index 96% rename from tests/benchmarks/src/Data/Text/Benchmarks/DecodeUtf8.hs rename to benchmarks/haskell/Benchmarks/DecodeUtf8.hs index 474b75e17984a593c5b086b5015dfcc0ba4aa948..14d084a92b43c3a3cb726bfc95cc697317730ed0 100644 --- a/tests/benchmarks/src/Data/Text/Benchmarks/DecodeUtf8.hs +++ b/benchmarks/haskell/Benchmarks/DecodeUtf8.hs @@ -14,11 +14,11 @@ -- -- The latter are used for testing stream fusion. -- -module Data.Text.Benchmarks.DecodeUtf8 +module Benchmarks.DecodeUtf8 ( benchmark ) where -import Foreign.C.Types (CInt, CSize) +import Foreign.C.Types import Data.ByteString.Internal (ByteString(..)) import Foreign.Ptr (Ptr, plusPtr) import Foreign.ForeignPtr (withForeignPtr) diff --git a/tests/benchmarks/src/Data/Text/Benchmarks/EncodeUtf8.hs b/benchmarks/haskell/Benchmarks/EncodeUtf8.hs similarity index 95% rename from tests/benchmarks/src/Data/Text/Benchmarks/EncodeUtf8.hs rename to benchmarks/haskell/Benchmarks/EncodeUtf8.hs index 39804615225724f1a33f54a8e4c29508c0cba2a0..758e095f58cf6c5b0a906eb7b8d7c9d4d0fec296 100644 --- a/tests/benchmarks/src/Data/Text/Benchmarks/EncodeUtf8.hs +++ b/benchmarks/haskell/Benchmarks/EncodeUtf8.hs @@ -6,7 +6,7 @@ -- -- * UTF-8 encoding it -- -module Data.Text.Benchmarks.EncodeUtf8 +module Benchmarks.EncodeUtf8 ( benchmark ) where diff --git a/tests/benchmarks/src/Data/Text/Benchmarks/Equality.hs b/benchmarks/haskell/Benchmarks/Equality.hs similarity index 97% rename from tests/benchmarks/src/Data/Text/Benchmarks/Equality.hs rename to benchmarks/haskell/Benchmarks/Equality.hs index 210411336428e81d3d1f4bb7c832b123add3a194..33964a23105a5ab76c02b6833deb4da0c686e020 100644 --- a/tests/benchmarks/src/Data/Text/Benchmarks/Equality.hs +++ b/benchmarks/haskell/Benchmarks/Equality.hs @@ -5,7 +5,7 @@ -- -- * Comparison of strings (Eq instance) -- -module Data.Text.Benchmarks.Equality +module Benchmarks.Equality ( benchmark ) where diff --git a/tests/benchmarks/src/Data/Text/Benchmarks/FileRead.hs b/benchmarks/haskell/Benchmarks/FileRead.hs similarity index 96% rename from tests/benchmarks/src/Data/Text/Benchmarks/FileRead.hs rename to benchmarks/haskell/Benchmarks/FileRead.hs index af07996e0182188f49ca7e3d7aed8ecd1d231864..65a79b49d2dfa7943364118d0bc78010887721c1 100644 --- a/tests/benchmarks/src/Data/Text/Benchmarks/FileRead.hs +++ b/benchmarks/haskell/Benchmarks/FileRead.hs @@ -4,7 +4,7 @@ -- -- * Reading a file from the disk -- -module Data.Text.Benchmarks.FileRead +module Benchmarks.FileRead ( benchmark ) where diff --git a/tests/benchmarks/src/Data/Text/Benchmarks/FoldLines.hs b/benchmarks/haskell/Benchmarks/FoldLines.hs similarity index 97% rename from tests/benchmarks/src/Data/Text/Benchmarks/FoldLines.hs rename to benchmarks/haskell/Benchmarks/FoldLines.hs index e2a4b2932efe64b3b2f768f3072a90290cac3f80..f08e3b2238dfaf84aaa6afb0e710fff1ae1ca4df 100644 --- a/tests/benchmarks/src/Data/Text/Benchmarks/FoldLines.hs +++ b/benchmarks/haskell/Benchmarks/FoldLines.hs @@ -6,7 +6,7 @@ -- * Buffered, line-based IO -- {-# LANGUAGE BangPatterns #-} -module Data.Text.Benchmarks.FoldLines +module Benchmarks.FoldLines ( benchmark ) where diff --git a/tests/benchmarks/src/Data/Text/Benchmarks/Programs/BigTable.hs b/benchmarks/haskell/Benchmarks/Programs/BigTable.hs similarity index 96% rename from tests/benchmarks/src/Data/Text/Benchmarks/Programs/BigTable.hs rename to benchmarks/haskell/Benchmarks/Programs/BigTable.hs index e057e871e1a5ad771fa479de160140cd2c13469a..0d3402b4720aca7f9f503fecea6361db29748678 100644 --- a/tests/benchmarks/src/Data/Text/Benchmarks/Programs/BigTable.hs +++ b/benchmarks/haskell/Benchmarks/Programs/BigTable.hs @@ -7,7 +7,7 @@ -- * Writing to a handle -- {-# LANGUAGE OverloadedStrings #-} -module Data.Text.Benchmarks.Programs.BigTable +module Benchmarks.Programs.BigTable ( benchmark ) where diff --git a/tests/benchmarks/src/Data/Text/Benchmarks/Programs/Cut.hs b/benchmarks/haskell/Benchmarks/Programs/Cut.hs similarity index 98% rename from tests/benchmarks/src/Data/Text/Benchmarks/Programs/Cut.hs rename to benchmarks/haskell/Benchmarks/Programs/Cut.hs index 3e918fff8c23372f2c556558718eda48bea15fdd..3bb44696687e9586aaa7bb07f0bb60dd47d729cf 100644 --- a/tests/benchmarks/src/Data/Text/Benchmarks/Programs/Cut.hs +++ b/benchmarks/haskell/Benchmarks/Programs/Cut.hs @@ -12,7 +12,7 @@ -- -- * Writing back to a handle -- -module Data.Text.Benchmarks.Programs.Cut +module Benchmarks.Programs.Cut ( benchmark ) where diff --git a/tests/benchmarks/src/Data/Text/Benchmarks/Programs/Fold.hs b/benchmarks/haskell/Benchmarks/Programs/Fold.hs similarity index 97% rename from tests/benchmarks/src/Data/Text/Benchmarks/Programs/Fold.hs rename to benchmarks/haskell/Benchmarks/Programs/Fold.hs index f42a71b05f7be55d3093de836929cec772883c69..722487a89f6bb488471b70a194102a4e4cd280af 100644 --- a/tests/benchmarks/src/Data/Text/Benchmarks/Programs/Fold.hs +++ b/benchmarks/haskell/Benchmarks/Programs/Fold.hs @@ -13,7 +13,7 @@ -- * Writing back to a handle -- {-# LANGUAGE OverloadedStrings #-} -module Data.Text.Benchmarks.Programs.Fold +module Benchmarks.Programs.Fold ( benchmark ) where diff --git a/tests/benchmarks/src/Data/Text/Benchmarks/Programs/Sort.hs b/benchmarks/haskell/Benchmarks/Programs/Sort.hs similarity index 98% rename from tests/benchmarks/src/Data/Text/Benchmarks/Programs/Sort.hs rename to benchmarks/haskell/Benchmarks/Programs/Sort.hs index 4b4aee0686df600dab05ac2154df7f849a4b814b..808f71056c0e8361541929e6bb766eb40d7b2364 100644 --- a/tests/benchmarks/src/Data/Text/Benchmarks/Programs/Sort.hs +++ b/benchmarks/haskell/Benchmarks/Programs/Sort.hs @@ -13,7 +13,7 @@ -- * Writing back to a handle -- {-# LANGUAGE OverloadedStrings #-} -module Data.Text.Benchmarks.Programs.Sort +module Benchmarks.Programs.Sort ( benchmark ) where diff --git a/tests/benchmarks/src/Data/Text/Benchmarks/Programs/StripTags.hs b/benchmarks/haskell/Benchmarks/Programs/StripTags.hs similarity index 96% rename from tests/benchmarks/src/Data/Text/Benchmarks/Programs/StripTags.hs rename to benchmarks/haskell/Benchmarks/Programs/StripTags.hs index 41f18d5194af19d22c2d23f041a6ea9c72f1e131..1d94cf688a12dca39a81c5e18f5558221c146b79 100644 --- a/tests/benchmarks/src/Data/Text/Benchmarks/Programs/StripTags.hs +++ b/benchmarks/haskell/Benchmarks/Programs/StripTags.hs @@ -11,10 +11,10 @@ -- * Writing back to a handle -- {-# OPTIONS_GHC -fspec-constr-count=5 #-} -module Data.Text.Benchmarks.Programs.StripTags +module Benchmarks.Programs.StripTags ( benchmark ) where - + import Criterion (Benchmark, bgroup, bench) import Data.List (mapAccumL) import System.IO (Handle, hPutStr) diff --git a/tests/benchmarks/src/Data/Text/Benchmarks/Programs/Throughput.hs b/benchmarks/haskell/Benchmarks/Programs/Throughput.hs similarity index 96% rename from tests/benchmarks/src/Data/Text/Benchmarks/Programs/Throughput.hs rename to benchmarks/haskell/Benchmarks/Programs/Throughput.hs index ecb7021c13220b390a2c2a7716f8b509908644da..87932ead6ca65ad4e1a5beb643f20e34733fe002 100644 --- a/tests/benchmarks/src/Data/Text/Benchmarks/Programs/Throughput.hs +++ b/benchmarks/haskell/Benchmarks/Programs/Throughput.hs @@ -14,7 +14,7 @@ -- -- * Writing back to a handle -- -module Data.Text.Benchmarks.Programs.Throughput +module Benchmarks.Programs.Throughput ( benchmark ) where diff --git a/tests/benchmarks/src/Data/Text/Benchmarks/Pure.hs b/benchmarks/haskell/Benchmarks/Pure.hs similarity index 97% rename from tests/benchmarks/src/Data/Text/Benchmarks/Pure.hs rename to benchmarks/haskell/Benchmarks/Pure.hs index 708e2db4444c9c626ef057c407154a75e22b2eba..4860ab309f549359ef68493671f983795b66e9ad 100644 --- a/tests/benchmarks/src/Data/Text/Benchmarks/Pure.hs +++ b/benchmarks/haskell/Benchmarks/Pure.hs @@ -4,9 +4,9 @@ -- -- * Most pure functions defined the string types -- -{-# LANGUAGE BangPatterns, GADTs, MagicHash #-} +{-# LANGUAGE BangPatterns, CPP, GADTs, MagicHash #-} {-# OPTIONS_GHC -fno-warn-orphans #-} -module Data.Text.Benchmarks.Pure +module Benchmarks.Pure ( benchmark ) where @@ -27,8 +27,8 @@ import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Builder as TB import qualified Data.Text.Lazy.Encoding as TL -benchmark :: FilePath -> IO Benchmark -benchmark fp = do +benchmark :: String -> FilePath -> IO Benchmark +benchmark kind fp = do -- Evaluate stuff before actually running the benchmark, we don't want to -- count it here. @@ -100,6 +100,10 @@ benchmark fp = do , benchBSL $ nf BL.unpack bla , benchS $ nf UTF8.toString bsa ] + , bgroup "decode'" + [ benchT $ nf T.decodeUtf8' bsa + , benchTL $ nf TL.decodeUtf8' bla + ] , bgroup "drop" [ benchT $ nf (T.drop (ta_len `div` 3)) ta , benchTL $ nf (TL.drop (tla_len `div` 3)) tla @@ -405,11 +409,11 @@ benchmark fp = do ] ] where - benchS = bench "String" - benchT = bench "Text" - benchTL = bench "LazyText" - benchBS = bench "ByteString" - benchBSL = bench "LazyByteString" + benchS = bench ("String+" ++ kind) + benchT = bench ("Text+" ++ kind) + benchTL = bench ("LazyText+" ++ kind) + benchBS = bench ("ByteString+" ++ kind) + benchBSL = bench ("LazyByteString+" ++ kind) c = 'й' p0 = (== c) @@ -425,11 +429,13 @@ benchmark fp = do replicat n = concat . L.replicate n short = T.pack "short" +#if !MIN_VERSION_bytestring(0,10,0) instance NFData BS.ByteString instance NFData BL.ByteString where rnf BL.Empty = () rnf (BL.Chunk _ ts) = rnf ts +#endif data B where B :: NFData a => a -> B diff --git a/tests/benchmarks/src/Data/Text/Benchmarks/ReadNumbers.hs b/benchmarks/haskell/Benchmarks/ReadNumbers.hs similarity index 98% rename from tests/benchmarks/src/Data/Text/Benchmarks/ReadNumbers.hs rename to benchmarks/haskell/Benchmarks/ReadNumbers.hs index 04d50bb34ec1b6b5a26b68d5742b71944f419b31..b907f8a8bc05bc53450bda344134533f2d78a2ff 100644 --- a/tests/benchmarks/src/Data/Text/Benchmarks/ReadNumbers.hs +++ b/benchmarks/haskell/Benchmarks/ReadNumbers.hs @@ -16,7 +16,7 @@ -- -- * Lexing/parsing of different numerical types -- -module Data.Text.Benchmarks.ReadNumbers +module Benchmarks.ReadNumbers ( benchmark ) where @@ -88,7 +88,7 @@ text reader = foldl' go 1000000 where go z t = case reader t of Left _ -> z Right (n, _) -> min n z - + byteString :: (Ord a, Num a) => (t -> Maybe (a,t)) -> [t] -> a byteString reader = foldl' go 1000000 where diff --git a/tests/benchmarks/src/Data/Text/Benchmarks/Replace.hs b/benchmarks/haskell/Benchmarks/Replace.hs similarity index 96% rename from tests/benchmarks/src/Data/Text/Benchmarks/Replace.hs rename to benchmarks/haskell/Benchmarks/Replace.hs index 42a7b59125486d72ed40fe56b4dc113b25057c25..d56d52dfe13193497e9e088c3c6ef0a1d13be99e 100644 --- a/tests/benchmarks/src/Data/Text/Benchmarks/Replace.hs +++ b/benchmarks/haskell/Benchmarks/Replace.hs @@ -4,7 +4,7 @@ -- -- * Search and replace of a pattern in a text -- -module Data.Text.Benchmarks.Replace +module Benchmarks.Replace ( benchmark ) where diff --git a/tests/benchmarks/src/Data/Text/Benchmarks/Search.hs b/benchmarks/haskell/Benchmarks/Search.hs similarity index 97% rename from tests/benchmarks/src/Data/Text/Benchmarks/Search.hs rename to benchmarks/haskell/Benchmarks/Search.hs index b4473b746edd20d011359442b1ce9893e2180c1c..93f1e05e1e8179dac914494bd219eaf5b05c167e 100644 --- a/tests/benchmarks/src/Data/Text/Benchmarks/Search.hs +++ b/benchmarks/haskell/Benchmarks/Search.hs @@ -4,7 +4,7 @@ -- -- * Searching all occurences of a pattern using library routines -- -module Data.Text.Benchmarks.Search +module Benchmarks.Search ( benchmark ) where diff --git a/tests/benchmarks/src/Data/Text/Benchmarks/Stream.hs b/benchmarks/haskell/Benchmarks/Stream.hs similarity index 97% rename from tests/benchmarks/src/Data/Text/Benchmarks/Stream.hs rename to benchmarks/haskell/Benchmarks/Stream.hs index d5e6b843627546a091dbe3dd4a82cca3e2264d3a..9960bef0bbff1794db32c6202e5cecbaeede423c 100644 --- a/tests/benchmarks/src/Data/Text/Benchmarks/Stream.hs +++ b/benchmarks/haskell/Benchmarks/Stream.hs @@ -7,7 +7,7 @@ -- {-# LANGUAGE BangPatterns #-} {-# OPTIONS_GHC -fno-warn-orphans #-} -module Data.Text.Benchmarks.Stream +module Benchmarks.Stream ( benchmark ) where @@ -86,7 +86,6 @@ benchmark fp = do ] -- Encoding.Fusion.Common - , bench "restreamUtf8" $ nf F.restreamUtf8 s , bench "restreamUtf16LE" $ nf F.restreamUtf16LE s , bench "restreamUtf16BE" $ nf F.restreamUtf16BE s , bench "restreamUtf32LE" $ nf F.restreamUtf32LE s diff --git a/tests/benchmarks/src/Data/Text/Benchmarks/WordFrequencies.hs b/benchmarks/haskell/Benchmarks/WordFrequencies.hs similarity index 95% rename from tests/benchmarks/src/Data/Text/Benchmarks/WordFrequencies.hs rename to benchmarks/haskell/Benchmarks/WordFrequencies.hs index 559470f6d304cad98dd4fb9da046f4b7b0f0050b..698103debaee2c9164c80083c2104f5c0f8d5ffe 100644 --- a/tests/benchmarks/src/Data/Text/Benchmarks/WordFrequencies.hs +++ b/benchmarks/haskell/Benchmarks/WordFrequencies.hs @@ -8,7 +8,7 @@ -- -- * Comparing: Eq/Ord instances -- -module Data.Text.Benchmarks.WordFrequencies +module Benchmarks.WordFrequencies ( benchmark ) where diff --git a/benchmarks/haskell/Multilang.hs b/benchmarks/haskell/Multilang.hs new file mode 100644 index 0000000000000000000000000000000000000000..a861afad04661c5029404f1f6da57ac42db8f3dc --- /dev/null +++ b/benchmarks/haskell/Multilang.hs @@ -0,0 +1,32 @@ +{-# LANGUAGE BangPatterns, OverloadedStrings, RankNTypes #-} + +module Main ( + main + ) where + +import Control.Monad (forM_) +import qualified Data.ByteString as B +import qualified Data.Text as Text +import Data.Text.Encoding (decodeUtf8) +import Data.Text (Text) +import System.IO (hFlush, stdout) +import Timer (timer) + +type BM = Text -> () + +bm :: forall a. (Text -> a) -> BM +bm f t = f t `seq` () + +benchmarks :: [(String, Text.Text -> ())] +benchmarks = [ + ("find_first", bm $ Text.isInfixOf "en:Benin") + , ("find_index", bm $ Text.findIndex (=='c')) + ] + +main :: IO () +main = do + !contents <- decodeUtf8 `fmap` B.readFile "../tests/text-test-data/yiwiki.xml" + forM_ benchmarks $ \(name, bmark) -> do + putStr $ name ++ " " + hFlush stdout + putStrLn =<< (timer 100 contents bmark) diff --git a/benchmarks/haskell/Timer.hs b/benchmarks/haskell/Timer.hs new file mode 100644 index 0000000000000000000000000000000000000000..ac09616d4b9f5a0dfd2a08e8d07708a1b8ee9459 --- /dev/null +++ b/benchmarks/haskell/Timer.hs @@ -0,0 +1,30 @@ +{-# LANGUAGE BangPatterns #-} + +module Timer (timer) where + +import Control.Exception (evaluate) +import Data.Time.Clock.POSIX (getPOSIXTime) +import GHC.Float (FFFormat(..), formatRealFloat) + +ickyRound :: Int -> Double -> String +ickyRound k = formatRealFloat FFFixed (Just k) + +timer :: Int -> a -> (a -> b) -> IO String +timer count a0 f = do + let loop !k !fastest + | k <= 0 = return fastest + | otherwise = do + start <- getPOSIXTime + let inner a i + | i <= 0 = return () + | otherwise = evaluate (f a) >> inner a (i-1) + inner a0 count + end <- getPOSIXTime + let elapsed = end - start + loop (k-1) (min fastest (elapsed / fromIntegral count)) + t <- loop (3::Int) 1e300 + let log10 x = log x / log 10 + ft = realToFrac t + prec = round (log10 (fromIntegral count) - log10 ft) + return $! ickyRound prec ft +{-# NOINLINE timer #-} diff --git a/tests/benchmarks/python/.gitignore b/benchmarks/python/.gitignore similarity index 100% rename from tests/benchmarks/python/.gitignore rename to benchmarks/python/.gitignore diff --git a/tests/benchmarks/python/cut.py b/benchmarks/python/cut.py similarity index 100% rename from tests/benchmarks/python/cut.py rename to benchmarks/python/cut.py diff --git a/benchmarks/python/multilang.py b/benchmarks/python/multilang.py new file mode 100755 index 0000000000000000000000000000000000000000..f2868545924ada4f877f9220f93e13bf23f89010 --- /dev/null +++ b/benchmarks/python/multilang.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python + +import math +import sys +import time + +def find_first(): + cf = contents.find + return timer(lambda: cf("en:Benin")) + +def timer(f, count=100): + a = 1e300 + def g(): + return + for i in xrange(3): + start = time.time() + for j in xrange(count): + g() + a = min(a, (time.time() - start) / count) + + b = 1e300 + for i in xrange(3): + start = time.time() + for j in xrange(count): + f() + b = min(b, (time.time() - start) / count) + + return round(b - a, int(round(math.log(count, 10) - math.log(b - a, 10)))) + +contents = open('../../tests/text-test-data/yiwiki.xml', 'r').read() +contents = contents.decode('utf-8') + +benchmarks = ( + find_first, + ) + +to_run = sys.argv[1:] +bms = [] +if to_run: + for r in to_run: + for b in benchmarks: + if b.__name__.startswith(r): + bms.append(b) +else: + bms = benchmarks + +for b in bms: + sys.stdout.write(b.__name__ + ' ') + sys.stdout.flush() + print b() diff --git a/tests/benchmarks/python/sort.py b/benchmarks/python/sort.py similarity index 100% rename from tests/benchmarks/python/sort.py rename to benchmarks/python/sort.py diff --git a/tests/benchmarks/python/strip_tags.py b/benchmarks/python/strip_tags.py similarity index 100% rename from tests/benchmarks/python/strip_tags.py rename to benchmarks/python/strip_tags.py diff --git a/tests/benchmarks/python/utils.py b/benchmarks/python/utils.py similarity index 100% rename from tests/benchmarks/python/utils.py rename to benchmarks/python/utils.py diff --git a/tests/benchmarks/ruby/cut.rb b/benchmarks/ruby/cut.rb similarity index 100% rename from tests/benchmarks/ruby/cut.rb rename to benchmarks/ruby/cut.rb diff --git a/tests/benchmarks/ruby/fold.rb b/benchmarks/ruby/fold.rb similarity index 100% rename from tests/benchmarks/ruby/fold.rb rename to benchmarks/ruby/fold.rb diff --git a/tests/benchmarks/ruby/sort.rb b/benchmarks/ruby/sort.rb similarity index 100% rename from tests/benchmarks/ruby/sort.rb rename to benchmarks/ruby/sort.rb diff --git a/tests/benchmarks/ruby/strip_tags.rb b/benchmarks/ruby/strip_tags.rb similarity index 100% rename from tests/benchmarks/ruby/strip_tags.rb rename to benchmarks/ruby/strip_tags.rb diff --git a/tests/benchmarks/ruby/utils.rb b/benchmarks/ruby/utils.rb similarity index 100% rename from tests/benchmarks/ruby/utils.rb rename to benchmarks/ruby/utils.rb diff --git a/benchmarks/text-benchmarks.cabal b/benchmarks/text-benchmarks.cabal new file mode 100644 index 0000000000000000000000000000000000000000..ce36dd599773254b604637bae0cfd4f24a73246c --- /dev/null +++ b/benchmarks/text-benchmarks.cabal @@ -0,0 +1,54 @@ +name: text-benchmarks +version: 0.0.0.0 +synopsis: Benchmarks for the text package +description: Benchmarks for the text package +homepage: https://bitbucket.org/bos/text +license: BSD3 +license-file: ../LICENSE +author: Jasper Van der Jeugt <jaspervdj@gmail.com>, + Bryan O'Sullivan <bos@serpentine.com>, + Tom Harper <rtomharper@googlemail.com>, + Duncan Coutts <duncan@haskell.org> +maintainer: jaspervdj@gmail.com +category: Text +build-type: Simple + +cabal-version: >=1.2 + +flag llvm + description: use LLVM + default: False + +executable text-benchmarks + hs-source-dirs: haskell .. + c-sources: ../cbits/cbits.c + cbits/time_iconv.c + include-dirs: ../include + main-is: Benchmarks.hs + ghc-options: -Wall -O2 + if flag(llvm) + ghc-options: -fllvm + cpp-options: -DHAVE_DEEPSEQ -DINTEGER_GMP + build-depends: base == 4.*, + binary, + blaze-builder, + bytestring >= 0.10.4.0, + bytestring-lexing, + containers, + criterion >= 0.6.0.1, + deepseq, + directory, + filepath, + ghc-prim, + integer-gmp, + stringsearch, + utf8-string + +executable text-multilang + hs-source-dirs: haskell + main-is: Multilang.hs + ghc-options: -Wall -O2 + build-depends: base == 4.*, + bytestring, + text, + time diff --git a/cbits/cbits.c b/cbits/cbits.c index 7b367458fa677c3faa4d2a621aec933adc787ba6..6fa8bc123fc75675b75b02a4924fb731461a067b 100644 --- a/cbits/cbits.c +++ b/cbits/cbits.c @@ -9,6 +9,7 @@ #include <string.h> #include <stdint.h> #include <stdio.h> +#include "text_cbits.h" void _hs_text_memcpy(void *dest, size_t doff, const void *src, size_t soff, size_t n) @@ -47,7 +48,7 @@ static const uint8_t utf8d[] = { 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, - 12,36,12,12,12,12,12,12,12,12,12,12, + 12,36,12,12,12,12,12,12,12,12,12,12, }; static inline uint32_t @@ -61,24 +62,83 @@ decode(uint32_t *state, uint32_t* codep, uint32_t byte) { return *state = utf8d[256 + *state + type]; } +/* + * The ISO 8859-1 (aka latin-1) code points correspond exactly to the first 256 unicode + * code-points, therefore we can trivially convert from a latin-1 encoded bytestring to + * an UTF16 array + */ +void +_hs_text_decode_latin1(uint16_t *dest, const uint8_t const *src, + const uint8_t const *srcend) +{ + const uint8_t *p = src; + +#if defined(__i386__) || defined(__x86_64__) + /* This optimization works on a little-endian systems by using + (aligned) 32-bit loads instead of 8-bit loads + */ + + /* consume unaligned prefix */ + while (p != srcend && (uintptr_t)p & 0x3) + *dest++ = *p++; + + /* iterate over 32-bit aligned loads */ + while (p < srcend - 3) { + const uint32_t w = *((const uint32_t *)p); + + *dest++ = w & 0xff; + *dest++ = (w >> 8) & 0xff; + *dest++ = (w >> 16) & 0xff; + *dest++ = (w >> 24) & 0xff; + + p += 4; + } +#endif + + /* handle unaligned suffix */ + while (p != srcend) + *dest++ = *p++; +} + /* * A best-effort decoder. Runs until it hits either end of input or * the start of an invalid byte sequence. * - * At exit, updates *destoff with the next offset to write to, and - * returns the next source offset to read from. + * At exit, we update *destoff with the next offset to write to, *src + * with the next source location past the last one successfully + * decoded, and return the next source location to read from. + * + * Moreover, we expose the internal decoder state (state0 and + * codepoint0), allowing one to restart the decoder after it + * terminates (say, due to a partial codepoint). + * + * In particular, there are a few possible outcomes, + * + * 1) We decoded the buffer entirely: + * In this case we return srcend + * state0 == UTF8_ACCEPT + * + * 2) We met an invalid encoding + * In this case we return the address of the first invalid byte + * state0 == UTF8_REJECT + * + * 3) We reached the end of the buffer while decoding a codepoint + * In this case we return a pointer to the first byte of the partial codepoint + * state0 != UTF8_ACCEPT, UTF8_REJECT + * */ -uint8_t const * -_hs_text_decode_utf8(uint16_t *dest, size_t *destoff, - const uint8_t const *src, const uint8_t const *srcend) +const uint8_t * +_hs_text_decode_utf8_state(uint16_t *const dest, size_t *destoff, + const uint8_t **const src, + const uint8_t *const srcend, + uint32_t *codepoint0, uint32_t *state0) { uint16_t *d = dest + *destoff; - const uint8_t const *s = src; - uint32_t state = UTF8_ACCEPT; + const uint8_t *s = *src, *last = *src; + uint32_t state = *state0; + uint32_t codepoint = *codepoint0; while (s < srcend) { - uint32_t codepoint; - #if defined(__i386__) || defined(__x86_64__) /* * This code will only work on a little-endian system that @@ -106,6 +166,7 @@ _hs_text_decode_utf8(uint16_t *dest, size_t *destoff, *d++ = (uint16_t) ((codepoint >> 16) & 0xff); *d++ = (uint16_t) ((codepoint >> 24) & 0xff); } + last = s; } #endif @@ -121,13 +182,29 @@ _hs_text_decode_utf8(uint16_t *dest, size_t *destoff, *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10)); *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF)); } + last = s; } - /* Error recovery - if we're not in a valid finishing state, back up. */ - if (state != UTF8_ACCEPT) + /* Invalid encoding, back up to the errant character */ + if (state == UTF8_REJECT) s -= 1; *destoff = d - dest; + *codepoint0 = codepoint; + *state0 = state; + *src = last; return s; } + +/* + * Helper to decode buffer and discard final decoder state + */ +const uint8_t * +_hs_text_decode_utf8(uint16_t *const dest, size_t *destoff, + const uint8_t *src, const uint8_t *const srcend) +{ + uint32_t codepoint; + uint32_t state = UTF8_ACCEPT; + return _hs_text_decode_utf8_state(dest, destoff, &src, srcend, &codepoint, &state); +} diff --git a/include/text_cbits.h b/include/text_cbits.h new file mode 100644 index 0000000000000000000000000000000000000000..3523efea5f1662ec3041550e784155e6be8107f8 --- /dev/null +++ b/include/text_cbits.h @@ -0,0 +1,11 @@ +/* + * Copyright (c) 2013 Bryan O'Sullivan <bos@serpentine.com>. + */ + +#ifndef _text_cbits_h +#define _text_cbits_h + +#define UTF8_ACCEPT 0 +#define UTF8_REJECT 12 + +#endif diff --git a/scripts/Arsec.hs b/scripts/Arsec.hs index 818bfc022124a27c2401e89a03de96aca7a2a47f..653c8e4f581c6b22c14ba8fa27d9593c01c0e89c 100644 --- a/scripts/Arsec.hs +++ b/scripts/Arsec.hs @@ -26,7 +26,7 @@ import Text.ParserCombinators.Parsec.Prim hiding ((<|>), many) instance Applicative (GenParser s a) where pure = return (<*>) = ap - + instance Alternative (GenParser s a) where empty = mzero (<|>) = mplus diff --git a/tests-and-benchmarks.markdown b/tests-and-benchmarks.markdown new file mode 100644 index 0000000000000000000000000000000000000000..71e42684baed61dcb6f3b83433408ef90f5c7cec --- /dev/null +++ b/tests-and-benchmarks.markdown @@ -0,0 +1,63 @@ +Tests and benchmarks +==================== + +Prerequisites +------------- + +To run the tests and benchmarks, you will need the test data, which +you can clone from one of the following locations: + +* Mercurial master repository: + [bitbucket.org/bos/text-test-data](https://bitbucket.org/bos/text-test-data) + +* Git mirror repository: + [github.com/bos/text-test-data](https://github.com/bos/text-test-data) + +You should clone that repository into the `tests` subdirectory (your +clone must be named `text-test-data` locally), then run `make -C +tests/text-test-data` to uncompress the test files. Many tests and +benchmarks will fail if the test files are missing. + +Functional tests +---------------- + +The functional tests are located in the `tests` subdirectory. An overview of +what's in that directory: + + Makefile Has targets for common tasks + Tests Source files of the testing code + scripts Various utility scripts + text-tests.cabal Cabal file that compiles all benchmarks + +The `text-tests.cabal` builds: + +- A copy of the text library, sharing the source code, but exposing all internal + modules, for testing purposes +- The different test suites + +To compile, run all tests, and generate a coverage report, simply use `make`. + +Benchmarks +---------- + +The benchmarks are located in the `benchmarks` subdirectory. An overview of +what's in that directory: + + Makefile Has targets for common tasks + haskell Source files of the haskell benchmarks + python Python implementations of some benchmarks + ruby Ruby implementations of some benchmarks + text-benchmarks.cabal Cabal file which compiles all benchmarks + +To compile the benchmarks, navigate to the `benchmarks` subdirectory and run +`cabal configure && cabal build`. Then, you can run the benchmarks using: + + ./dist/build/text-benchmarks/text-benchmarks + +However, since there's quite a lot of benchmarks, you usually don't want to +run them all. Instead, use the `-l` flag to get a list of benchmarks: + + ./dist/build/text-benchmarks/text-benchmarks + +And run the ones you want to inspect. If you want to configure the benchmarks +further, the exact parameters can be changed in `Benchmarks.hs`. diff --git a/tests/tests/.ghci b/tests/.ghci similarity index 100% rename from tests/tests/.ghci rename to tests/.ghci diff --git a/tests/tests/Makefile b/tests/Makefile similarity index 78% rename from tests/tests/Makefile rename to tests/Makefile index f2c9d0e7f922e7ab1af18d79a3479b1e9300cf1c..11468b9201f8ea2ffa5e6fed97688e9b2ec60acb 100644 --- a/tests/tests/Makefile +++ b/tests/Makefile @@ -1,12 +1,18 @@ +count = 1000 + coverage: build coverage/hpc_index.html -build: +build: text-test-data cabal configure -fhpc cabal build +text-test-data: + hg clone https://bitbucket.org/bos/text-test-data + $(MAKE) -C text-test-data + coverage/text-tests.tix: -mkdir -p coverage - ./dist/build/text-tests/text-tests + ./dist/build/text-tests/text-tests -a $(count) mv text-tests.tix $@ coverage/text-tests-stdio.tix: diff --git a/tests/README.markdown b/tests/README.markdown deleted file mode 100644 index 66d33ac3981d4e3cd370fbfa1ccd9de32f286e41..0000000000000000000000000000000000000000 --- a/tests/README.markdown +++ /dev/null @@ -1,63 +0,0 @@ -Tests -===== - -This directory contains the tests for the Text library. To run these -tests, you will need the test data from one of the following -locations: - -* Mercurial master repository: - [bitbucket.org/bos/text-test-data](https://bitbucket.org/bos/text-test-data) - -* Git mirror repository: - [github.com/bos/text-test-data](https://github.com/bos/text-test-data) - -You should clone that repository to the same directory as this `README` -file, then run `make` to uncompress the test files. Many tests will -fail if the test files are missing. - -There are two categories of tests: functional tests (including QuickCheck -properties), and benchmarks. - -Functional tests ----------------- - -The functional tests are located in the `tests` subdirectory. An overview of -what's in that directory: - - scripts Various utility scripts - src Source files of the testing code - text-tests.cabal Cabal file which compiles all benchmarks - Makefile Has targets for common tasks - -The `text-tests.cabal` builds: - -- A copy of the text library, sharing the source code, but exposing all internal - modules, for testing purposes -- The different test suites - -To compile, run all tests, and generate a coverage report, simply use `make`. - -Benchmarks ----------- - -The benchmarks are located in the `benchmarks` subdirectory. An overview of -what's in that directory: - - python Python implementations of some benchmarks - ruby Ruby implementations of some benchmarks - src Source files of the haskell benchmarks - benchmarks.cabal Cabal file which compiles all benchmarks - Makefile Has targets for common tasks - -To compile the benchmarks, navigate to the `benchmarks` subdirectory and run -`cabal configure && cabal build`. Then, you can run the benchmarks using: - - ./dist/build/benchmarks/benchmarks - -However, since there quite a lot of benchmarks, you usually don't want to run -them all. Instead, use the `-l` flag to get a list of benchmarks: - - ./dist/build/benchmarks/benchmarks - -And run the ones you want to inspect. If you want to configure the benchmarks -further, the exact parameters can be changed in `src/Data/Text/Benchmarks.hs`. diff --git a/tests/tests/src/Data/Text/Tests.hs b/tests/Tests.hs similarity index 64% rename from tests/tests/src/Data/Text/Tests.hs rename to tests/Tests.hs index 3bb1a51c0b51ea260857649537addb2b84162db1..fb97ff4adc08eac99f5682de9c191a14481796e9 100644 --- a/tests/tests/src/Data/Text/Tests.hs +++ b/tests/Tests.hs @@ -6,8 +6,8 @@ module Main import Test.Framework (defaultMain) -import qualified Data.Text.Tests.Properties as Properties -import qualified Data.Text.Tests.Regressions as Regressions +import qualified Tests.Properties as Properties +import qualified Tests.Regressions as Regressions main :: IO () main = defaultMain [Properties.tests, Regressions.tests] diff --git a/tests/tests/src/Data/Text/Tests/IO.hs b/tests/Tests/IO.hs similarity index 100% rename from tests/tests/src/Data/Text/Tests/IO.hs rename to tests/Tests/IO.hs diff --git a/tests/tests/src/Data/Text/Tests/Properties.hs b/tests/Tests/Properties.hs similarity index 90% rename from tests/tests/src/Data/Text/Tests/Properties.hs rename to tests/Tests/Properties.hs index ef6cb4a876b96bceb2cafc8bc4c34a33dc14c1d7..55974b640d74759dae1fe1e4b8491e0363c47b2b 100644 --- a/tests/tests/src/Data/Text/Tests/Properties.hs +++ b/tests/Tests/Properties.hs @@ -1,10 +1,10 @@ -- | General quicktest properties for the text library -- -{-# LANGUAGE BangPatterns, FlexibleInstances, OverloadedStrings, - ScopedTypeVariables, TypeSynonymInstances, CPP #-} +{-# LANGUAGE BangPatterns, CPP, FlexibleInstances, OverloadedStrings, + ScopedTypeVariables, TypeSynonymInstances #-} {-# OPTIONS_GHC -fno-enable-rewrite-rules #-} {-# OPTIONS_GHC -fno-warn-missing-signatures #-} -module Data.Text.Tests.Properties +module Tests.Properties ( tests ) where @@ -16,6 +16,7 @@ import Text.Show.Functions () import Control.Arrow ((***), second) import Control.Exception (catch) import Data.Char (chr, isDigit, isHexDigit, isLower, isSpace, isUpper, ord) +import Data.Int (Int8, Int16, Int32, Int64) import Data.Monoid (Monoid(..)) import Data.String (fromString) import Data.Text.Encoding.Error @@ -24,13 +25,13 @@ import Data.Text.Fusion.Size import Data.Text.Lazy.Read as TL import Data.Text.Read as T import Data.Text.Search (indices) -import Data.Word (Word8, Word16, Word32) +import Data.Word (Word, Word8, Word16, Word32, Word64) import Numeric (showHex) -import Prelude hiding (catch, replicate) import Test.Framework (Test, testGroup) import Test.Framework.Providers.QuickCheck2 (testProperty) import qualified Data.Bits as Bits (shiftL, shiftR) import qualified Data.ByteString as B +import qualified Data.ByteString.Lazy as BL import qualified Data.List as L import qualified Data.Text as T import qualified Data.Text.Encoding as E @@ -39,6 +40,8 @@ import qualified Data.Text.Fusion.Common as S import qualified Data.Text.IO as T import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Builder as TB +import qualified Data.Text.Lazy.Builder.Int as TB +import qualified Data.Text.Lazy.Builder.RealFloat as TB import qualified Data.Text.Lazy.Encoding as EL import qualified Data.Text.Lazy.Fusion as SL import qualified Data.Text.Lazy.IO as TL @@ -46,9 +49,15 @@ import qualified Data.Text.Lazy.Search as S (indices) import qualified Data.Text.UnsafeShift as U import qualified System.IO as IO -import Data.Text.Tests.QuickCheckUtils -import Data.Text.Tests.Utils -import qualified Data.Text.Tests.SlowFunctions as Slow +import Tests.QuickCheckUtils +import Tests.Utils +import qualified Tests.SlowFunctions as Slow + +#if MIN_VERSION_base(4,6,0) +import Prelude hiding (replicate) +#else +import Prelude hiding (catch, replicate) +#endif t_pack_unpack = (T.unpack . T.pack) `eq` id tl_pack_unpack = (TL.unpack . TL.pack) `eq` id @@ -62,10 +71,20 @@ tl_unstreamChunks x = f 11 x == f 1000 x tl_chunk_unchunk = (TL.fromChunks . TL.toChunks) `eq` id tl_from_to_strict = (TL.fromStrict . TL.toStrict) `eq` id +-- Note: this silently truncates code-points > 255 to 8-bit due to 'B.pack' +encodeL1 :: T.Text -> B.ByteString +encodeL1 = B.pack . map (fromIntegral . fromEnum) . T.unpack +encodeLazyL1 :: TL.Text -> BL.ByteString +encodeLazyL1 = BL.fromChunks . map encodeL1 . TL.toChunks + t_ascii t = E.decodeASCII (E.encodeUtf8 a) == a where a = T.map (\c -> chr (ord c `mod` 128)) t tl_ascii t = EL.decodeASCII (EL.encodeUtf8 a) == a where a = TL.map (\c -> chr (ord c `mod` 128)) t +t_latin1 t = E.decodeLatin1 (encodeL1 a) == a + where a = T.map (\c -> chr (ord c `mod` 256)) t +tl_latin1 t = EL.decodeLatin1 (encodeLazyL1 a) == a + where a = TL.map (\c -> chr (ord c `mod` 256)) t t_utf8 = forAll genUnicode $ (E.decodeUtf8 . E.encodeUtf8) `eq` id t_utf8' = forAll genUnicode $ (E.decodeUtf8' . E.encodeUtf8) `eq` (id . Right) tl_utf8 = forAll genUnicode $ (EL.decodeUtf8 . EL.encodeUtf8) `eq` id @@ -79,6 +98,18 @@ tl_utf32LE = forAll genUnicode $ (EL.decodeUtf32LE . EL.encodeUtf32LE) `eq` id t_utf32BE = forAll genUnicode $ (E.decodeUtf32BE . E.encodeUtf32BE) `eq` id tl_utf32BE = forAll genUnicode $ (EL.decodeUtf32BE . EL.encodeUtf32BE) `eq` id +t_utf8_incr = do + Positive n <- arbitrary + forAll genUnicode $ recode n `eq` id + where recode n = T.concat . feedChunksOf n E.streamDecodeUtf8 . E.encodeUtf8 + feedChunksOf :: Int -> (B.ByteString -> E.Decoding) -> B.ByteString + -> [T.Text] + feedChunksOf n f bs + | B.null bs = [] + | otherwise = let (a,b) = B.splitAt n bs + E.Some t _ f' = f a + in t : feedChunksOf n f' b + -- This is a poor attempt to ensure that the error handling paths on -- decode are exercised in some way. Proper testing would be rather -- more involved. @@ -610,17 +641,55 @@ shiftR_Word32 = shiftR :: Word32 -> Property -- Builder. -t_builderSingleton = id `eqP` - (unpackS . TB.toLazyText . mconcat . map TB.singleton) -t_builderFromText = L.concat `eq` (unpackS . TB.toLazyText . mconcat . +tb_singleton = id `eqP` + (unpackS . TB.toLazyText . mconcat . map TB.singleton) +tb_fromText = L.concat `eq` (unpackS . TB.toLazyText . mconcat . map (TB.fromText . packS)) -t_builderAssociative s1 s2 s3 = +tb_associative s1 s2 s3 = TB.toLazyText (b1 `mappend` (b2 `mappend` b3)) == TB.toLazyText ((b1 `mappend` b2) `mappend` b3) where b1 = TB.fromText (packS s1) b2 = TB.fromText (packS s2) b3 = TB.fromText (packS s3) +-- Numeric builder stuff. + +tb_decimal :: (Integral a, Show a) => a -> Bool +tb_decimal = (TB.toLazyText . TB.decimal) `eq` (TL.pack . show) + +tb_decimal_integer (a::Integer) = tb_decimal a +tb_decimal_int (a::Int) = tb_decimal a +tb_decimal_int8 (a::Int8) = tb_decimal a +tb_decimal_int16 (a::Int16) = tb_decimal a +tb_decimal_int32 (a::Int32) = tb_decimal a +tb_decimal_int64 (a::Int64) = tb_decimal a +tb_decimal_word (a::Word) = tb_decimal a +tb_decimal_word8 (a::Word8) = tb_decimal a +tb_decimal_word16 (a::Word16) = tb_decimal a +tb_decimal_word32 (a::Word32) = tb_decimal a +tb_decimal_word64 (a::Word64) = tb_decimal a + +tb_hex :: (Integral a, Show a) => a -> Bool +tb_hex = (TB.toLazyText . TB.hexadecimal) `eq` (TL.pack . flip showHex "") + +tb_hexadecimal_integer (a::Integer) = tb_hex a +tb_hexadecimal_int (a::Int) = tb_hex a +tb_hexadecimal_int8 (a::Int8) = tb_hex a +tb_hexadecimal_int16 (a::Int16) = tb_hex a +tb_hexadecimal_int32 (a::Int32) = tb_hex a +tb_hexadecimal_int64 (a::Int64) = tb_hex a +tb_hexadecimal_word (a::Word) = tb_hex a +tb_hexadecimal_word8 (a::Word8) = tb_hex a +tb_hexadecimal_word16 (a::Word16) = tb_hex a +tb_hexadecimal_word32 (a::Word32) = tb_hex a +tb_hexadecimal_word64 (a::Word64) = tb_hex a + +tb_realfloat :: (RealFloat a, Show a) => a -> Bool +tb_realfloat = (TB.toLazyText . TB.realFloat) `eq` (TL.pack . show) + +tb_realfloat_float (a::Float) = tb_realfloat a +tb_realfloat_double (a::Double) = tb_realfloat a + -- Reading. t_decimal (n::Int) s = @@ -681,6 +750,8 @@ t_take_drop_16 m t = T.append (takeWord16 n t) (dropWord16 n t) == t where n = small m t_use_from t = monadicIO $ assert . (==t) =<< run (useAsPtr t fromPtr) +t_copy t = T.copy t == t + -- Regression tests. s_filter_eq s = S.filter p t == S.streamList (filter p s) where p = (/= S.last t) @@ -691,7 +762,7 @@ s_filter_eq s = S.filter p t == S.streamList (filter p s) -- themselves. shorten :: Int -> S.Stream a -> S.Stream a shorten n t@(S.Stream arr off len) - | n > 0 = S.Stream arr off (smaller (exactSize n) len) + | n > 0 = S.Stream arr off (smaller (exactSize n) len) | otherwise = t tests :: Test @@ -713,8 +784,11 @@ tests = testGroup "transcoding" [ testProperty "t_ascii" t_ascii, testProperty "tl_ascii" tl_ascii, + testProperty "t_latin1" t_latin1, + testProperty "tl_latin1" tl_latin1, testProperty "t_utf8" t_utf8, testProperty "t_utf8'" t_utf8', + testProperty "t_utf8_incr" t_utf8_incr, testProperty "tl_utf8" tl_utf8, testProperty "tl_utf8'" tl_utf8', testProperty "t_utf16LE" t_utf16LE, @@ -1080,9 +1154,39 @@ tests = ], testGroup "builder" [ - testProperty "t_builderSingleton" t_builderSingleton, - testProperty "t_builderFromText" t_builderFromText, - testProperty "t_builderAssociative" t_builderAssociative + testProperty "tb_associative" tb_associative, + testGroup "decimal" [ + testProperty "tb_decimal_int" tb_decimal_int, + testProperty "tb_decimal_int8" tb_decimal_int8, + testProperty "tb_decimal_int16" tb_decimal_int16, + testProperty "tb_decimal_int32" tb_decimal_int32, + testProperty "tb_decimal_int64" tb_decimal_int64, + testProperty "tb_decimal_integer" tb_decimal_integer, + testProperty "tb_decimal_word" tb_decimal_word, + testProperty "tb_decimal_word8" tb_decimal_word8, + testProperty "tb_decimal_word16" tb_decimal_word16, + testProperty "tb_decimal_word32" tb_decimal_word32, + testProperty "tb_decimal_word64" tb_decimal_word64 + ], + testGroup "hexadecimal" [ + testProperty "tb_hexadecimal_int" tb_hexadecimal_int, + testProperty "tb_hexadecimal_int8" tb_hexadecimal_int8, + testProperty "tb_hexadecimal_int16" tb_hexadecimal_int16, + testProperty "tb_hexadecimal_int32" tb_hexadecimal_int32, + testProperty "tb_hexadecimal_int64" tb_hexadecimal_int64, + testProperty "tb_hexadecimal_integer" tb_hexadecimal_integer, + testProperty "tb_hexadecimal_word" tb_hexadecimal_word, + testProperty "tb_hexadecimal_word8" tb_hexadecimal_word8, + testProperty "tb_hexadecimal_word16" tb_hexadecimal_word16, + testProperty "tb_hexadecimal_word32" tb_hexadecimal_word32, + testProperty "tb_hexadecimal_word64" tb_hexadecimal_word64 + ], + testGroup "realfloat" [ + testProperty "tb_realfloat_double" tb_realfloat_double, + testProperty "tb_realfloat_float" tb_realfloat_float + ], + testProperty "tb_fromText" tb_fromText, + testProperty "tb_singleton" tb_singleton ], testGroup "read" [ @@ -1113,6 +1217,7 @@ tests = testProperty "t_dropWord16" t_dropWord16, testProperty "t_takeWord16" t_takeWord16, testProperty "t_take_drop_16" t_take_drop_16, - testProperty "t_use_from" t_use_from + testProperty "t_use_from" t_use_from, + testProperty "t_copy" t_copy ] ] diff --git a/tests/tests/src/Data/Text/Tests/QuickCheckUtils.hs b/tests/Tests/QuickCheckUtils.hs similarity index 98% rename from tests/tests/src/Data/Text/Tests/QuickCheckUtils.hs rename to tests/Tests/QuickCheckUtils.hs index e6449ffaed5516874ad8f2e96ed88f274dec7e87..00d4d3fa2d2992e4602bed95e3c35138ddf38b2a 100644 --- a/tests/tests/src/Data/Text/Tests/QuickCheckUtils.hs +++ b/tests/Tests/QuickCheckUtils.hs @@ -1,10 +1,10 @@ -- | This module provides quickcheck utilities, e.g. arbitrary and show -- instances, and comparison functions, so we can focus on the actual properties --- in the 'Data.Text.Tests.Properties' module. +-- in the 'Tests.Properties' module. -- {-# LANGUAGE CPP, FlexibleInstances, TypeSynonymInstances #-} {-# OPTIONS_GHC -fno-warn-orphans #-} -module Data.Text.Tests.QuickCheckUtils +module Tests.QuickCheckUtils ( genUnicode , unsquare @@ -24,7 +24,7 @@ module Data.Text.Tests.QuickCheckUtils , eqP , Encoding (..) - + , write_read ) where @@ -50,7 +50,7 @@ import qualified Data.Text.Lazy.Fusion as TLF import qualified Data.Text.Lazy.Internal as TL import qualified System.IO as IO -import Data.Text.Tests.Utils +import Tests.Utils instance Random I16 where randomR = integralRandomR @@ -67,7 +67,7 @@ genUnicode = fmap fromString string where string = sized $ \n -> do k <- choose (0,n) sequence [ char | _ <- [1..k] ] - + excluding :: [a -> Bool] -> Gen a -> Gen a excluding bad gen = loop where @@ -76,14 +76,14 @@ genUnicode = fmap fromString string where if or (map ($ x) bad) then loop else return x - + reserved = [lowSurrogate, highSurrogate, noncharacter] lowSurrogate c = c >= 0xDC00 && c <= 0xDFFF highSurrogate c = c >= 0xD800 && c <= 0xDBFF noncharacter c = masked == 0xFFFE || masked == 0xFFFF where - masked = c .&. 0xFFFF - + masked = c .&. 0xFFFF + ascii = choose (0,0x7F) plane0 = choose (0xF0, 0xFFFF) plane1 = oneof [ choose (0x10000, 0x10FFF) @@ -109,7 +109,7 @@ genUnicode = fmap fromString string where ] plane14 = choose (0xE0000, 0xE0FFF) planes = [ascii, plane0, plane1, plane2, plane14] - + char = chr `fmap` excluding reserved (oneof planes) -- For tests that have O(n^2) running times or input sizes, resize diff --git a/tests/tests/src/Data/Text/Tests/Regressions.hs b/tests/Tests/Regressions.hs similarity index 62% rename from tests/tests/src/Data/Text/Tests/Regressions.hs rename to tests/Tests/Regressions.hs index a80c616afafbcdfc27e6cad21a0bfb0317f091e8..5d2db97626db2b654f533047ddbae171cdb60408 100644 --- a/tests/tests/src/Data/Text/Tests/Regressions.hs +++ b/tests/Tests/Regressions.hs @@ -1,24 +1,27 @@ -- | Regression tests for specific bugs. -- {-# LANGUAGE OverloadedStrings, ScopedTypeVariables #-} -module Data.Text.Tests.Regressions +module Tests.Regressions ( tests ) where import Control.Exception (SomeException, handle) import System.IO -import Test.HUnit (assertFailure) +import Test.HUnit (assertBool, assertEqual, assertFailure) import qualified Data.ByteString as B +import Data.ByteString.Char8 () import qualified Data.ByteString.Lazy as LB import qualified Data.Text as T +import qualified Data.Text.Encoding as TE import qualified Data.Text.IO as T import qualified Data.Text.Lazy as LT import qualified Data.Text.Lazy.Encoding as LE +import qualified Data.Text.Unsafe as T import qualified Test.Framework as F import qualified Test.Framework.Providers.HUnit as F -import Data.Text.Tests.Utils (withTempFile) +import Tests.Utils (withTempFile) -- Reported by Michael Snoyman: UTF-8 encoding a large lazy bytestring -- caused either a segfault or attempt to allocate a negative number @@ -49,9 +52,31 @@ replicate_crash = handle (\(_::SomeException) -> return ()) $ power | maxBound == (2147483647::Int) = 28 | otherwise = 60 :: Int +-- Reported by John Millikin: a UTF-8 decode error handler could +-- return a bogus substitution character, which we would write without +-- checking. +utf8_decode_unsafe :: IO () +utf8_decode_unsafe = do + let t = TE.decodeUtf8With (\_ _ -> Just '\xdc00') "\x80" + assertBool "broken error recovery shouldn't break us" (t == "\xfffd") + +-- Reported by Eric Seidel: we mishandled mapping Chars that fit in a +-- single Word16 to Chars that require two. +mapAccumL_resize :: IO () +mapAccumL_resize = do + let f a _ = (a, '\65536') + count = 5 + val = T.mapAccumL f (0::Int) (T.replicate count "a") + assertEqual "mapAccumL should correctly fill buffers for two-word results" + (0, T.replicate count "\65536") val + assertEqual "mapAccumL should correctly size buffers for two-word results" + (count * 2) (T.lengthWord16 (snd val)) + tests :: F.Test tests = F.testGroup "Regressions" [ F.testCase "hGetContents_crash" hGetContents_crash , F.testCase "lazy_encode_crash" lazy_encode_crash + , F.testCase "mapAccumL_resize" mapAccumL_resize , F.testCase "replicate_crash" replicate_crash + , F.testCase "utf8_decode_unsafe" utf8_decode_unsafe ] diff --git a/tests/tests/src/Data/Text/Tests/SlowFunctions.hs b/tests/Tests/SlowFunctions.hs similarity index 97% rename from tests/tests/src/Data/Text/Tests/SlowFunctions.hs rename to tests/Tests/SlowFunctions.hs index f6d21a156339021db828df24d86240757c1552b6..2d0b9a3cdaf4c3eb707b78632e829887c4abb29e 100644 --- a/tests/tests/src/Data/Text/Tests/SlowFunctions.hs +++ b/tests/Tests/SlowFunctions.hs @@ -1,5 +1,5 @@ {-# LANGUAGE BangPatterns #-} -module Data.Text.Tests.SlowFunctions +module Tests.SlowFunctions ( indices , splitOn diff --git a/tests/tests/src/Data/Text/Tests/Utils.hs b/tests/Tests/Utils.hs similarity index 98% rename from tests/tests/src/Data/Text/Tests/Utils.hs rename to tests/Tests/Utils.hs index ed828b5470688fd0fb87f875acecf4dd199a1e4e..34f9acca1fc13cde4010b84ffc02b93fcd960fac 100644 --- a/tests/tests/src/Data/Text/Tests/Utils.hs +++ b/tests/Tests/Utils.hs @@ -1,7 +1,7 @@ -- | Miscellaneous testing utilities -- {-# LANGUAGE ScopedTypeVariables #-} -module Data.Text.Tests.Utils +module Tests.Utils ( (=^=) , withRedirect diff --git a/tests/benchmarks/src/Data/Text/Benchmarks/Builder.hs b/tests/benchmarks/src/Data/Text/Benchmarks/Builder.hs deleted file mode 100644 index 5bb83fed0a2742ec22e76c035933f28ec71cefbc..0000000000000000000000000000000000000000 --- a/tests/benchmarks/src/Data/Text/Benchmarks/Builder.hs +++ /dev/null @@ -1,48 +0,0 @@ --- | Testing the internal builder monoid --- --- Tested in this benchmark: --- --- * Concatenating many small strings using a builder --- -{-# LANGUAGE OverloadedStrings #-} -module Data.Text.Benchmarks.Builder - ( benchmark - ) where - -import Criterion (Benchmark, bgroup, bench, nf) -import Data.Binary.Builder as B -import Data.ByteString.Char8 () -import Data.Monoid (mconcat) -import qualified Blaze.ByteString.Builder as Blaze -import qualified Blaze.ByteString.Builder.Char.Utf8 as Blaze -import qualified Data.ByteString as SB -import qualified Data.ByteString.Lazy as LB -import qualified Data.Text as T -import qualified Data.Text.Lazy as LT -import qualified Data.Text.Lazy.Builder as LTB - -benchmark :: IO Benchmark -benchmark = return $ bgroup "Builder" - [ bench "LazyText" $ nf - (LT.length . LTB.toLazyText . mconcat . map LTB.fromText) texts - , bench "Binary" $ nf - (LB.length . B.toLazyByteString . mconcat . map B.fromByteString) - byteStrings - , bench "Blaze" $ nf - (LB.length . Blaze.toLazyByteString . mconcat . map Blaze.fromString) - strings - ] - -texts :: [T.Text] -texts = take 200000 $ cycle ["foo", "λx", "ç”±ã®"] -{-# NOINLINE texts #-} - --- Note that the non-ascii characters will be chopped -byteStrings :: [SB.ByteString] -byteStrings = take 200000 $ cycle ["foo", "λx", "ç”±ã®"] -{-# NOINLINE byteStrings #-} - --- Note that the non-ascii characters will be chopped -strings :: [String] -strings = take 200000 $ cycle ["foo", "λx", "ç”±ã®"] -{-# NOINLINE strings #-} diff --git a/tests/benchmarks/text-benchmarks.cabal b/tests/benchmarks/text-benchmarks.cabal deleted file mode 100644 index 72eff1453fe74ff82b7b7f0edce3254be55e5134..0000000000000000000000000000000000000000 --- a/tests/benchmarks/text-benchmarks.cabal +++ /dev/null @@ -1,36 +0,0 @@ -name: text-benchmarks -version: 0.0.0.0 -synopsis: Benchmarks for the text package -description: Benchmarks for the text package -homepage: https://bitbucket.org/bos/text -license: BSD3 -license-file: ../../LICENSE -author: Jasper Van der Jeugt <jaspervdj@gmail.com>, - Bryan O'Sullivan <bos@serpentine.com>, - Tom Harper <rtomharper@googlemail.com>, - Duncan Coutts <duncan@haskell.org> -maintainer: jaspervdj@gmail.com -category: Text -build-type: Simple - -cabal-version: >=1.2 - -executable text-benchmarks - hs-source-dirs: src ../.. - c-sources: ../../cbits/cbits.c - cbits/time_iconv.c - main-is: Data/Text/Benchmarks.hs - ghc-options: -Wall -O2 - cpp-options: -DHAVE_DEEPSEQ - build-depends: base >= 4 && < 5, - criterion >= 0.5 && < 0.7, - bytestring >= 0.9 && < 0.10, - deepseq >= 1.1 && < 1.2, - filepath >= 1.1 && < 1.3, - directory >= 1.1 && < 1.2, - containers >= 0.3 && < 0.5, - binary >= 0.5 && < 0.6, - utf8-string >= 0.3 && < 0.4, - blaze-builder >= 0.3 && < 0.4, - bytestring-lexing >= 0.2 && < 0.3, - stringsearch >= 0.3 && < 0.4 diff --git a/tests/tests/scripts/cover-stdio.sh b/tests/scripts/cover-stdio.sh similarity index 100% rename from tests/tests/scripts/cover-stdio.sh rename to tests/scripts/cover-stdio.sh diff --git a/tests/tests/text-tests.cabal b/tests/text-tests.cabal similarity index 72% rename from tests/tests/text-tests.cabal rename to tests/text-tests.cabal index f90640e520098a5219cdcaf24aee6dd9b9df8723..96cb79f831de6b7f776749d8fe2e4ada36c7befe 100644 --- a/tests/tests/text-tests.cabal +++ b/tests/text-tests.cabal @@ -4,7 +4,7 @@ synopsis: Functional tests for the text package description: Functional tests for the text package homepage: https://bitbucket.org/bos/text license: BSD3 -license-file: ../../LICENSE +license-file: ../LICENSE author: Jasper Van der Jeugt <jaspervdj@gmail.com>, Bryan O'Sullivan <bos@serpentine.com>, Tom Harper <rtomharper@googlemail.com>, @@ -20,8 +20,7 @@ flag hpc default: False executable text-tests - hs-source-dirs: src - main-is: Data/Text/Tests.hs + main-is: Tests.hs ghc-options: -Wall -threaded -O0 -rtsopts @@ -35,21 +34,20 @@ executable text-tests -DHAVE_DEEPSEQ build-depends: - text-tests, - base >= 4 && < 5, - bytestring >= 0.9 && < 0.10, - deepseq >= 1.1, - directory >= 1.1 && < 1.2, - random >= 1.0 && < 1.1, - QuickCheck >= 2.4 && < 2.5, - HUnit >= 1.2 && < 1.3, - test-framework >= 0.4 && < 0.5, - test-framework-quickcheck2 >= 0.2 && < 0.3, - test-framework-hunit >= 0.2 && < 0.3 + HUnit >= 1.2, + QuickCheck >= 2.4, + base == 4.*, + bytestring, + deepseq, + directory, + random, + test-framework >= 0.4, + test-framework-hunit >= 0.2, + test-framework-quickcheck2 >= 0.2, + text-tests executable text-tests-stdio - hs-source-dirs: src - main-is: Data/Text/Tests/IO.hs + main-is: Tests/IO.hs ghc-options: -Wall -threaded -rtsopts @@ -64,41 +62,45 @@ executable text-tests-stdio base >= 4 && < 5 library - hs-source-dirs: ../.. - c-sources: ../../cbits/cbits.c + hs-source-dirs: .. + c-sources: ../cbits/cbits.c + include-dirs: ../include exposed-modules: Data.Text Data.Text.Array Data.Text.Encoding Data.Text.Encoding.Error - Data.Text.Foreign - Data.Text.IO - Data.Text.Internal - Data.Text.Lazy - Data.Text.Lazy.Builder - Data.Text.Lazy.Builder.Int - Data.Text.Lazy.Builder.RealFloat - Data.Text.Lazy.Encoding - Data.Text.Lazy.IO - Data.Text.Lazy.Internal - Data.Text.Lazy.Read - Data.Text.Read Data.Text.Encoding.Fusion Data.Text.Encoding.Fusion.Common Data.Text.Encoding.Utf16 Data.Text.Encoding.Utf32 Data.Text.Encoding.Utf8 + Data.Text.Foreign Data.Text.Fusion Data.Text.Fusion.CaseMapping Data.Text.Fusion.Common Data.Text.Fusion.Internal Data.Text.Fusion.Size + Data.Text.IO Data.Text.IO.Internal + Data.Text.Internal + Data.Text.Lazy + Data.Text.Lazy.Builder Data.Text.Lazy.Builder.Functions + Data.Text.Lazy.Builder.Int + Data.Text.Lazy.Builder.Int.Digits + Data.Text.Lazy.Builder.Internal + Data.Text.Lazy.Builder.RealFloat Data.Text.Lazy.Builder.RealFloat.Functions + Data.Text.Lazy.Encoding Data.Text.Lazy.Encoding.Fusion Data.Text.Lazy.Fusion + Data.Text.Lazy.IO + Data.Text.Lazy.Internal + Data.Text.Lazy.Read Data.Text.Lazy.Search + Data.Text.Private + Data.Text.Read Data.Text.Search Data.Text.Unsafe Data.Text.Unsafe.Base @@ -117,8 +119,8 @@ library build-depends: array, - base >= 4 && < 5, - bytestring >= 0.9 && < 1.0, - deepseq >= 1.1, - integer-gmp >= 0.2 && < 0.3, - ghc-prim >= 0.2 && < 0.3 + base == 4.*, + bytestring, + deepseq, + ghc-prim, + integer-gmp diff --git a/text.cabal b/text.cabal index 7f66e8c8746405f9a14cc2f11a24e30807644de0..01381ab4ad6b4e716a690a9cb495acde186e2678 100644 --- a/text.cabal +++ b/text.cabal @@ -1,9 +1,9 @@ name: text -version: 0.11.2.0 +version: 0.11.5.0 homepage: https://github.com/bos/text bug-reports: https://github.com/bos/text/issues synopsis: An efficient packed Unicode text type. -description: +description: . An efficient packed, immutable Unicode text type (both strict and lazy), with a powerful loop fusion optimization framework. @@ -31,6 +31,15 @@ description: the @text-icu@ package: <http://hackage.haskell.org/package/text-icu> . + —— RELEASE NOTES —— + . + Changes in 0.11.2.0: + . + * String literals are now converted directly from the format in + which GHC stores them into 'Text', without an intermediate + transformation through 'String', and without inlining of + conversion code at each site where a string literal is declared. + . license: BSD3 license-file: LICENSE author: Bryan O'Sullivan <bos@serpentine.com> @@ -40,31 +49,36 @@ category: Data, Text build-type: Simple cabal-version: >= 1.8 extra-source-files: - README.markdown -- scripts/CaseFolding.txt -- scripts/SpecialCasing.txt + README.markdown + benchmarks/Setup.hs + benchmarks/cbits/*.c + benchmarks/haskell/*.hs + benchmarks/haskell/Benchmarks/*.hs + benchmarks/python/*.py + benchmarks/ruby/*.rb + benchmarks/text-benchmarks.cabal scripts/*.hs - tests/README.markdown - tests/benchmarks/Setup.hs - tests/benchmarks/cbits/*.c - tests/benchmarks/python/*.py - tests/benchmarks/ruby/*.rb - tests/benchmarks/src/Data/Text/*.hs - tests/benchmarks/src/Data/Text/Benchmarks/*.hs - tests/benchmarks/text-benchmarks.cabal - tests/tests/.ghci - tests/tests/Makefile - tests/tests/scripts/*.sh - tests/tests/src/Data/Text/*.hs - tests/tests/src/Data/Text/Tests/*.hs - tests/tests/text-tests.cabal + tests-and-benchmarks.markdown + tests/*.hs + tests/.ghci + tests/Makefile + tests/Tests/*.hs + tests/scripts/*.sh + tests/text-tests.cabal flag developer description: operate in developer mode default: False +flag integer-simple + description: Use the simple integer library instead of GMP + default: False + library - c-sources: cbits/cbits.c + c-sources: cbits/cbits.c + include-dirs: include exposed-modules: Data.Text @@ -83,6 +97,7 @@ library Data.Text.Lazy.Internal Data.Text.Lazy.Read Data.Text.Read + Data.Text.Unsafe other-modules: Data.Text.Encoding.Fusion Data.Text.Encoding.Fusion.Common @@ -96,51 +111,46 @@ library Data.Text.Fusion.Size Data.Text.IO.Internal Data.Text.Lazy.Builder.Functions + Data.Text.Lazy.Builder.Int.Digits + Data.Text.Lazy.Builder.Internal Data.Text.Lazy.Builder.RealFloat.Functions Data.Text.Lazy.Encoding.Fusion Data.Text.Lazy.Fusion Data.Text.Lazy.Search Data.Text.Private Data.Text.Search - Data.Text.Unsafe Data.Text.Unsafe.Base Data.Text.UnsafeChar Data.Text.UnsafeShift Data.Text.Util build-depends: - array, - base < 5, - bytestring >= 0.10 && < 1.0 - if impl(ghc >= 6.10) - build-depends: - ghc-prim, base >= 4, deepseq >= 1.1.0.0 - cpp-options: -DHAVE_DEEPSEQ - else - build-depends: extensible-exceptions - extensions: PatternSignatures + array >= 0.3, + base >= 4.2 && < 5, + bytestring >= 0.10.4.0, + deepseq >= 1.1.0.0, + ghc-prim >= 0.2 - ghc-options: -Wall -funbox-strict-fields -O2 - if impl(ghc >= 6.8) - ghc-options: -fwarn-tabs + cpp-options: -DHAVE_DEEPSEQ + ghc-options: -Wall -fwarn-tabs -funbox-strict-fields -O2 if flag(developer) ghc-prof-options: -auto-all ghc-options: -Werror cpp-options: -DASSERTS - if impl(ghc >= 6.11) - cpp-options: -DINTEGER_GMP - build-depends: integer-gmp >= 0.2 && < 0.5 - - if impl(ghc >= 6.9) && impl(ghc < 6.11) + if flag(integer-simple) + cpp-options: -DINTEGER_SIMPLE + build-depends: integer-simple >= 0.1 && < 0.5 + else cpp-options: -DINTEGER_GMP - build-depends: integer >= 0.1 && < 0.2 + build-depends: integer-gmp >= 0.2 test-suite tests type: exitcode-stdio-1.0 - hs-source-dirs: . tests/tests/src - main-is: Data/Text/Tests.hs + hs-source-dirs: tests . + main-is: Tests.hs c-sources: cbits/cbits.c + include-dirs: include ghc-options: -Wall -threaded -O0 -rtsopts @@ -149,16 +159,25 @@ test-suite tests -DASSERTS -DHAVE_DEEPSEQ build-depends: - base >= 4 && < 5, - bytestring >= 0.10 && < 1.0, - deepseq >= 1.1, - directory >= 1.0 && < 1.2, - random >= 1.0 && < 1.1, - QuickCheck >= 2.4 && < 2.5, - HUnit >= 1.2 && < 1.3, - test-framework >= 0.4 && < 0.5, - test-framework-quickcheck2 >= 0.2 && < 0.3, - test-framework-hunit >= 0.2 && < 0.3 + HUnit >= 1.2, + QuickCheck >= 2.4, + array, + base, + bytestring, + deepseq, + directory, + ghc-prim, + random, + test-framework >= 0.4, + test-framework-hunit >= 0.2, + test-framework-quickcheck2 >= 0.2 + + if flag(integer-simple) + cpp-options: -DINTEGER_SIMPLE + build-depends: integer-simple >= 0.1 && < 0.5 + else + cpp-options: -DINTEGER_GMP + build-depends: integer-gmp >= 0.2 source-repository head type: git