Fusion.hs 9.64 KB
Newer Older
1
{-# LANGUAGE BangPatterns, CPP, MagicHash #-}
2

bos's avatar
bos committed
3
-- |
bos's avatar
bos committed
4
-- Module      : Data.Text.Internal.Fusion
bos's avatar
bos committed
5
-- Copyright   : (c) Tom Harper 2008-2009,
bos's avatar
bos committed
6
--               (c) Bryan O'Sullivan 2009-2010,
bos's avatar
bos committed
7
8
9
--               (c) Duncan Coutts 2009
--
-- License     : BSD-style
bos's avatar
bos committed
10
-- Maintainer  : bos@serpentine.com
bos's avatar
bos committed
11
12
13
-- Stability   : experimental
-- Portability : GHC
--
bos's avatar
bos committed
14
15
16
17
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
bos's avatar
bos committed
18
19
-- Text manipulation functions represented as fusible operations over
-- streams.
bos's avatar
bos committed
20
module Data.Text.Internal.Fusion
bos's avatar
bos committed
21
    (
bos's avatar
bos committed
22
    -- * Types
bos's avatar
bos committed
23
24
      Stream(..)
    , Step(..)
bos's avatar
bos committed
25
26

    -- * Creation and elimination
bos's avatar
bos committed
27
28
    , stream
    , unstream
bos's avatar
bos committed
29
    , reverseStream
bos's avatar
bos committed
30

bos's avatar
bos committed
31
    , length
bos's avatar
bos committed
32
33

    -- * Transformations
bos's avatar
bos committed
34
    , reverse
bos's avatar
bos committed
35
36

    -- * Construction
bos's avatar
bos committed
37
    -- ** Scans
bos's avatar
bos committed
38
    , reverseScanr
bos's avatar
bos committed
39

bos's avatar
bos committed
40
41
42
    -- ** Accumulating maps
    , mapAccumL

bos's avatar
bos committed
43
    -- ** Generation and unfolding
bos's avatar
bos committed
44
    , unfoldrN
bos's avatar
bos committed
45
46

    -- * Indexing
bos's avatar
bos committed
47
48
    , index
    , findIndex
bos's avatar
bos committed
49
    , countChar
bos's avatar
bos committed
50
    ) where
51

bos's avatar
bos committed
52
import Prelude (Bool(..), Char, Maybe(..), Monad(..), Int,
53
54
55
                Num(..), Ord(..), ($),
                otherwise)
import Data.Bits (shiftL, shiftR)
bos's avatar
bos committed
56
import Data.Text.Internal (Text(..))
bos's avatar
bos committed
57
import Data.Text.Internal.Private (runText)
58
import Data.Text.Internal.Unsafe.Char (unsafeChr8, unsafeWrite)
59
import qualified Data.Text.Array as A
bos's avatar
bos committed
60
61
62
import qualified Data.Text.Internal.Fusion.Common as S
import Data.Text.Internal.Fusion.Types
import Data.Text.Internal.Fusion.Size
bos's avatar
bos committed
63
import qualified Data.Text.Internal as I
64
import qualified Data.Text.Internal.Encoding.Utf8 as U8
65

66
67
68
69
#if defined(ASSERTS)
import GHC.Stack (HasCallStack)
#endif

70
71
default(Int)

72
73
74
75
76
77
78
-- | /O(n)/ Convert 'Text' into a 'Stream' 'Char'.
--
-- __Properties__
--
-- @'unstream' . 'stream' = 'Data.Function.id'@
--
-- @'stream' . 'unstream' = 'Data.Function.id' @
79
80
81
82
83
stream ::
#if defined(ASSERTS)
  HasCallStack =>
#endif
  Text -> Stream Char
84
stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 2) len)
85
    where
86
      !end = off+len
87
      next !i
88
89
          | i >= end  = Done
          | otherwise = Yield chr (i + l)
90
          where
91
92
93
94
95
96
97
98
99
100
101
            n0 = A.unsafeIndex arr i
            n1 = A.unsafeIndex arr (i + 1)
            n2 = A.unsafeIndex arr (i + 2)
            n3 = A.unsafeIndex arr (i + 3)

            l  = U8.utf8LengthByLeader n0
            chr = case l of
              1 -> unsafeChr8 n0
              2 -> U8.chr2 n0 n1
              3 -> U8.chr3 n0 n1 n2
              _ -> U8.chr4 n0 n1 n2 n3
102
103
{-# INLINE [0] stream #-}

104
105
106
107
108
109
-- | /O(n)/ Converts 'Text' into a 'Stream' 'Char', but iterates
-- backwards through the text.
--
-- __Properties__
--
-- @'unstream' . 'reverseStream' = 'Data.Text.reverse' @
bos's avatar
bos committed
110
reverseStream :: Text -> Stream Char
111
reverseStream (Text arr off len) = Stream next (off+len-1) (betweenSize (len `shiftR` 2) len)
bos's avatar
bos committed
112
113
114
    where
      {-# INLINE next #-}
      next !i
115
116
117
118
119
          | i < off    = Done
          | n0 <  0x80 = Yield (unsafeChr8 n0)       (i - 1)
          | n1 >= 0xC0 = Yield (U8.chr2 n1 n0)       (i - 2)
          | n2 >= 0xC0 = Yield (U8.chr3 n2 n1 n0)    (i - 3)
          | otherwise  = Yield (U8.chr4 n3 n2 n1 n0) (i - 4)
bos's avatar
bos committed
120
          where
121
122
123
124
            n0 = A.unsafeIndex arr i
            n1 = A.unsafeIndex arr (i - 1)
            n2 = A.unsafeIndex arr (i - 2)
            n3 = A.unsafeIndex arr (i - 3)
bos's avatar
bos committed
125
126
{-# INLINE [0] reverseStream #-}

127
128
129
130
131
132
133
-- | /O(n)/ Convert 'Stream' 'Char' into a 'Text'.
--
-- __Properties__
--
-- @'unstream' . 'stream' = 'Data.Function.id'@
--
-- @'stream' . 'unstream' = 'Data.Function.id' @
134
unstream :: Stream Char -> Text
bos's avatar
bos committed
135
unstream (Stream next0 s0 len) = runText $ \done -> do
harendra's avatar
harendra committed
136
  -- Before encoding each char we perform a buffer realloc check assuming
137
  -- worst case encoding size of four 8-bit units for the char. Just add an
harendra's avatar
harendra committed
138
139
  -- extra space to the buffer so that we do not end up reallocating even when
  -- all the chars are encoded as single unit.
140
  let mlen = upperBound 4 len + 3
bos's avatar
bos committed
141
  arr0 <- A.new mlen
harendra's avatar
harendra committed
142
  let outer !arr !maxi = encode
143
       where
harendra's avatar
harendra committed
144
145
146
147
148
149
150
        -- keep the common case loop as small as possible
        encode !si !di =
            case next0 si of
                Done        -> done arr di
                Skip si'    -> encode si' di
                Yield c si'
                    -- simply check for the worst case
151
                    | maxi < di + 3 -> realloc si di
harendra's avatar
harendra committed
152
153
154
155
156
157
158
159
                    | otherwise -> do
                            n <- unsafeWrite arr di c
                            encode si' (di + n)

        -- keep uncommon case separate from the common case code
        {-# NOINLINE realloc #-}
        realloc !si !di = do
            let newlen = (maxi + 1) * 2
160
            arr' <- A.resizeM arr newlen
harendra's avatar
harendra committed
161
162
163
            outer arr' (newlen - 1) si di

  outer arr0 (mlen - 1) s0 0
164
{-# INLINE [0] unstream #-}
165
{-# RULES "STREAM stream/unstream fusion" forall s. stream (unstream s) = s #-}
166

bos's avatar
bos committed
167

168
169
170
-- ----------------------------------------------------------------------------
-- * Basic stream functions

171
172
173
174
175
-- | /O(n)/ Returns the number of characters in a 'Stream'.
--
-- __Properties__
--
-- @'length' . 'stream' = 'Data.Text.length' @
176
length :: Stream Char -> Int
177
length = S.lengthI
178
179
{-# INLINE[0] length #-}

180
181
182
183
184
-- | /O(n)/ Reverse the characters of a 'Stream' returning 'Text'.
--
-- __Properties__
--
-- @'reverse' . 'stream' = 'Data.Text.reverse' @
185
186
187
188
189
reverse ::
#if defined(ASSERTS)
  HasCallStack =>
#endif
  Stream Char -> Text
bos's avatar
bos committed
190
reverse (Stream next s len0)
bos's avatar
bos committed
191
    | isEmpty len0 = I.empty
bos's avatar
bos committed
192
    | otherwise    = I.text arr off' len'
bos's avatar
bos committed
193
  where
bos's avatar
bos committed
194
    len0' = upperBound 4 (larger len0 4)
bos's avatar
bos committed
195
    (arr, (off', len')) = A.run2 (A.new len0' >>= loop s (len0'-1) len0')
bos's avatar
bos committed
196
197
198
199
200
    loop !s0 !i !len marr =
        case next s0 of
          Done -> return (marr, (j, len-j))
              where j = i + 1
          Skip s1    -> loop s1 i len marr
bos's avatar
bos committed
201
          Yield x s1 | i < least -> {-# SCC "reverse/resize" #-} do
202
                       let newLen = len `shiftL` 1
bos's avatar
bos committed
203
                       marr' <- A.new newLen
bos's avatar
bos committed
204
                       A.copyM marr' (newLen-len) marr 0 len
205
206
207
208
209
210
                       _ <- unsafeWrite marr' (len + i - least) x
                       loop s1 (len + i - least - 1) newLen marr'
                     | otherwise -> do
                       _ <- unsafeWrite marr (i - least) x
                       loop s1 (i - least - 1) len marr
            where least = U8.utf8Length x - 1
bos's avatar
bos committed
211
{-# INLINE [0] reverse #-}
212

bos's avatar
bos committed
213
214
-- | /O(n)/ Perform the equivalent of 'scanr' over a list, only with
-- the input and result reversed.
215
216
217
218
--
-- __Properties__
--
-- @'reverse' . 'reverseScanr' f c . 'reverseStream' = 'Data.Text.scanr' f c @
bos's avatar
bos committed
219
reverseScanr :: (Char -> Char -> Char) -> Char -> Stream Char -> Stream Char
bos's avatar
bos committed
220
reverseScanr f z0 (Stream next0 s0 len) = Stream next (Scan1 z0 s0) (len+1) -- HINT maybe too low
bos's avatar
bos committed
221
222
  where
    {-# INLINE next #-}
bos's avatar
bos committed
223
224
225
226
227
228
    next (Scan1 z s) = Yield z (Scan2 z s)
    next (Scan2 z s) = case next0 s of
                         Yield x s' -> let !x' = f x z
                                       in Yield x' (Scan2 x' s')
                         Skip s'    -> Skip (Scan2 z s')
                         Done       -> Done
bos's avatar
bos committed
229
230
{-# INLINE reverseScanr #-}

bos's avatar
bos committed
231
232
233
234
-- | /O(n)/ Like 'unfoldr', 'unfoldrN' builds a stream from a seed
-- value. However, the length of the result is limited by the
-- first argument to 'unfoldrN'. This function is more efficient than
-- 'unfoldr' when the length of the result is known.
235
236
237
238
--
-- __Properties__
--
-- @'unstream' ('unfoldrN' n f a) = 'Data.Text.unfoldrN' n f a @
bos's avatar
bos committed
239
unfoldrN :: Int -> (a -> Maybe (Char,a)) -> a -> Stream Char
240
unfoldrN n = S.unfoldrNI n
bos's avatar
bos committed
241
242
{-# INLINE [0] unfoldrN #-}

243
244
245
-------------------------------------------------------------------------------
-- ** Indexing streams

bos's avatar
bos committed
246
-- | /O(n)/ stream index (subscript) operator, starting from 0.
247
248
249
250
--
-- __Properties__
--
-- @'index' ('stream' t) n  = 'Data.Text.index' t n @
251
index :: Stream Char -> Int -> Char
bos's avatar
bos committed
252
index = S.indexI
253
254
255
256
257
{-# INLINE [0] index #-}

-- | The 'findIndex' function takes a predicate and a stream and
-- returns the index of the first element in the stream
-- satisfying the predicate.
258
259
260
261
--
-- __Properties__
--
-- @'findIndex' p . 'stream'  = 'Data.Text.findIndex' p @
262
findIndex :: (Char -> Bool) -> Stream Char -> Maybe Int
263
findIndex = S.findIndexI
bos's avatar
bos committed
264
265
{-# INLINE [0] findIndex #-}

bos's avatar
bos committed
266
267
-- | /O(n)/ The 'count' function returns the number of times the query
-- element appears in the given stream.
268
269
270
271
--
-- __Properties__
--
-- @'countChar' c . 'stream'  = 'Data.Text.countChar' c @
bos's avatar
bos committed
272
273
274
countChar :: Char -> Stream Char -> Int
countChar = S.countCharI
{-# INLINE [0] countChar #-}
bos's avatar
bos committed
275
276
277
278

-- | /O(n)/ Like a combination of 'map' and 'foldl''. Applies a
-- function to each element of a 'Text', passing an accumulating
-- parameter from left to right, and returns a final 'Text'.
279
280
281
282
--
-- __Properties__
--
-- @'mapAccumL' g z0 . 'stream' = 'Data.Text.mapAccumL' g z0@
283
284
285
286
287
mapAccumL ::
#if defined(ASSERTS)
  HasCallStack =>
#endif
  (a -> Char -> (a,Char)) -> a -> Stream Char -> (a, Text)
bos's avatar
bos committed
288
mapAccumL f z0 (Stream next0 s0 len) = (nz, I.text na 0 nl)
bos's avatar
bos committed
289
  where
bos's avatar
bos committed
290
    (na,(nz,nl)) = A.run2 (A.new mlen >>= \arr -> outer arr mlen z0 s0 0)
bos's avatar
bos committed
291
292
293
294
295
296
297
298
299
300
      where mlen = upperBound 4 len
    outer arr top = loop
      where
        loop !z !s !i =
            case next0 s of
              Done          -> return (arr, (z,i))
              Skip s'       -> loop z s' i
              Yield x s'
                | j >= top  -> {-# SCC "mapAccumL/resize" #-} do
                               let top' = (top + 1) `shiftL` 1
301
                               arr' <- A.resizeM arr top'
bos's avatar
bos committed
302
                               outer arr' top' z s i
303
                | otherwise -> do d <- unsafeWrite arr i c
bos's avatar
bos committed
304
                                  loop z' s' (i+d)
305
                where (z',c) = f z x
306
                      j = i + U8.utf8Length c - 1
bos's avatar
bos committed
307
{-# INLINE [0] mapAccumL #-}