Commit ce9916fb authored by Bodigrim's avatar Bodigrim
Browse files

Speed up strict and lazy search

parent 2f90469a
......@@ -1684,10 +1684,10 @@ findIndex p t = S.findIndex p (stream t)
-- In (unlikely) bad cases, this function's time complexity degrades
-- towards /O(n*m)/.
count :: Text -> Text -> Int
count pat src
count pat
| null pat = emptyError "count"
| isSingleton pat = countChar (unsafeHead pat) src
| otherwise = L.length (indices pat src)
| isSingleton pat = countChar (unsafeHead pat)
| otherwise = L.length . indices pat
{-# INLINE [1] count #-}
{-# RULES
......
{-# LANGUAGE BangPatterns, ScopedTypeVariables #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE UnliftedFFITypes #-}
-- |
-- Module : Data.Text.Lazy.Search
......@@ -27,9 +29,14 @@ import qualified Data.Text.Array as A
import Data.Int (Int64)
import Data.Word (Word8, Word64)
import qualified Data.Text.Internal as T
import qualified Data.Text as T (concat)
import Data.Text.Internal.Fusion.Types (PairS(..))
import Data.Text.Internal.Lazy (Text(..), foldlChunks)
import Data.Text.Internal.Lazy (Text(..), foldrChunks, equal)
import Data.Text.Unsafe (unsafeDupablePerformIO)
import Data.Bits ((.|.), (.&.))
import Foreign.C.Types
import GHC.Exts (ByteArray#)
import System.Posix.Types (CSsize(..))
-- | /O(n+m)/ Find the offsets of all non-overlapping indices of
-- @needle@ within @haystack@.
......@@ -42,110 +49,90 @@ import Data.Bits ((.|.), (.&.))
indices :: Text -- ^ Substring to search for (@needle@)
-> Text -- ^ Text to search in (@haystack@)
-> [Int64]
indices needle@(Chunk n ns) _haystack@(Chunk k ks)
| nlen <= 0 = []
| nlen == 1 = indicesOne (nindex 0) 0 k ks
| otherwise = advance k ks 0 0
indices needle
| nlen <= 0 = const []
| nlen == 1 = indicesOne (A.unsafeIndex narr noff) 0
| otherwise = advance 0 0
where
advance x@(T.Text _ _ l) xs = scan
where
scan !g !i
| i >= m = case xs of
Empty -> []
Chunk y ys -> advance y ys g (i-m)
T.Text narr noff nlen = T.concat (foldrChunks (:) [] needle)
advance !_ !_ Empty = []
advance !(g :: Int64) !(i :: Int) xxs@(Chunk x@(T.Text xarr@(A.ByteArray xarr#) xoff l) xs)
| i >= l = advance g (i - l) xs
| lackingHay (i + nlen) x xs = []
| c == z && candidateMatch 0 = g : scan (g+nlen) (i+nlen)
| otherwise = scan (g+delta) (i+delta)
| c == z && candidateMatch = g : advance (g + intToInt64 nlen) (i + nlen) xxs
| otherwise = advance (g + intToInt64 delta) (i + delta) xxs
where
m = intToInt64 l
c = hindex (i + nlast)
c = index xxs (i + nlast)
delta | nextInPattern = nlen + 1
| c == z = skip + 1
| otherwise = 1
nextInPattern = mask .&. swizzle (hindex (i+nlen)) == 0
candidateMatch !j
| j >= nlast = True
| hindex (i+j) /= nindex j = False
| otherwise = candidateMatch (j+1)
hindex = index x xs
nlen = wordLength needle
| l >= i + nlen = case unsafeDupablePerformIO $
memchr xarr# (intToCSize (xoff + i + nlen)) (intToCSize (l - i - nlen)) z of
-1 -> max 1 (l - i - nlen)
s -> cSsizeToInt s + 1
| otherwise = 1
nextInPattern = mask .&. swizzle (index xxs (i + nlen)) == 0
candidateMatch
| i + nlen <= l = A.equal narr noff xarr (xoff + i) nlen
| otherwise = A.equal narr noff xarr (xoff + i) (l - i) &&
Chunk (T.Text narr (noff + l - i) (nlen - l + i)) Empty `equal` xs
nlast = nlen - 1
nindex = index n ns
z = foldlChunks fin 0 needle
where fin _ (T.Text farr foff flen) = A.unsafeIndex farr (foff+flen-1)
(mask :: Word64) :*: skip = buildTable n ns 0 0 0 (nlen-2)
z = A.unsafeIndex narr (noff + nlen - 1)
(mask :: Word64) :*: skip = buildTable 0 0 0 (nlen-2)
swizzle :: Word8 -> Word64
swizzle w = 1 `unsafeShiftL` (word8ToInt w .&. 0x3f)
buildTable (T.Text xarr xoff xlen) xs = go
where
go !(g::Int64) !i !msk !skp
| i >= xlast = case xs of
Empty -> (msk .|. swizzle z) :*: skp
Chunk y ys -> buildTable y ys g 0 msk' skp'
| otherwise = go (g+1) (i+1) msk' skp'
where c = A.unsafeIndex xarr (xoff+i)
buildTable !g !i !msk !skp
| i >= nlast = (msk .|. swizzle z) :*: skp
| otherwise = buildTable (g+1) (i+1) msk' skp'
where c = A.unsafeIndex narr (noff+i)
msk' = msk .|. swizzle c
skp' | c == z = nlen - g - 2
| otherwise = skp
xlast = xlen - 1
-- | Check whether an attempt to index into the haystack at the
-- given offset would fail.
lackingHay :: Int64 -> T.Text -> Text -> Bool
lackingHay q = go 0
where
go p (T.Text _ _ l) ps = p' < q && case ps of
Empty -> True
Chunk r rs -> go p' r rs
where p' = p + intToInt64 l
indices _ _ = []
lackingHay :: Int -> T.Text -> Text -> Bool
lackingHay q (T.Text _ _ l) ps = l < q && case ps of
Empty -> True
Chunk r rs -> lackingHay (q - l) r rs
-- | Fast index into a partly unpacked 'Text'. We take into account
-- the possibility that the caller might try to access one element
-- past the end.
index :: T.Text -> Text -> Int64 -> Word8
index (T.Text arr off len) xs !i
| j < len = A.unsafeIndex arr (off+j)
| otherwise = case xs of
Empty
-- out of bounds, but legal
| j == len -> 0
-- should never happen, due to lackingHay above
| otherwise -> emptyError "index"
Chunk c cs -> index c cs (i-intToInt64 len)
where j = int64ToInt i
index :: Text -> Int -> Word8
index Empty !_ = 0
index (Chunk (T.Text arr off len) xs) !i
| i < len = A.unsafeIndex arr (off + i)
| otherwise = index xs (i - len)
-- | A variant of 'indices' that scans linearly for a single 'Word8'.
indicesOne :: Word8 -> Int64 -> T.Text -> Text -> [Int64]
indicesOne :: Word8 -> Int64 -> Text -> [Int64]
indicesOne c = chunk
where
chunk :: Int64 -> T.Text -> Text -> [Int64]
chunk !i (T.Text oarr ooff olen) os = go 0
chunk :: Int64 -> Text -> [Int64]
chunk !_ Empty = []
chunk !i (Chunk (T.Text oarr ooff olen) os) = go 0
where
go h | h >= olen = case os of
Empty -> []
Chunk y ys -> chunk (i+intToInt64 olen) y ys
go h | h >= olen = chunk (i+intToInt64 olen) os
| on == c = i + intToInt64 h : go (h+1)
| otherwise = go (h+1)
where on = A.unsafeIndex oarr (ooff+h)
-- | The number of 'Word8' values in a 'Text'.
wordLength :: Text -> Int64
wordLength = foldlChunks sumLength 0
where
sumLength :: Int64 -> T.Text -> Int64
sumLength i (T.Text _ _ l) = i + intToInt64 l
emptyError :: String -> a
emptyError fun = error ("Data.Text.Lazy.Search." ++ fun ++ ": empty input")
intToInt64 :: Int -> Int64
intToInt64 = fromIntegral
int64ToInt :: Int64 -> Int
int64ToInt = fromIntegral
word8ToInt :: Word8 -> Int
word8ToInt = fromIntegral
intToCSize :: Int -> CSize
intToCSize = fromIntegral
cSsizeToInt :: CSsize -> Int
cSsizeToInt = fromIntegral
foreign import ccall unsafe "_hs_text_memchr" memchr
:: ByteArray# -> CSize -> CSize -> Word8 -> IO CSsize
{-# LANGUAGE BangPatterns, ScopedTypeVariables #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE UnliftedFFITypes #-}
-- |
-- Module : Data.Text.Internal.Search
......@@ -35,6 +37,10 @@ import qualified Data.Text.Array as A
import Data.Word (Word64, Word8)
import Data.Text.Internal (Text(..))
import Data.Bits ((.|.), (.&.), unsafeShiftL)
import Data.Text.Unsafe (unsafeDupablePerformIO)
import Foreign.C.Types
import GHC.Exts (ByteArray#)
import System.Posix.Types (CSsize(..))
data T = {-# UNPACK #-} !Word64 :* {-# UNPACK #-} !Int
......@@ -48,47 +54,60 @@ data T = {-# UNPACK #-} !Word64 :* {-# UNPACK #-} !Int
indices :: Text -- ^ Substring to search for (@needle@)
-> Text -- ^ Text to search in (@haystack@)
-> [Int]
indices _needle@(Text narr noff nlen) _haystack@(Text harr hoff hlen)
| nlen == 1 = scanOne (nindex 0)
| nlen <= 0 || ldiff < 0 = []
| otherwise = scan 0
indices (Text narr noff nlen)
| nlen == 1 = scanOne (A.unsafeIndex narr noff)
| nlen <= 0 = const []
| otherwise = scan
where
ldiff = hlen - nlen
nlast = nlen - 1
z = nindex nlast
!z = nindex nlast
nindex k = A.unsafeIndex narr (noff+k)
hindex k = A.unsafeIndex harr (hoff+k)
hindex' k | k == hlen = 0
| otherwise = A.unsafeIndex harr (hoff+k)
buildTable !i !msk !skp
| i >= nlast = (msk .|. swizzle z) :* skp
| otherwise = buildTable (i+1) (msk .|. swizzle c) skp'
where c = nindex i
where !c = nindex i
skp' | c == z = nlen - i - 2
| otherwise = skp
!(mask :* skip) = buildTable 0 0 (nlen-2)
swizzle :: Word8 -> Word64
swizzle k = 1 `unsafeShiftL` (word8ToInt k .&. 0x3f)
swizzle !k = 1 `unsafeShiftL` (word8ToInt k .&. 0x3f)
scan !i
| i > ldiff = []
| c == z && candidateMatch 0 = i : scan (i + nlen)
| otherwise = scan (i + delta)
where c = hindex (i + nlast)
candidateMatch !j
| j >= nlast = True
| hindex (i+j) /= nindex j = False
| otherwise = candidateMatch (j+1)
delta | nextInPattern = nlen + 1
| c == z = skip + 1
| otherwise = 1
where nextInPattern = mask .&. swizzle (hindex' (i+nlen)) == 0
!(mask :* skip) = buildTable 0 0 (nlen-2)
scanOne c = loop 0
where loop !i | i >= hlen = []
| hindex i == c = i : loop (i+1)
| otherwise = loop (i+1)
scan (Text harr@(A.ByteArray harr#) hoff hlen) = loop (hoff + nlen) where
loop !i
| i > hlen + hoff
= []
| A.unsafeIndex harr (i - 1) == z
= if A.equal narr noff harr (i - nlen) nlen
then i - nlen : loop (i + nlen)
else loop (i + skip + 1)
| i == hlen + hoff
= []
| mask .&. swizzle (A.unsafeIndex harr i) == 0
= loop (i + nlen + 1)
| otherwise
= case unsafeDupablePerformIO $ memchr harr# (intToCSize i) (intToCSize (hlen + hoff - i)) z of
-1 -> []
x -> loop (i + cSsizeToInt x + 1)
{-# INLINE indices #-}
scanOne :: Word8 -> Text -> [Int]
scanOne c (Text harr hoff hlen) = loop 0
where
loop !i
| i >= hlen = []
| A.unsafeIndex harr (hoff+i) == c = i : loop (i+1)
| otherwise = loop (i+1)
{-# INLINE scanOne #-}
word8ToInt :: Word8 -> Int
word8ToInt = fromIntegral
intToCSize :: Int -> CSize
intToCSize = fromIntegral
cSsizeToInt :: CSsize -> Int
cSsizeToInt = fromIntegral
foreign import ccall unsafe "_hs_text_memchr" memchr
:: ByteArray# -> CSize -> CSize -> Word8 -> IO CSsize
......@@ -1599,9 +1599,9 @@ index t n = S.index (stream t) n
-- In (unlikely) bad cases, this function's time complexity degrades
-- towards /O(n*m)/.
count :: Text -> Text -> Int64
count pat src
count pat
| null pat = emptyError "count"
| otherwise = go 0 (indices pat src)
| otherwise = go 0 . indices pat
where go !n [] = n
go !n (_:xs) = go (n+1) xs
{-# INLINE [1] count #-}
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment