FastString.lhs 14.2 KB
Newer Older
1
%
2
% (c) The University of Glasgow, 1997-2006
3
%
4 5
\begin{code}
{-
6 7 8 9 10 11 12 13 14 15 16 17
FastString:	A compact, hash-consed, representation of character strings.
		Comparison is O(1), and you can get a Unique from them.
		Generated by the FSLIT macro
		Turn into SDoc with Outputable.ftext

LitString:	Just a wrapper for the Addr# of a C string (Ptr CChar).
		Practically no operations
		Outputing them is fast
		Generated by the SLIT macro
		Turn into SDoc with Outputable.ptext

Use LitString unless you want the facilities of FastString
18
-}
19 20
module FastString
       (
21
	-- * FastStrings
22 23
	FastString(..),     -- not abstract, for now.

24 25 26
	-- ** Construction
        mkFastString,
	mkFastStringBytes,
27
        mkFastStringByteList,
28 29 30 31
	mkFastStringForeignPtr,
	mkFastString#,
	mkZFastString,
	mkZFastStringBytes,
32

33
	-- ** Deconstruction
34
	unpackFS,	    -- :: FastString -> String
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
	bytesFS,	    -- :: FastString -> [Word8]

	-- ** Encoding
	isZEncoded,
	zEncodeFS,

	-- ** Operations
        uniqueOfFS,
	lengthFS,
	nullFS,
	appendFS,
        headFS,
        tailFS,
	concatFS,
        consFS,
	nilFS,

	-- ** Outputing
        hPutFS,

55 56 57 58
	-- ** Internal
	getFastStringTable,
	hasZEncoding,

59
	-- * LitStrings
60
	LitString, 
61 62
	mkLitString#,
	strLength
63 64
       ) where

65 66 67 68
-- This #define suppresses the "import FastString" that
-- HsVersions otherwise produces
#define COMPILING_FAST_STRING
#include "HsVersions.h"
69

70
import Encoding
71

72 73
import Foreign
import Foreign.C
74 75 76 77
import GHC.Exts
import System.IO.Unsafe ( unsafePerformIO )
import Control.Monad.ST	( stToIO )
import Data.IORef	( IORef, newIORef, readIORef, writeIORef )
78
import System.IO	( hPutBuf )
79
import Data.Maybe	( isJust )
80

81
import GHC.Arr		( STArray(..), newSTArray )
82
import GHC.IOBase	( IO(..) )
83
import GHC.Ptr		( Ptr(..) )
84

85
#define hASH_TBL_SIZE  4091
86 87


88 89 90 91 92
{-|
A 'FastString' is an array of bytes, hashed to support fast O(1)
comparison.  It is also associated with a character encoding, so that
we know how to convert a 'FastString' to the local encoding, or to the
Z-encoding used by the compiler internally.
93

94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
'FastString's support a memoized conversion to the Z-encoding via zEncodeFS.
-}

data FastString = FastString {
      uniq    :: {-# UNPACK #-} !Int,       -- unique id
      n_bytes :: {-# UNPACK #-} !Int,       -- number of bytes
      n_chars :: {-# UNPACK #-} !Int,	  -- number of chars
      buf     :: {-# UNPACK #-} !(ForeignPtr Word8),
      enc     :: FSEncoding
  }

data FSEncoding
  = ZEncoded
 	-- including strings that don't need any encoding
  | UTF8Encoded {-# UNPACK #-} !(IORef (Maybe FastString))
	-- A UTF-8 string with a memoized Z-encoding
110

111 112
instance Eq FastString where
  f1 == f2  =  uniq f1 == uniq f2
113

114
instance Ord FastString where
115
	-- Compares lexicographically, not by unique
116 117 118 119 120 121 122 123 124
    a <= b = case cmpFS a b of { LT -> True;  EQ -> True;  GT -> False }
    a <	 b = case cmpFS a b of { LT -> True;  EQ -> False; GT -> False }
    a >= b = case cmpFS a b of { LT -> False; EQ -> True;  GT -> True  }
    a >	 b = case cmpFS a b of { LT -> False; EQ -> False; GT -> True  }
    max x y | x >= y	=  x
            | otherwise	=  y
    min x y | x <= y	=  x
            | otherwise	=  y
    compare a b = cmpFS a b
125

126 127 128
instance Show FastString where
   show fs = show (unpackFS fs)

129 130 131 132 133 134 135 136
cmpFS :: FastString -> FastString -> Ordering
cmpFS (FastString u1 l1 _ buf1 _) (FastString u2 l2 _ buf2 _) =
  if u1 == u2 then EQ else
  let l = if l1 <= l2 then l1 else l2 in
  inlinePerformIO $
    withForeignPtr buf1 $ \p1 ->
    withForeignPtr buf2 $ \p2 -> do
      res <- memcmp p1 p2 l
137 138 139 140
      return $ case compare res 0 of
                 LT -> LT
                 EQ -> compare l1 l2
                 GT -> GT
141

142 143 144 145
#ifndef __HADDOCK__
foreign import ccall unsafe "ghc_memcmp" 
  memcmp :: Ptr a -> Ptr b -> Int -> IO Int
#endif
146

147 148
-- -----------------------------------------------------------------------------
-- Construction
149

150
{-
151 152 153 154
Internally, the compiler will maintain a fast string symbol
table, providing sharing and fast comparison. Creation of
new @FastString@s then covertly does a lookup, re-using the
@FastString@ if there was a hit.
155
-}
156 157 158

data FastStringTable = 
 FastStringTable
159
    {-# UNPACK #-} !Int
160
    (MutableArray# RealWorld [FastString])
161

162
{-# NOINLINE string_table #-}
163
string_table :: IORef FastStringTable
164
string_table = 
165 166 167 168 169 170 171 172 173 174 175 176
 unsafePerformIO $ do
   (STArray _ _ arr#) <- stToIO (newSTArray (0::Int,hASH_TBL_SIZE) [])
   newIORef (FastStringTable 0 arr#)

lookupTbl :: FastStringTable -> Int -> IO [FastString]
lookupTbl (FastStringTable _ arr#) (I# i#) =
  IO $ \ s# -> readArray# arr# i# s#

updTbl :: IORef FastStringTable -> FastStringTable -> Int -> [FastString] -> IO ()
updTbl fs_table_var (FastStringTable uid arr#) (I# i#) ls = do
  (IO $ \ s# -> case writeArray# arr# i# ls s# of { s2# -> (# s2#, () #) })
  writeIORef fs_table_var (FastStringTable (uid+1) arr#)
177

178
mkFastString# :: Addr# -> FastString
179 180
mkFastString# a# = mkFastStringBytes ptr (strLength ptr)
  where ptr = Ptr a#
181

182 183 184
mkFastStringBytes :: Ptr Word8 -> Int -> FastString
mkFastStringBytes ptr len = unsafePerformIO $ do
  ft@(FastStringTable uid tbl#) <- readIORef string_table
185
  let
186 187 188 189 190 191 192 193
   h = hashStr ptr len
   add_it ls = do
	fs <- copyNewFastString uid ptr len
	updTbl string_table ft h (fs:ls)
	{- _trace ("new: " ++ show f_str)   $ -}
	return fs
  --
  lookup_result <- lookupTbl ft h
sof's avatar
sof committed
194
  case lookup_result of
195 196 197 198 199 200 201 202 203 204
    [] -> add_it []
    ls -> do
       b <- bucket_match ls len ptr
       case b of
	 Nothing -> add_it ls
	 Just v  -> {- _trace ("re-use: "++show v) $ -} return v

mkZFastStringBytes :: Ptr Word8 -> Int -> FastString
mkZFastStringBytes ptr len = unsafePerformIO $ do
  ft@(FastStringTable uid tbl#) <- readIORef string_table
sof's avatar
sof committed
205
  let
206 207 208 209 210 211 212 213
   h = hashStr ptr len
   add_it ls = do
	fs <- copyNewZFastString uid ptr len
	updTbl string_table ft h (fs:ls)
	{- _trace ("new: " ++ show f_str)   $ -}
	return fs
  --
  lookup_result <- lookupTbl ft h
sof's avatar
sof committed
214
  case lookup_result of
215 216 217 218 219 220 221 222 223 224 225 226 227 228
    [] -> add_it []
    ls -> do
       b <- bucket_match ls len ptr
       case b of
	 Nothing -> add_it ls
	 Just v  -> {- _trace ("re-use: "++show v) $ -} return v

-- | Create a 'FastString' from an existing 'ForeignPtr'; the difference
-- between this and 'mkFastStringBytes' is that we don't have to copy
-- the bytes if the string is new to the table.
mkFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString
mkFastStringForeignPtr ptr fp len = do
  ft@(FastStringTable uid tbl#) <- readIORef string_table
--  _trace ("hashed: "++show (I# h)) $
229
  let
230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258
    h = hashStr ptr len
    add_it ls = do
	fs <- mkNewFastString uid ptr fp len
	updTbl string_table ft h (fs:ls)
	{- _trace ("new: " ++ show f_str)   $ -}
	return fs
  --
  lookup_result <- lookupTbl ft h
  case lookup_result of
    [] -> add_it []
    ls -> do
       b <- bucket_match ls len ptr
       case b of
	 Nothing -> add_it ls
	 Just v  -> {- _trace ("re-use: "++show v) $ -} return v

mkZFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString
mkZFastStringForeignPtr ptr fp len = do
  ft@(FastStringTable uid tbl#) <- readIORef string_table
--  _trace ("hashed: "++show (I# h)) $
  let
    h = hashStr ptr len
    add_it ls = do
	fs <- mkNewZFastString uid ptr fp len
	updTbl string_table ft h (fs:ls)
	{- _trace ("new: " ++ show f_str)   $ -}
	return fs
  --
  lookup_result <- lookupTbl ft h
259
  case lookup_result of
260 261 262 263 264 265
    [] -> add_it []
    ls -> do
       b <- bucket_match ls len ptr
       case b of
	 Nothing -> add_it ls
	 Just v  -> {- _trace ("re-use: "++show v) $ -} return v
266 267


268 269 270 271 272 273 274 275 276 277
-- | Creates a UTF-8 encoded 'FastString' from a 'String'
mkFastString :: String -> FastString
mkFastString str = 
  inlinePerformIO $ do
    let l = utf8EncodedLength str
    buf <- mallocForeignPtrBytes l
    withForeignPtr buf $ \ptr -> do
      utf8EncodeString ptr str
      mkFastStringForeignPtr ptr buf l 

278 279 280 281 282 283 284 285 286
-- | Creates a 'FastString' from a UTF-8 encoded @[Word8]@
mkFastStringByteList :: [Word8] -> FastString
mkFastStringByteList str = 
  inlinePerformIO $ do
    let l = Prelude.length str
    buf <- mallocForeignPtrBytes l
    withForeignPtr buf $ \ptr -> do
      pokeArray (castPtr ptr) str
      mkFastStringForeignPtr ptr buf l 
287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339

-- | Creates a Z-encoded 'FastString' from a 'String'
mkZFastString :: String -> FastString
mkZFastString str = 
  inlinePerformIO $ do
    let l = Prelude.length str
    buf <- mallocForeignPtrBytes l
    withForeignPtr buf $ \ptr -> do
      pokeCAString (castPtr ptr) str
      mkZFastStringForeignPtr ptr buf l 

bucket_match [] _ _ = return Nothing
bucket_match (v@(FastString _ l _ buf _):ls) len ptr
      | len == l  =  do
	 b <- cmpStringPrefix ptr buf len
	 if b then return (Just v)
	      else bucket_match ls len ptr
      | otherwise = 
	 bucket_match ls len ptr

mkNewFastString uid ptr fp len = do
  ref <- newIORef Nothing
  n_chars <- countUTF8Chars ptr len
  return (FastString uid len n_chars fp (UTF8Encoded ref))

mkNewZFastString uid ptr fp len = do
  return (FastString uid len len fp ZEncoded)


copyNewFastString uid ptr len = do
  fp <- copyBytesToForeignPtr ptr len
  ref <- newIORef Nothing
  n_chars <- countUTF8Chars ptr len
  return (FastString uid len n_chars fp (UTF8Encoded ref))

copyNewZFastString uid ptr len = do
  fp <- copyBytesToForeignPtr ptr len
  return (FastString uid len len fp ZEncoded)


copyBytesToForeignPtr ptr len = do
  fp <- mallocForeignPtrBytes len
  withForeignPtr fp $ \ptr' -> copyBytes ptr' ptr len
  return fp

cmpStringPrefix :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO Bool
cmpStringPrefix ptr fp len =
  withForeignPtr fp $ \ptr' -> do
    r <- memcmp ptr ptr' len
    return (r == 0)


hashStr  :: Ptr Word8 -> Int -> Int
340
 -- use the Addr to produce a hash value between 0 & m (inclusive)
341
hashStr (Ptr a#) (I# len#) = loop 0# 0#
342
   where 
343
    loop h n | n ==# len# = I# h
344 345 346
	     | otherwise  = loop h2 (n +# 1#)
	  where c = ord# (indexCharOffAddr# a# n)
		h2 = (c +# (h *# 128#)) `remInt#` hASH_TBL_SIZE#
347

348 349
-- -----------------------------------------------------------------------------
-- Operations
350

351 352 353
-- | Returns the length of the 'FastString' in characters
lengthFS :: FastString -> Int
lengthFS f = n_chars f
354

355 356 357 358
-- | Returns 'True' if the 'FastString' is Z-encoded
isZEncoded :: FastString -> Bool
isZEncoded fs | ZEncoded <- enc fs = True
		| otherwise          = False
359

360 361 362 363 364 365 366 367 368 369 370
-- | Returns 'True' if this 'FastString' is not Z-encoded but already has
-- a Z-encoding cached (used in producing stats).
hasZEncoding :: FastString -> Bool
hasZEncoding fs@(FastString uid n_bytes _ fp enc) =
  case enc of
    ZEncoded -> False
    UTF8Encoded ref ->
      inlinePerformIO $ do
        m <- readIORef ref
	return (isJust m)

371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440
-- | Returns 'True' if the 'FastString' is empty
nullFS :: FastString -> Bool
nullFS f  =  n_bytes f == 0

-- | unpacks and decodes the FastString
unpackFS :: FastString -> String
unpackFS (FastString _ n_bytes _ buf enc) = 
  inlinePerformIO $ withForeignPtr buf $ \ptr ->
    case enc of
	ZEncoded      -> peekCAStringLen (castPtr ptr,n_bytes)
	UTF8Encoded _ -> utf8DecodeString ptr n_bytes

bytesFS :: FastString -> [Word8]
bytesFS (FastString _ n_bytes _ buf enc) = 
  inlinePerformIO $ withForeignPtr buf $ \ptr ->
    peekArray n_bytes ptr

-- | returns a Z-encoded version of a 'FastString'.  This might be the
-- original, if it was already Z-encoded.  The first time this
-- function is applied to a particular 'FastString', the results are
-- memoized.
--
zEncodeFS :: FastString -> FastString
zEncodeFS fs@(FastString uid n_bytes _ fp enc) =
  case enc of
    ZEncoded -> fs
    UTF8Encoded ref ->
      inlinePerformIO $ do
        m <- readIORef ref
        case m of
	  Just fs -> return fs
	  Nothing -> do
            let efs = mkZFastString (zEncodeString (unpackFS fs))
	    writeIORef ref (Just efs)
	    return efs

appendFS :: FastString -> FastString -> FastString
appendFS fs1 fs2 = mkFastString (unpackFS fs1 ++ unpackFS fs2)

concatFS :: [FastString] -> FastString
concatFS ls = mkFastString (Prelude.concat (map unpackFS ls)) -- ToDo: do better

headFS :: FastString -> Char
headFS (FastString _ n_bytes _ buf enc) = 
  inlinePerformIO $ withForeignPtr buf $ \ptr -> do
    case enc of
      ZEncoded -> do 
	 w <- peek (castPtr ptr)
	 return (castCCharToChar w)
      UTF8Encoded _ -> 
	 return (fst (utf8DecodeChar ptr))

tailFS :: FastString -> FastString
tailFS (FastString _ n_bytes _ buf enc) = 
  inlinePerformIO $ withForeignPtr buf $ \ptr -> do
    case enc of
      ZEncoded -> do
	return $! mkZFastStringBytes (ptr `plusPtr` 1) (n_bytes - 1)
      UTF8Encoded _ -> do
	 let (_,ptr') = utf8DecodeChar ptr
	 let off = ptr' `minusPtr` ptr
	 return $! mkFastStringBytes (ptr `plusPtr` off) (n_bytes - off)

consFS :: Char -> FastString -> FastString
consFS c fs = mkFastString (c : unpackFS fs)

uniqueOfFS :: FastString -> Int#
uniqueOfFS (FastString (I# u#) _ _ _ _) = u#

nilFS = mkFastString ""
441

442 443 444 445 446 447
-- -----------------------------------------------------------------------------
-- Stats

getFastStringTable :: IO [[FastString]]
getFastStringTable = do
  tbl <- readIORef string_table
448
  buckets <- mapM (lookupTbl tbl) [0 .. hASH_TBL_SIZE]
449 450
  return buckets

451 452 453
-- -----------------------------------------------------------------------------
-- Outputting 'FastString's

454 455 456 457 458
-- |Outputs a 'FastString' with /no decoding at all/, that is, you
-- get the actual bytes in the 'FastString' written to the 'Handle'.
hPutFS handle (FastString _ len _ fp _)
  | len == 0  = return ()
  | otherwise = do withForeignPtr fp $ \ptr -> hPutBuf handle ptr len
459

460 461
-- ToDo: we'll probably want an hPutFSLocal, or something, to output
-- in the current locale's encoding (for error messages and suchlike).
462

463 464
-- -----------------------------------------------------------------------------
-- LitStrings, here for convenience only.
465

466
type LitString = Ptr ()
467 468

mkLitString# :: Addr# -> LitString
469
mkLitString# a# = Ptr a#
470 471 472 473 474 475 476 477 478 479 480 481

foreign import ccall unsafe "ghc_strlen" 
  strLength :: Ptr () -> Int

-- -----------------------------------------------------------------------------
-- under the carpet

-- Just like unsafePerformIO, but we inline it.
{-# INLINE inlinePerformIO #-}
inlinePerformIO :: IO a -> a
inlinePerformIO (IO m) = case m realWorld# of (# _, r #)   -> r

482
-- NB. does *not* add a '\0'-terminator.
483 484 485
pokeCAString :: Ptr CChar -> String -> IO ()
pokeCAString ptr str =
  let
486
	go [] n     = return ()
487 488 489 490
    	go (c:cs) n = do pokeElemOff ptr n (castCharToCChar c); go cs (n+1)
  in
  go str 0

491
#if __GLASGOW_HASKELL__ <= 602
492 493
peekCAStringLen = peekCStringLen
#endif
494
\end{code}