FastString.lhs 14.1 KB
Newer Older
1
%
2
% (c) The University of Glasgow, 1997-2006
3
%
4 5
\begin{code}
{-
6 7 8 9 10 11 12 13 14 15 16 17
FastString:	A compact, hash-consed, representation of character strings.
		Comparison is O(1), and you can get a Unique from them.
		Generated by the FSLIT macro
		Turn into SDoc with Outputable.ftext

LitString:	Just a wrapper for the Addr# of a C string (Ptr CChar).
		Practically no operations
		Outputing them is fast
		Generated by the SLIT macro
		Turn into SDoc with Outputable.ptext

Use LitString unless you want the facilities of FastString
18
-}
19 20
module FastString
       (
21
	-- * FastStrings
22 23
	FastString(..),     -- not abstract, for now.

24 25 26 27 28 29 30
	-- ** Construction
        mkFastString,
	mkFastStringBytes,
	mkFastStringForeignPtr,
	mkFastString#,
	mkZFastString,
	mkZFastStringBytes,
31

32
	-- ** Deconstruction
33
	unpackFS,	    -- :: FastString -> String
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
	bytesFS,	    -- :: FastString -> [Word8]

	-- ** Encoding
	isZEncoded,
	zEncodeFS,

	-- ** Operations
        uniqueOfFS,
	lengthFS,
	nullFS,
	appendFS,
        headFS,
        tailFS,
	concatFS,
        consFS,
	nilFS,

	-- ** Outputing
        hPutFS,

54 55 56 57
	-- ** Internal
	getFastStringTable,
	hasZEncoding,

58
	-- * LitStrings
59
	LitString, 
60 61
	mkLitString#,
	strLength
62 63
       ) where

64 65 66 67
-- This #define suppresses the "import FastString" that
-- HsVersions otherwise produces
#define COMPILING_FAST_STRING
#include "HsVersions.h"
68

69
import Encoding
70

71 72
import Foreign
import Foreign.C
73 74 75 76
import GHC.Exts
import System.IO.Unsafe ( unsafePerformIO )
import Control.Monad.ST	( stToIO )
import Data.IORef	( IORef, newIORef, readIORef, writeIORef )
77
import System.IO	( hPutBuf )
78
import Data.Maybe	( isJust )
79

80
import GHC.Arr		( STArray(..), newSTArray )
81
import GHC.IOBase	( IO(..) )
82
import GHC.Ptr		( Ptr(..) )
83

84
#define hASH_TBL_SIZE  4091
85 86


87 88 89 90 91
{-|
A 'FastString' is an array of bytes, hashed to support fast O(1)
comparison.  It is also associated with a character encoding, so that
we know how to convert a 'FastString' to the local encoding, or to the
Z-encoding used by the compiler internally.
92

93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
'FastString's support a memoized conversion to the Z-encoding via zEncodeFS.
-}

data FastString = FastString {
      uniq    :: {-# UNPACK #-} !Int,       -- unique id
      n_bytes :: {-# UNPACK #-} !Int,       -- number of bytes
      n_chars :: {-# UNPACK #-} !Int,	  -- number of chars
      buf     :: {-# UNPACK #-} !(ForeignPtr Word8),
      enc     :: FSEncoding
  }

data FSEncoding
  = ZEncoded
 	-- including strings that don't need any encoding
  | UTF8Encoded {-# UNPACK #-} !(IORef (Maybe FastString))
	-- A UTF-8 string with a memoized Z-encoding
109

110 111
instance Eq FastString where
  f1 == f2  =  uniq f1 == uniq f2
112

113
instance Ord FastString where
114
	-- Compares lexicographically, not by unique
115 116 117 118 119 120 121 122 123
    a <= b = case cmpFS a b of { LT -> True;  EQ -> True;  GT -> False }
    a <	 b = case cmpFS a b of { LT -> True;  EQ -> False; GT -> False }
    a >= b = case cmpFS a b of { LT -> False; EQ -> True;  GT -> True  }
    a >	 b = case cmpFS a b of { LT -> False; EQ -> False; GT -> True  }
    max x y | x >= y	=  x
            | otherwise	=  y
    min x y | x <= y	=  x
            | otherwise	=  y
    compare a b = cmpFS a b
124

125 126 127
instance Show FastString where
   show fs = show (unpackFS fs)

128 129 130 131 132 133 134 135 136 137 138 139 140 141
cmpFS :: FastString -> FastString -> Ordering
cmpFS (FastString u1 l1 _ buf1 _) (FastString u2 l2 _ buf2 _) =
  if u1 == u2 then EQ else
  let l = if l1 <= l2 then l1 else l2 in
  inlinePerformIO $
    withForeignPtr buf1 $ \p1 ->
    withForeignPtr buf2 $ \p2 -> do
      res <- memcmp p1 p2 l
      case () of
       _ | res <  0  -> return LT
	 | res == 0  -> if l1 == l2 then return EQ 
				    else if l1 < l2 then return LT
						    else return GT
 	 | otherwise -> return GT
142

143 144 145 146
#ifndef __HADDOCK__
foreign import ccall unsafe "ghc_memcmp" 
  memcmp :: Ptr a -> Ptr b -> Int -> IO Int
#endif
147

148 149
-- -----------------------------------------------------------------------------
-- Construction
150

151
{-
152 153 154 155
Internally, the compiler will maintain a fast string symbol
table, providing sharing and fast comparison. Creation of
new @FastString@s then covertly does a lookup, re-using the
@FastString@ if there was a hit.
156
-}
157 158 159

data FastStringTable = 
 FastStringTable
160
    {-# UNPACK #-} !Int
161
    (MutableArray# RealWorld [FastString])
162

163
string_table :: IORef FastStringTable
164
string_table = 
165 166 167 168 169 170 171 172 173 174 175 176
 unsafePerformIO $ do
   (STArray _ _ arr#) <- stToIO (newSTArray (0::Int,hASH_TBL_SIZE) [])
   newIORef (FastStringTable 0 arr#)

lookupTbl :: FastStringTable -> Int -> IO [FastString]
lookupTbl (FastStringTable _ arr#) (I# i#) =
  IO $ \ s# -> readArray# arr# i# s#

updTbl :: IORef FastStringTable -> FastStringTable -> Int -> [FastString] -> IO ()
updTbl fs_table_var (FastStringTable uid arr#) (I# i#) ls = do
  (IO $ \ s# -> case writeArray# arr# i# ls s# of { s2# -> (# s2#, () #) })
  writeIORef fs_table_var (FastStringTable (uid+1) arr#)
177

178
mkFastString# :: Addr# -> FastString
179 180
mkFastString# a# = mkFastStringBytes ptr (strLength ptr)
  where ptr = Ptr a#
181

182 183 184
mkFastStringBytes :: Ptr Word8 -> Int -> FastString
mkFastStringBytes ptr len = unsafePerformIO $ do
  ft@(FastStringTable uid tbl#) <- readIORef string_table
185
  let
186 187 188 189 190 191 192 193
   h = hashStr ptr len
   add_it ls = do
	fs <- copyNewFastString uid ptr len
	updTbl string_table ft h (fs:ls)
	{- _trace ("new: " ++ show f_str)   $ -}
	return fs
  --
  lookup_result <- lookupTbl ft h
sof's avatar
sof committed
194
  case lookup_result of
195 196 197 198 199 200 201 202 203 204
    [] -> add_it []
    ls -> do
       b <- bucket_match ls len ptr
       case b of
	 Nothing -> add_it ls
	 Just v  -> {- _trace ("re-use: "++show v) $ -} return v

mkZFastStringBytes :: Ptr Word8 -> Int -> FastString
mkZFastStringBytes ptr len = unsafePerformIO $ do
  ft@(FastStringTable uid tbl#) <- readIORef string_table
sof's avatar
sof committed
205
  let
206 207 208 209 210 211 212 213
   h = hashStr ptr len
   add_it ls = do
	fs <- copyNewZFastString uid ptr len
	updTbl string_table ft h (fs:ls)
	{- _trace ("new: " ++ show f_str)   $ -}
	return fs
  --
  lookup_result <- lookupTbl ft h
sof's avatar
sof committed
214
  case lookup_result of
215 216 217 218 219 220 221 222 223 224 225 226 227 228
    [] -> add_it []
    ls -> do
       b <- bucket_match ls len ptr
       case b of
	 Nothing -> add_it ls
	 Just v  -> {- _trace ("re-use: "++show v) $ -} return v

-- | Create a 'FastString' from an existing 'ForeignPtr'; the difference
-- between this and 'mkFastStringBytes' is that we don't have to copy
-- the bytes if the string is new to the table.
mkFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString
mkFastStringForeignPtr ptr fp len = do
  ft@(FastStringTable uid tbl#) <- readIORef string_table
--  _trace ("hashed: "++show (I# h)) $
229
  let
230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258
    h = hashStr ptr len
    add_it ls = do
	fs <- mkNewFastString uid ptr fp len
	updTbl string_table ft h (fs:ls)
	{- _trace ("new: " ++ show f_str)   $ -}
	return fs
  --
  lookup_result <- lookupTbl ft h
  case lookup_result of
    [] -> add_it []
    ls -> do
       b <- bucket_match ls len ptr
       case b of
	 Nothing -> add_it ls
	 Just v  -> {- _trace ("re-use: "++show v) $ -} return v

mkZFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString
mkZFastStringForeignPtr ptr fp len = do
  ft@(FastStringTable uid tbl#) <- readIORef string_table
--  _trace ("hashed: "++show (I# h)) $
  let
    h = hashStr ptr len
    add_it ls = do
	fs <- mkNewZFastString uid ptr fp len
	updTbl string_table ft h (fs:ls)
	{- _trace ("new: " ++ show f_str)   $ -}
	return fs
  --
  lookup_result <- lookupTbl ft h
259
  case lookup_result of
260 261 262 263 264 265
    [] -> add_it []
    ls -> do
       b <- bucket_match ls len ptr
       case b of
	 Nothing -> add_it ls
	 Just v  -> {- _trace ("re-use: "++show v) $ -} return v
266 267


268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330
-- | Creates a UTF-8 encoded 'FastString' from a 'String'
mkFastString :: String -> FastString
mkFastString str = 
  inlinePerformIO $ do
    let l = utf8EncodedLength str
    buf <- mallocForeignPtrBytes l
    withForeignPtr buf $ \ptr -> do
      utf8EncodeString ptr str
      mkFastStringForeignPtr ptr buf l 


-- | Creates a Z-encoded 'FastString' from a 'String'
mkZFastString :: String -> FastString
mkZFastString str = 
  inlinePerformIO $ do
    let l = Prelude.length str
    buf <- mallocForeignPtrBytes l
    withForeignPtr buf $ \ptr -> do
      pokeCAString (castPtr ptr) str
      mkZFastStringForeignPtr ptr buf l 

bucket_match [] _ _ = return Nothing
bucket_match (v@(FastString _ l _ buf _):ls) len ptr
      | len == l  =  do
	 b <- cmpStringPrefix ptr buf len
	 if b then return (Just v)
	      else bucket_match ls len ptr
      | otherwise = 
	 bucket_match ls len ptr

mkNewFastString uid ptr fp len = do
  ref <- newIORef Nothing
  n_chars <- countUTF8Chars ptr len
  return (FastString uid len n_chars fp (UTF8Encoded ref))

mkNewZFastString uid ptr fp len = do
  return (FastString uid len len fp ZEncoded)


copyNewFastString uid ptr len = do
  fp <- copyBytesToForeignPtr ptr len
  ref <- newIORef Nothing
  n_chars <- countUTF8Chars ptr len
  return (FastString uid len n_chars fp (UTF8Encoded ref))

copyNewZFastString uid ptr len = do
  fp <- copyBytesToForeignPtr ptr len
  return (FastString uid len len fp ZEncoded)


copyBytesToForeignPtr ptr len = do
  fp <- mallocForeignPtrBytes len
  withForeignPtr fp $ \ptr' -> copyBytes ptr' ptr len
  return fp

cmpStringPrefix :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO Bool
cmpStringPrefix ptr fp len =
  withForeignPtr fp $ \ptr' -> do
    r <- memcmp ptr ptr' len
    return (r == 0)


hashStr  :: Ptr Word8 -> Int -> Int
331
 -- use the Addr to produce a hash value between 0 & m (inclusive)
332
hashStr (Ptr a#) (I# len#) = loop 0# 0#
333
   where 
334
    loop h n | n ==# len# = I# h
335 336 337
	     | otherwise  = loop h2 (n +# 1#)
	  where c = ord# (indexCharOffAddr# a# n)
		h2 = (c +# (h *# 128#)) `remInt#` hASH_TBL_SIZE#
338

339 340
-- -----------------------------------------------------------------------------
-- Operations
341

342 343 344
-- | Returns the length of the 'FastString' in characters
lengthFS :: FastString -> Int
lengthFS f = n_chars f
345

346 347 348 349
-- | Returns 'True' if the 'FastString' is Z-encoded
isZEncoded :: FastString -> Bool
isZEncoded fs | ZEncoded <- enc fs = True
		| otherwise          = False
350

351 352 353 354 355 356 357 358 359 360 361
-- | Returns 'True' if this 'FastString' is not Z-encoded but already has
-- a Z-encoding cached (used in producing stats).
hasZEncoding :: FastString -> Bool
hasZEncoding fs@(FastString uid n_bytes _ fp enc) =
  case enc of
    ZEncoded -> False
    UTF8Encoded ref ->
      inlinePerformIO $ do
        m <- readIORef ref
	return (isJust m)

362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431
-- | Returns 'True' if the 'FastString' is empty
nullFS :: FastString -> Bool
nullFS f  =  n_bytes f == 0

-- | unpacks and decodes the FastString
unpackFS :: FastString -> String
unpackFS (FastString _ n_bytes _ buf enc) = 
  inlinePerformIO $ withForeignPtr buf $ \ptr ->
    case enc of
	ZEncoded      -> peekCAStringLen (castPtr ptr,n_bytes)
	UTF8Encoded _ -> utf8DecodeString ptr n_bytes

bytesFS :: FastString -> [Word8]
bytesFS (FastString _ n_bytes _ buf enc) = 
  inlinePerformIO $ withForeignPtr buf $ \ptr ->
    peekArray n_bytes ptr

-- | returns a Z-encoded version of a 'FastString'.  This might be the
-- original, if it was already Z-encoded.  The first time this
-- function is applied to a particular 'FastString', the results are
-- memoized.
--
zEncodeFS :: FastString -> FastString
zEncodeFS fs@(FastString uid n_bytes _ fp enc) =
  case enc of
    ZEncoded -> fs
    UTF8Encoded ref ->
      inlinePerformIO $ do
        m <- readIORef ref
        case m of
	  Just fs -> return fs
	  Nothing -> do
            let efs = mkZFastString (zEncodeString (unpackFS fs))
	    writeIORef ref (Just efs)
	    return efs

appendFS :: FastString -> FastString -> FastString
appendFS fs1 fs2 = mkFastString (unpackFS fs1 ++ unpackFS fs2)

concatFS :: [FastString] -> FastString
concatFS ls = mkFastString (Prelude.concat (map unpackFS ls)) -- ToDo: do better

headFS :: FastString -> Char
headFS (FastString _ n_bytes _ buf enc) = 
  inlinePerformIO $ withForeignPtr buf $ \ptr -> do
    case enc of
      ZEncoded -> do 
	 w <- peek (castPtr ptr)
	 return (castCCharToChar w)
      UTF8Encoded _ -> 
	 return (fst (utf8DecodeChar ptr))

tailFS :: FastString -> FastString
tailFS (FastString _ n_bytes _ buf enc) = 
  inlinePerformIO $ withForeignPtr buf $ \ptr -> do
    case enc of
      ZEncoded -> do
	return $! mkZFastStringBytes (ptr `plusPtr` 1) (n_bytes - 1)
      UTF8Encoded _ -> do
	 let (_,ptr') = utf8DecodeChar ptr
	 let off = ptr' `minusPtr` ptr
	 return $! mkFastStringBytes (ptr `plusPtr` off) (n_bytes - off)

consFS :: Char -> FastString -> FastString
consFS c fs = mkFastString (c : unpackFS fs)

uniqueOfFS :: FastString -> Int#
uniqueOfFS (FastString (I# u#) _ _ _ _) = u#

nilFS = mkFastString ""
432

433 434 435 436 437 438
-- -----------------------------------------------------------------------------
-- Stats

getFastStringTable :: IO [[FastString]]
getFastStringTable = do
  tbl <- readIORef string_table
439
  buckets <- mapM (lookupTbl tbl) [0 .. hASH_TBL_SIZE]
440 441
  return buckets

442 443 444
-- -----------------------------------------------------------------------------
-- Outputting 'FastString's

445 446 447 448 449
-- |Outputs a 'FastString' with /no decoding at all/, that is, you
-- get the actual bytes in the 'FastString' written to the 'Handle'.
hPutFS handle (FastString _ len _ fp _)
  | len == 0  = return ()
  | otherwise = do withForeignPtr fp $ \ptr -> hPutBuf handle ptr len
450

451 452
-- ToDo: we'll probably want an hPutFSLocal, or something, to output
-- in the current locale's encoding (for error messages and suchlike).
453

454 455
-- -----------------------------------------------------------------------------
-- LitStrings, here for convenience only.
456

457
type LitString = Ptr ()
458 459

mkLitString# :: Addr# -> LitString
460
mkLitString# a# = Ptr a#
461 462 463 464 465 466 467 468 469 470 471 472

foreign import ccall unsafe "ghc_strlen" 
  strLength :: Ptr () -> Int

-- -----------------------------------------------------------------------------
-- under the carpet

-- Just like unsafePerformIO, but we inline it.
{-# INLINE inlinePerformIO #-}
inlinePerformIO :: IO a -> a
inlinePerformIO (IO m) = case m realWorld# of (# _, r #)   -> r

473
-- NB. does *not* add a '\0'-terminator.
474 475 476
pokeCAString :: Ptr CChar -> String -> IO ()
pokeCAString ptr str =
  let
477
	go [] n     = return ()
478 479 480 481
    	go (c:cs) n = do pokeElemOff ptr n (castCharToCChar c); go cs (n+1)
  in
  go str 0

482
#if __GLASGOW_HASKELL__ < 600
483

484 485 486 487 488 489 490 491 492
mallocForeignPtrBytes :: Int -> IO (ForeignPtr a)
mallocForeignPtrBytes n = do
  r <- mallocBytes n
  newForeignPtr r (finalizerFree r)

foreign import ccall unsafe "stdlib.h free" 
  finalizerFree :: Ptr a -> IO ()

peekCAStringLen = peekCStringLen
493 494 495 496 497

#elif __GLASGOW_HASKELL__ <= 602

peekCAStringLen = peekCStringLen

498
#endif
499
\end{code}