FastString.lhs 14.6 KB
Newer Older
1
%
2
% (c) The University of Glasgow, 1997-2006
3
%
4
\begin{code}
5
{-# OPTIONS -w #-}
6 7 8
-- The above warning supression flag is a temporary kludge.
-- While working on this module you are encouraged to remove it and fix
-- any warnings in the module. See
Ian Lynagh's avatar
Ian Lynagh committed
9
--     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
10 11
-- for details

12
{-
13 14 15 16 17 18 19 20 21 22 23 24
FastString:	A compact, hash-consed, representation of character strings.
		Comparison is O(1), and you can get a Unique from them.
		Generated by the FSLIT macro
		Turn into SDoc with Outputable.ftext

LitString:	Just a wrapper for the Addr# of a C string (Ptr CChar).
		Practically no operations
		Outputing them is fast
		Generated by the SLIT macro
		Turn into SDoc with Outputable.ptext

Use LitString unless you want the facilities of FastString
25
-}
26 27
module FastString
       (
28
	-- * FastStrings
29 30
	FastString(..),     -- not abstract, for now.

31 32 33
	-- ** Construction
        mkFastString,
	mkFastStringBytes,
34
        mkFastStringByteList,
35 36 37 38
	mkFastStringForeignPtr,
	mkFastString#,
	mkZFastString,
	mkZFastStringBytes,
39

40
	-- ** Deconstruction
41
	unpackFS,	    -- :: FastString -> String
42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
	bytesFS,	    -- :: FastString -> [Word8]

	-- ** Encoding
	isZEncoded,
	zEncodeFS,

	-- ** Operations
        uniqueOfFS,
	lengthFS,
	nullFS,
	appendFS,
        headFS,
        tailFS,
	concatFS,
        consFS,
	nilFS,

	-- ** Outputing
        hPutFS,

62 63 64 65
	-- ** Internal
	getFastStringTable,
	hasZEncoding,

66
	-- * LitStrings
67
	LitString, 
68 69
	mkLitString#,
	strLength
70 71
       ) where

72 73 74 75
-- This #define suppresses the "import FastString" that
-- HsVersions otherwise produces
#define COMPILING_FAST_STRING
#include "HsVersions.h"
76

77
import Encoding
78

79 80
import Foreign
import Foreign.C
81 82 83 84
import GHC.Exts
import System.IO.Unsafe ( unsafePerformIO )
import Control.Monad.ST	( stToIO )
import Data.IORef	( IORef, newIORef, readIORef, writeIORef )
85
import System.IO	( hPutBuf )
86
import Data.Maybe	( isJust )
87

88
import GHC.ST
89
import GHC.IOBase	( IO(..) )
90
import GHC.Ptr		( Ptr(..) )
91

92 93
#define hASH_TBL_SIZE          4091
#define hASH_TBL_SIZE_UNBOXED  4091#
94 95


96 97 98 99 100
{-|
A 'FastString' is an array of bytes, hashed to support fast O(1)
comparison.  It is also associated with a character encoding, so that
we know how to convert a 'FastString' to the local encoding, or to the
Z-encoding used by the compiler internally.
101

102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
'FastString's support a memoized conversion to the Z-encoding via zEncodeFS.
-}

data FastString = FastString {
      uniq    :: {-# UNPACK #-} !Int,       -- unique id
      n_bytes :: {-# UNPACK #-} !Int,       -- number of bytes
      n_chars :: {-# UNPACK #-} !Int,	  -- number of chars
      buf     :: {-# UNPACK #-} !(ForeignPtr Word8),
      enc     :: FSEncoding
  }

data FSEncoding
  = ZEncoded
 	-- including strings that don't need any encoding
  | UTF8Encoded {-# UNPACK #-} !(IORef (Maybe FastString))
	-- A UTF-8 string with a memoized Z-encoding
118

119 120
instance Eq FastString where
  f1 == f2  =  uniq f1 == uniq f2
121

122
instance Ord FastString where
123
	-- Compares lexicographically, not by unique
124 125 126 127 128 129 130 131 132
    a <= b = case cmpFS a b of { LT -> True;  EQ -> True;  GT -> False }
    a <	 b = case cmpFS a b of { LT -> True;  EQ -> False; GT -> False }
    a >= b = case cmpFS a b of { LT -> False; EQ -> True;  GT -> True  }
    a >	 b = case cmpFS a b of { LT -> False; EQ -> False; GT -> True  }
    max x y | x >= y	=  x
            | otherwise	=  y
    min x y | x <= y	=  x
            | otherwise	=  y
    compare a b = cmpFS a b
133

134 135 136
instance Show FastString where
   show fs = show (unpackFS fs)

137 138 139
cmpFS :: FastString -> FastString -> Ordering
cmpFS (FastString u1 l1 _ buf1 _) (FastString u2 l2 _ buf2 _) =
  if u1 == u2 then EQ else
Isaac Dupree's avatar
Isaac Dupree committed
140 141 142 143 144 145 146 147 148 149 150
  case unsafeMemcmp buf1 buf2 (min l1 l2) `compare` 0 of
     LT -> LT
     EQ -> compare l1 l2
     GT -> GT

unsafeMemcmp :: ForeignPtr a -> ForeignPtr b -> Int -> Int
unsafeMemcmp buf1 buf2 l =
      inlinePerformIO $
        withForeignPtr buf1 $ \p1 ->
        withForeignPtr buf2 $ \p2 ->
          memcmp p1 p2 l
151

152 153 154 155
#ifndef __HADDOCK__
foreign import ccall unsafe "ghc_memcmp" 
  memcmp :: Ptr a -> Ptr b -> Int -> IO Int
#endif
156

157 158
-- -----------------------------------------------------------------------------
-- Construction
159

160
{-
161 162 163 164
Internally, the compiler will maintain a fast string symbol
table, providing sharing and fast comparison. Creation of
new @FastString@s then covertly does a lookup, re-using the
@FastString@ if there was a hit.
165
-}
166 167 168

data FastStringTable = 
 FastStringTable
169
    {-# UNPACK #-} !Int
170
    (MutableArray# RealWorld [FastString])
171

172
{-# NOINLINE string_table #-}
173
string_table :: IORef FastStringTable
174
string_table = 
175
 unsafePerformIO $ do
176 177 178 179
   tab <- IO $ \s1# -> case newArray# hASH_TBL_SIZE_UNBOXED [] s1# of
                           (# s2#, arr# #) ->
                               (# s2#, FastStringTable 0 arr# #)
   newIORef tab
180 181 182 183 184 185 186 187 188

lookupTbl :: FastStringTable -> Int -> IO [FastString]
lookupTbl (FastStringTable _ arr#) (I# i#) =
  IO $ \ s# -> readArray# arr# i# s#

updTbl :: IORef FastStringTable -> FastStringTable -> Int -> [FastString] -> IO ()
updTbl fs_table_var (FastStringTable uid arr#) (I# i#) ls = do
  (IO $ \ s# -> case writeArray# arr# i# ls s# of { s2# -> (# s2#, () #) })
  writeIORef fs_table_var (FastStringTable (uid+1) arr#)
189

190
mkFastString# :: Addr# -> FastString
191 192
mkFastString# a# = mkFastStringBytes ptr (strLength ptr)
  where ptr = Ptr a#
193

194 195 196
mkFastStringBytes :: Ptr Word8 -> Int -> FastString
mkFastStringBytes ptr len = unsafePerformIO $ do
  ft@(FastStringTable uid tbl#) <- readIORef string_table
197
  let
198 199 200 201 202 203 204 205
   h = hashStr ptr len
   add_it ls = do
	fs <- copyNewFastString uid ptr len
	updTbl string_table ft h (fs:ls)
	{- _trace ("new: " ++ show f_str)   $ -}
	return fs
  --
  lookup_result <- lookupTbl ft h
sof's avatar
sof committed
206
  case lookup_result of
207 208 209 210 211 212 213 214 215 216
    [] -> add_it []
    ls -> do
       b <- bucket_match ls len ptr
       case b of
	 Nothing -> add_it ls
	 Just v  -> {- _trace ("re-use: "++show v) $ -} return v

mkZFastStringBytes :: Ptr Word8 -> Int -> FastString
mkZFastStringBytes ptr len = unsafePerformIO $ do
  ft@(FastStringTable uid tbl#) <- readIORef string_table
sof's avatar
sof committed
217
  let
218 219 220 221 222 223 224 225
   h = hashStr ptr len
   add_it ls = do
	fs <- copyNewZFastString uid ptr len
	updTbl string_table ft h (fs:ls)
	{- _trace ("new: " ++ show f_str)   $ -}
	return fs
  --
  lookup_result <- lookupTbl ft h
sof's avatar
sof committed
226
  case lookup_result of
227 228 229 230 231 232 233 234 235 236 237 238 239 240
    [] -> add_it []
    ls -> do
       b <- bucket_match ls len ptr
       case b of
	 Nothing -> add_it ls
	 Just v  -> {- _trace ("re-use: "++show v) $ -} return v

-- | Create a 'FastString' from an existing 'ForeignPtr'; the difference
-- between this and 'mkFastStringBytes' is that we don't have to copy
-- the bytes if the string is new to the table.
mkFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString
mkFastStringForeignPtr ptr fp len = do
  ft@(FastStringTable uid tbl#) <- readIORef string_table
--  _trace ("hashed: "++show (I# h)) $
241
  let
242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270
    h = hashStr ptr len
    add_it ls = do
	fs <- mkNewFastString uid ptr fp len
	updTbl string_table ft h (fs:ls)
	{- _trace ("new: " ++ show f_str)   $ -}
	return fs
  --
  lookup_result <- lookupTbl ft h
  case lookup_result of
    [] -> add_it []
    ls -> do
       b <- bucket_match ls len ptr
       case b of
	 Nothing -> add_it ls
	 Just v  -> {- _trace ("re-use: "++show v) $ -} return v

mkZFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString
mkZFastStringForeignPtr ptr fp len = do
  ft@(FastStringTable uid tbl#) <- readIORef string_table
--  _trace ("hashed: "++show (I# h)) $
  let
    h = hashStr ptr len
    add_it ls = do
	fs <- mkNewZFastString uid ptr fp len
	updTbl string_table ft h (fs:ls)
	{- _trace ("new: " ++ show f_str)   $ -}
	return fs
  --
  lookup_result <- lookupTbl ft h
271
  case lookup_result of
272 273 274 275 276 277
    [] -> add_it []
    ls -> do
       b <- bucket_match ls len ptr
       case b of
	 Nothing -> add_it ls
	 Just v  -> {- _trace ("re-use: "++show v) $ -} return v
278 279


280 281 282 283 284 285 286 287 288 289
-- | Creates a UTF-8 encoded 'FastString' from a 'String'
mkFastString :: String -> FastString
mkFastString str = 
  inlinePerformIO $ do
    let l = utf8EncodedLength str
    buf <- mallocForeignPtrBytes l
    withForeignPtr buf $ \ptr -> do
      utf8EncodeString ptr str
      mkFastStringForeignPtr ptr buf l 

290 291 292 293 294 295 296 297 298
-- | Creates a 'FastString' from a UTF-8 encoded @[Word8]@
mkFastStringByteList :: [Word8] -> FastString
mkFastStringByteList str = 
  inlinePerformIO $ do
    let l = Prelude.length str
    buf <- mallocForeignPtrBytes l
    withForeignPtr buf $ \ptr -> do
      pokeArray (castPtr ptr) str
      mkFastStringForeignPtr ptr buf l 
299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351

-- | Creates a Z-encoded 'FastString' from a 'String'
mkZFastString :: String -> FastString
mkZFastString str = 
  inlinePerformIO $ do
    let l = Prelude.length str
    buf <- mallocForeignPtrBytes l
    withForeignPtr buf $ \ptr -> do
      pokeCAString (castPtr ptr) str
      mkZFastStringForeignPtr ptr buf l 

bucket_match [] _ _ = return Nothing
bucket_match (v@(FastString _ l _ buf _):ls) len ptr
      | len == l  =  do
	 b <- cmpStringPrefix ptr buf len
	 if b then return (Just v)
	      else bucket_match ls len ptr
      | otherwise = 
	 bucket_match ls len ptr

mkNewFastString uid ptr fp len = do
  ref <- newIORef Nothing
  n_chars <- countUTF8Chars ptr len
  return (FastString uid len n_chars fp (UTF8Encoded ref))

mkNewZFastString uid ptr fp len = do
  return (FastString uid len len fp ZEncoded)


copyNewFastString uid ptr len = do
  fp <- copyBytesToForeignPtr ptr len
  ref <- newIORef Nothing
  n_chars <- countUTF8Chars ptr len
  return (FastString uid len n_chars fp (UTF8Encoded ref))

copyNewZFastString uid ptr len = do
  fp <- copyBytesToForeignPtr ptr len
  return (FastString uid len len fp ZEncoded)


copyBytesToForeignPtr ptr len = do
  fp <- mallocForeignPtrBytes len
  withForeignPtr fp $ \ptr' -> copyBytes ptr' ptr len
  return fp

cmpStringPrefix :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO Bool
cmpStringPrefix ptr fp len =
  withForeignPtr fp $ \ptr' -> do
    r <- memcmp ptr ptr' len
    return (r == 0)


hashStr  :: Ptr Word8 -> Int -> Int
352
 -- use the Addr to produce a hash value between 0 & m (inclusive)
353
hashStr (Ptr a#) (I# len#) = loop 0# 0#
354
   where 
355
    loop h n | n ==# len# = I# h
356 357 358
	     | otherwise  = loop h2 (n +# 1#)
	  where c = ord# (indexCharOffAddr# a# n)
		h2 = (c +# (h *# 128#)) `remInt#` hASH_TBL_SIZE#
359

360 361
-- -----------------------------------------------------------------------------
-- Operations
362

363 364 365
-- | Returns the length of the 'FastString' in characters
lengthFS :: FastString -> Int
lengthFS f = n_chars f
366

367 368 369 370
-- | Returns 'True' if the 'FastString' is Z-encoded
isZEncoded :: FastString -> Bool
isZEncoded fs | ZEncoded <- enc fs = True
		| otherwise          = False
371

372 373 374 375 376 377 378 379 380 381 382
-- | Returns 'True' if this 'FastString' is not Z-encoded but already has
-- a Z-encoding cached (used in producing stats).
hasZEncoding :: FastString -> Bool
hasZEncoding fs@(FastString uid n_bytes _ fp enc) =
  case enc of
    ZEncoded -> False
    UTF8Encoded ref ->
      inlinePerformIO $ do
        m <- readIORef ref
	return (isJust m)

383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452
-- | Returns 'True' if the 'FastString' is empty
nullFS :: FastString -> Bool
nullFS f  =  n_bytes f == 0

-- | unpacks and decodes the FastString
unpackFS :: FastString -> String
unpackFS (FastString _ n_bytes _ buf enc) = 
  inlinePerformIO $ withForeignPtr buf $ \ptr ->
    case enc of
	ZEncoded      -> peekCAStringLen (castPtr ptr,n_bytes)
	UTF8Encoded _ -> utf8DecodeString ptr n_bytes

bytesFS :: FastString -> [Word8]
bytesFS (FastString _ n_bytes _ buf enc) = 
  inlinePerformIO $ withForeignPtr buf $ \ptr ->
    peekArray n_bytes ptr

-- | returns a Z-encoded version of a 'FastString'.  This might be the
-- original, if it was already Z-encoded.  The first time this
-- function is applied to a particular 'FastString', the results are
-- memoized.
--
zEncodeFS :: FastString -> FastString
zEncodeFS fs@(FastString uid n_bytes _ fp enc) =
  case enc of
    ZEncoded -> fs
    UTF8Encoded ref ->
      inlinePerformIO $ do
        m <- readIORef ref
        case m of
	  Just fs -> return fs
	  Nothing -> do
            let efs = mkZFastString (zEncodeString (unpackFS fs))
	    writeIORef ref (Just efs)
	    return efs

appendFS :: FastString -> FastString -> FastString
appendFS fs1 fs2 = mkFastString (unpackFS fs1 ++ unpackFS fs2)

concatFS :: [FastString] -> FastString
concatFS ls = mkFastString (Prelude.concat (map unpackFS ls)) -- ToDo: do better

headFS :: FastString -> Char
headFS (FastString _ n_bytes _ buf enc) = 
  inlinePerformIO $ withForeignPtr buf $ \ptr -> do
    case enc of
      ZEncoded -> do 
	 w <- peek (castPtr ptr)
	 return (castCCharToChar w)
      UTF8Encoded _ -> 
	 return (fst (utf8DecodeChar ptr))

tailFS :: FastString -> FastString
tailFS (FastString _ n_bytes _ buf enc) = 
  inlinePerformIO $ withForeignPtr buf $ \ptr -> do
    case enc of
      ZEncoded -> do
	return $! mkZFastStringBytes (ptr `plusPtr` 1) (n_bytes - 1)
      UTF8Encoded _ -> do
	 let (_,ptr') = utf8DecodeChar ptr
	 let off = ptr' `minusPtr` ptr
	 return $! mkFastStringBytes (ptr `plusPtr` off) (n_bytes - off)

consFS :: Char -> FastString -> FastString
consFS c fs = mkFastString (c : unpackFS fs)

uniqueOfFS :: FastString -> Int#
uniqueOfFS (FastString (I# u#) _ _ _ _) = u#

nilFS = mkFastString ""
453

454 455 456 457 458 459
-- -----------------------------------------------------------------------------
-- Stats

getFastStringTable :: IO [[FastString]]
getFastStringTable = do
  tbl <- readIORef string_table
460
  buckets <- mapM (lookupTbl tbl) [0 .. hASH_TBL_SIZE]
461 462
  return buckets

463 464 465
-- -----------------------------------------------------------------------------
-- Outputting 'FastString's

466 467 468 469 470
-- |Outputs a 'FastString' with /no decoding at all/, that is, you
-- get the actual bytes in the 'FastString' written to the 'Handle'.
hPutFS handle (FastString _ len _ fp _)
  | len == 0  = return ()
  | otherwise = do withForeignPtr fp $ \ptr -> hPutBuf handle ptr len
471

472 473
-- ToDo: we'll probably want an hPutFSLocal, or something, to output
-- in the current locale's encoding (for error messages and suchlike).
474

475 476
-- -----------------------------------------------------------------------------
-- LitStrings, here for convenience only.
477

478
type LitString = Ptr ()
479 480

mkLitString# :: Addr# -> LitString
481
mkLitString# a# = Ptr a#
482 483 484 485 486 487 488 489 490 491 492 493

foreign import ccall unsafe "ghc_strlen" 
  strLength :: Ptr () -> Int

-- -----------------------------------------------------------------------------
-- under the carpet

-- Just like unsafePerformIO, but we inline it.
{-# INLINE inlinePerformIO #-}
inlinePerformIO :: IO a -> a
inlinePerformIO (IO m) = case m realWorld# of (# _, r #)   -> r

494
-- NB. does *not* add a '\0'-terminator.
495 496 497
pokeCAString :: Ptr CChar -> String -> IO ()
pokeCAString ptr str =
  let
498
	go [] n     = return ()
499 500 501 502
    	go (c:cs) n = do pokeElemOff ptr n (castCharToCChar c); go cs (n+1)
  in
  go str 0

503
#if __GLASGOW_HASKELL__ <= 602
504 505
peekCAStringLen = peekCStringLen
#endif
506
\end{code}