FastString.lhs 16.5 KB
Newer Older
1
%
2
% (c) The University of Glasgow, 1997-2006
3
%
4
\begin{code}
5
{-# OPTIONS -w #-}
6 7 8
-- The above warning supression flag is a temporary kludge.
-- While working on this module you are encouraged to remove it and fix
-- any warnings in the module. See
Ian Lynagh's avatar
Ian Lynagh committed
9
--     http://hackage.haskell.org/trac/ghc/wiki/Commentary/CodingStyle#Warnings
10 11
-- for details

12
{-
13 14 15 16 17 18 19 20 21 22 23 24
FastString:	A compact, hash-consed, representation of character strings.
		Comparison is O(1), and you can get a Unique from them.
		Generated by the FSLIT macro
		Turn into SDoc with Outputable.ftext

LitString:	Just a wrapper for the Addr# of a C string (Ptr CChar).
		Practically no operations
		Outputing them is fast
		Generated by the SLIT macro
		Turn into SDoc with Outputable.ptext

Use LitString unless you want the facilities of FastString
25
-}
26 27
module FastString
       (
28
	-- * FastStrings
29 30
	FastString(..),     -- not abstract, for now.

31 32 33
	-- ** Construction
        mkFastString,
	mkFastStringBytes,
34
        mkFastStringByteList,
35
	mkFastStringForeignPtr,
36
#if defined(__GLASGOW_HASKELL__)
37
	mkFastString#,
38
#endif
39 40
	mkZFastString,
	mkZFastStringBytes,
41

42
	-- ** Deconstruction
43
	unpackFS,	    -- :: FastString -> String
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
	bytesFS,	    -- :: FastString -> [Word8]

	-- ** Encoding
	isZEncoded,
	zEncodeFS,

	-- ** Operations
        uniqueOfFS,
	lengthFS,
	nullFS,
	appendFS,
        headFS,
        tailFS,
	concatFS,
        consFS,
	nilFS,

	-- ** Outputing
        hPutFS,

64 65 66 67
	-- ** Internal
	getFastStringTable,
	hasZEncoding,

68
	-- * LitStrings
69
	LitString, 
70
#if defined(__GLASGOW_HASKELL__)
71
	mkLitString#,
72 73 74 75 76 77 78
#else
	mkLitString,
#endif
	unpackLitString,
	strLength,

	ptrStrLength
79 80
       ) where

81 82 83 84
-- This #define suppresses the "import FastString" that
-- HsVersions otherwise produces
#define COMPILING_FAST_STRING
#include "HsVersions.h"
85

86
import Encoding
87 88
import FastTypes
import FastFunctions
89

90 91
import Foreign
import Foreign.C
92 93 94 95
import GHC.Exts
import System.IO.Unsafe ( unsafePerformIO )
import Control.Monad.ST	( stToIO )
import Data.IORef	( IORef, newIORef, readIORef, writeIORef )
96
import System.IO	( hPutBuf )
97
import Data.Maybe	( isJust )
98
import Data.Char	( ord )
99

100
import GHC.ST
101
import GHC.IOBase	( IO(..) )
102
import GHC.Ptr		( Ptr(..) )
103

104 105
#define hASH_TBL_SIZE          4091
#define hASH_TBL_SIZE_UNBOXED  4091#
106 107


108 109 110 111 112
{-|
A 'FastString' is an array of bytes, hashed to support fast O(1)
comparison.  It is also associated with a character encoding, so that
we know how to convert a 'FastString' to the local encoding, or to the
Z-encoding used by the compiler internally.
113

114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
'FastString's support a memoized conversion to the Z-encoding via zEncodeFS.
-}

data FastString = FastString {
      uniq    :: {-# UNPACK #-} !Int,       -- unique id
      n_bytes :: {-# UNPACK #-} !Int,       -- number of bytes
      n_chars :: {-# UNPACK #-} !Int,	  -- number of chars
      buf     :: {-# UNPACK #-} !(ForeignPtr Word8),
      enc     :: FSEncoding
  }

data FSEncoding
  = ZEncoded
 	-- including strings that don't need any encoding
  | UTF8Encoded {-# UNPACK #-} !(IORef (Maybe FastString))
	-- A UTF-8 string with a memoized Z-encoding
130

131 132
instance Eq FastString where
  f1 == f2  =  uniq f1 == uniq f2
133

134
instance Ord FastString where
135
	-- Compares lexicographically, not by unique
136 137 138 139 140 141 142 143 144
    a <= b = case cmpFS a b of { LT -> True;  EQ -> True;  GT -> False }
    a <	 b = case cmpFS a b of { LT -> True;  EQ -> False; GT -> False }
    a >= b = case cmpFS a b of { LT -> False; EQ -> True;  GT -> True  }
    a >	 b = case cmpFS a b of { LT -> False; EQ -> False; GT -> True  }
    max x y | x >= y	=  x
            | otherwise	=  y
    min x y | x <= y	=  x
            | otherwise	=  y
    compare a b = cmpFS a b
145

146 147 148
instance Show FastString where
   show fs = show (unpackFS fs)

149 150 151
cmpFS :: FastString -> FastString -> Ordering
cmpFS (FastString u1 l1 _ buf1 _) (FastString u2 l2 _ buf2 _) =
  if u1 == u2 then EQ else
Isaac Dupree's avatar
Isaac Dupree committed
152 153 154 155 156 157 158 159 160 161 162
  case unsafeMemcmp buf1 buf2 (min l1 l2) `compare` 0 of
     LT -> LT
     EQ -> compare l1 l2
     GT -> GT

unsafeMemcmp :: ForeignPtr a -> ForeignPtr b -> Int -> Int
unsafeMemcmp buf1 buf2 l =
      inlinePerformIO $
        withForeignPtr buf1 $ \p1 ->
        withForeignPtr buf2 $ \p2 ->
          memcmp p1 p2 l
163

164 165 166 167
#ifndef __HADDOCK__
foreign import ccall unsafe "ghc_memcmp" 
  memcmp :: Ptr a -> Ptr b -> Int -> IO Int
#endif
168

169 170
-- -----------------------------------------------------------------------------
-- Construction
171

172
{-
173 174 175 176
Internally, the compiler will maintain a fast string symbol
table, providing sharing and fast comparison. Creation of
new @FastString@s then covertly does a lookup, re-using the
@FastString@ if there was a hit.
177
-}
178 179 180

data FastStringTable = 
 FastStringTable
181
    {-# UNPACK #-} !Int
182
    (MutableArray# RealWorld [FastString])
183

184
{-# NOINLINE string_table #-}
185
string_table :: IORef FastStringTable
186
string_table = 
187
 unsafePerformIO $ do
188 189 190 191
   tab <- IO $ \s1# -> case newArray# hASH_TBL_SIZE_UNBOXED [] s1# of
                           (# s2#, arr# #) ->
                               (# s2#, FastStringTable 0 arr# #)
   newIORef tab
192 193 194 195 196 197 198 199 200

lookupTbl :: FastStringTable -> Int -> IO [FastString]
lookupTbl (FastStringTable _ arr#) (I# i#) =
  IO $ \ s# -> readArray# arr# i# s#

updTbl :: IORef FastStringTable -> FastStringTable -> Int -> [FastString] -> IO ()
updTbl fs_table_var (FastStringTable uid arr#) (I# i#) ls = do
  (IO $ \ s# -> case writeArray# arr# i# ls s# of { s2# -> (# s2#, () #) })
  writeIORef fs_table_var (FastStringTable (uid+1) arr#)
201

202
mkFastString# :: Addr# -> FastString
203
mkFastString# a# = mkFastStringBytes ptr (ptrStrLength ptr)
204
  where ptr = Ptr a#
205

206 207 208
mkFastStringBytes :: Ptr Word8 -> Int -> FastString
mkFastStringBytes ptr len = unsafePerformIO $ do
  ft@(FastStringTable uid tbl#) <- readIORef string_table
209
  let
210 211 212 213 214 215 216 217
   h = hashStr ptr len
   add_it ls = do
	fs <- copyNewFastString uid ptr len
	updTbl string_table ft h (fs:ls)
	{- _trace ("new: " ++ show f_str)   $ -}
	return fs
  --
  lookup_result <- lookupTbl ft h
sof's avatar
sof committed
218
  case lookup_result of
219 220 221 222 223 224 225 226 227 228
    [] -> add_it []
    ls -> do
       b <- bucket_match ls len ptr
       case b of
	 Nothing -> add_it ls
	 Just v  -> {- _trace ("re-use: "++show v) $ -} return v

mkZFastStringBytes :: Ptr Word8 -> Int -> FastString
mkZFastStringBytes ptr len = unsafePerformIO $ do
  ft@(FastStringTable uid tbl#) <- readIORef string_table
sof's avatar
sof committed
229
  let
230 231 232 233 234 235 236 237
   h = hashStr ptr len
   add_it ls = do
	fs <- copyNewZFastString uid ptr len
	updTbl string_table ft h (fs:ls)
	{- _trace ("new: " ++ show f_str)   $ -}
	return fs
  --
  lookup_result <- lookupTbl ft h
sof's avatar
sof committed
238
  case lookup_result of
239 240 241 242 243 244 245 246 247 248 249 250 251 252
    [] -> add_it []
    ls -> do
       b <- bucket_match ls len ptr
       case b of
	 Nothing -> add_it ls
	 Just v  -> {- _trace ("re-use: "++show v) $ -} return v

-- | Create a 'FastString' from an existing 'ForeignPtr'; the difference
-- between this and 'mkFastStringBytes' is that we don't have to copy
-- the bytes if the string is new to the table.
mkFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString
mkFastStringForeignPtr ptr fp len = do
  ft@(FastStringTable uid tbl#) <- readIORef string_table
--  _trace ("hashed: "++show (I# h)) $
253
  let
254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282
    h = hashStr ptr len
    add_it ls = do
	fs <- mkNewFastString uid ptr fp len
	updTbl string_table ft h (fs:ls)
	{- _trace ("new: " ++ show f_str)   $ -}
	return fs
  --
  lookup_result <- lookupTbl ft h
  case lookup_result of
    [] -> add_it []
    ls -> do
       b <- bucket_match ls len ptr
       case b of
	 Nothing -> add_it ls
	 Just v  -> {- _trace ("re-use: "++show v) $ -} return v

mkZFastStringForeignPtr :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO FastString
mkZFastStringForeignPtr ptr fp len = do
  ft@(FastStringTable uid tbl#) <- readIORef string_table
--  _trace ("hashed: "++show (I# h)) $
  let
    h = hashStr ptr len
    add_it ls = do
	fs <- mkNewZFastString uid ptr fp len
	updTbl string_table ft h (fs:ls)
	{- _trace ("new: " ++ show f_str)   $ -}
	return fs
  --
  lookup_result <- lookupTbl ft h
283
  case lookup_result of
284 285 286 287 288 289
    [] -> add_it []
    ls -> do
       b <- bucket_match ls len ptr
       case b of
	 Nothing -> add_it ls
	 Just v  -> {- _trace ("re-use: "++show v) $ -} return v
290 291


292 293 294 295 296 297 298 299 300 301
-- | Creates a UTF-8 encoded 'FastString' from a 'String'
mkFastString :: String -> FastString
mkFastString str = 
  inlinePerformIO $ do
    let l = utf8EncodedLength str
    buf <- mallocForeignPtrBytes l
    withForeignPtr buf $ \ptr -> do
      utf8EncodeString ptr str
      mkFastStringForeignPtr ptr buf l 

302 303 304 305 306 307 308 309 310
-- | Creates a 'FastString' from a UTF-8 encoded @[Word8]@
mkFastStringByteList :: [Word8] -> FastString
mkFastStringByteList str = 
  inlinePerformIO $ do
    let l = Prelude.length str
    buf <- mallocForeignPtrBytes l
    withForeignPtr buf $ \ptr -> do
      pokeArray (castPtr ptr) str
      mkFastStringForeignPtr ptr buf l 
311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363

-- | Creates a Z-encoded 'FastString' from a 'String'
mkZFastString :: String -> FastString
mkZFastString str = 
  inlinePerformIO $ do
    let l = Prelude.length str
    buf <- mallocForeignPtrBytes l
    withForeignPtr buf $ \ptr -> do
      pokeCAString (castPtr ptr) str
      mkZFastStringForeignPtr ptr buf l 

bucket_match [] _ _ = return Nothing
bucket_match (v@(FastString _ l _ buf _):ls) len ptr
      | len == l  =  do
	 b <- cmpStringPrefix ptr buf len
	 if b then return (Just v)
	      else bucket_match ls len ptr
      | otherwise = 
	 bucket_match ls len ptr

mkNewFastString uid ptr fp len = do
  ref <- newIORef Nothing
  n_chars <- countUTF8Chars ptr len
  return (FastString uid len n_chars fp (UTF8Encoded ref))

mkNewZFastString uid ptr fp len = do
  return (FastString uid len len fp ZEncoded)


copyNewFastString uid ptr len = do
  fp <- copyBytesToForeignPtr ptr len
  ref <- newIORef Nothing
  n_chars <- countUTF8Chars ptr len
  return (FastString uid len n_chars fp (UTF8Encoded ref))

copyNewZFastString uid ptr len = do
  fp <- copyBytesToForeignPtr ptr len
  return (FastString uid len len fp ZEncoded)


copyBytesToForeignPtr ptr len = do
  fp <- mallocForeignPtrBytes len
  withForeignPtr fp $ \ptr' -> copyBytes ptr' ptr len
  return fp

cmpStringPrefix :: Ptr Word8 -> ForeignPtr Word8 -> Int -> IO Bool
cmpStringPrefix ptr fp len =
  withForeignPtr fp $ \ptr' -> do
    r <- memcmp ptr ptr' len
    return (r == 0)


hashStr  :: Ptr Word8 -> Int -> Int
364
 -- use the Addr to produce a hash value between 0 & m (inclusive)
365
hashStr (Ptr a#) (I# len#) = loop 0# 0#
366
   where 
367 368
    loop h n | n GHC.Exts.==# len# = I# h
	     | otherwise  = loop h2 (n GHC.Exts.+# 1#)
369
	  where c = ord# (indexCharOffAddr# a# n)
370
		h2 = (c GHC.Exts.+# (h GHC.Exts.*# 128#)) `remInt#` hASH_TBL_SIZE#
371

372 373
-- -----------------------------------------------------------------------------
-- Operations
374

375 376 377
-- | Returns the length of the 'FastString' in characters
lengthFS :: FastString -> Int
lengthFS f = n_chars f
378

379 380 381 382
-- | Returns 'True' if the 'FastString' is Z-encoded
isZEncoded :: FastString -> Bool
isZEncoded fs | ZEncoded <- enc fs = True
		| otherwise          = False
383

384 385 386 387 388 389 390 391 392 393 394
-- | Returns 'True' if this 'FastString' is not Z-encoded but already has
-- a Z-encoding cached (used in producing stats).
hasZEncoding :: FastString -> Bool
hasZEncoding fs@(FastString uid n_bytes _ fp enc) =
  case enc of
    ZEncoded -> False
    UTF8Encoded ref ->
      inlinePerformIO $ do
        m <- readIORef ref
	return (isJust m)

395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460
-- | Returns 'True' if the 'FastString' is empty
nullFS :: FastString -> Bool
nullFS f  =  n_bytes f == 0

-- | unpacks and decodes the FastString
unpackFS :: FastString -> String
unpackFS (FastString _ n_bytes _ buf enc) = 
  inlinePerformIO $ withForeignPtr buf $ \ptr ->
    case enc of
	ZEncoded      -> peekCAStringLen (castPtr ptr,n_bytes)
	UTF8Encoded _ -> utf8DecodeString ptr n_bytes

bytesFS :: FastString -> [Word8]
bytesFS (FastString _ n_bytes _ buf enc) = 
  inlinePerformIO $ withForeignPtr buf $ \ptr ->
    peekArray n_bytes ptr

-- | returns a Z-encoded version of a 'FastString'.  This might be the
-- original, if it was already Z-encoded.  The first time this
-- function is applied to a particular 'FastString', the results are
-- memoized.
--
zEncodeFS :: FastString -> FastString
zEncodeFS fs@(FastString uid n_bytes _ fp enc) =
  case enc of
    ZEncoded -> fs
    UTF8Encoded ref ->
      inlinePerformIO $ do
        m <- readIORef ref
        case m of
	  Just fs -> return fs
	  Nothing -> do
            let efs = mkZFastString (zEncodeString (unpackFS fs))
	    writeIORef ref (Just efs)
	    return efs

appendFS :: FastString -> FastString -> FastString
appendFS fs1 fs2 = mkFastString (unpackFS fs1 ++ unpackFS fs2)

concatFS :: [FastString] -> FastString
concatFS ls = mkFastString (Prelude.concat (map unpackFS ls)) -- ToDo: do better

headFS :: FastString -> Char
headFS (FastString _ n_bytes _ buf enc) = 
  inlinePerformIO $ withForeignPtr buf $ \ptr -> do
    case enc of
      ZEncoded -> do 
	 w <- peek (castPtr ptr)
	 return (castCCharToChar w)
      UTF8Encoded _ -> 
	 return (fst (utf8DecodeChar ptr))

tailFS :: FastString -> FastString
tailFS (FastString _ n_bytes _ buf enc) = 
  inlinePerformIO $ withForeignPtr buf $ \ptr -> do
    case enc of
      ZEncoded -> do
	return $! mkZFastStringBytes (ptr `plusPtr` 1) (n_bytes - 1)
      UTF8Encoded _ -> do
	 let (_,ptr') = utf8DecodeChar ptr
	 let off = ptr' `minusPtr` ptr
	 return $! mkFastStringBytes (ptr `plusPtr` off) (n_bytes - off)

consFS :: Char -> FastString -> FastString
consFS c fs = mkFastString (c : unpackFS fs)

461 462
uniqueOfFS :: FastString -> FastInt
uniqueOfFS (FastString u _ _ _ _) = iUnbox u
463 464

nilFS = mkFastString ""
465

466 467 468 469 470 471
-- -----------------------------------------------------------------------------
-- Stats

getFastStringTable :: IO [[FastString]]
getFastStringTable = do
  tbl <- readIORef string_table
472
  buckets <- mapM (lookupTbl tbl) [0 .. hASH_TBL_SIZE]
473 474
  return buckets

475 476 477
-- -----------------------------------------------------------------------------
-- Outputting 'FastString's

478 479 480 481 482
-- |Outputs a 'FastString' with /no decoding at all/, that is, you
-- get the actual bytes in the 'FastString' written to the 'Handle'.
hPutFS handle (FastString _ len _ fp _)
  | len == 0  = return ()
  | otherwise = do withForeignPtr fp $ \ptr -> hPutBuf handle ptr len
483

484 485
-- ToDo: we'll probably want an hPutFSLocal, or something, to output
-- in the current locale's encoding (for error messages and suchlike).
486

487 488
-- -----------------------------------------------------------------------------
-- LitStrings, here for convenience only.
489

490 491 492 493 494 495 496
-- hmm, not unboxed (or rather FastPtr), interesting
--a.k.a. Ptr CChar, Ptr Word8, Ptr (), hmph.  We don't
--really care about C types in naming, where we can help it.
type LitString = Ptr Word8
--Why do we recalculate length every time it's requested?
--If it's commonly needed, we should perhaps have
--data LitString = LitString {-#UNPACK#-}!(FastPtr Word8) {-#UNPACK#-}!FastInt
497

498
#if defined(__GLASGOW_HASKELL__)
499
mkLitString# :: Addr# -> LitString
500
mkLitString# a# = Ptr a#
501
#endif
502

503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550
--can/should we use FastTypes here?
--Is this likely to be memory-preserving if only used on constant strings?
--should we inline it? If lucky, that would make a CAF that wouldn't
--be computationally repeated... although admittedly we're not
--really intending to use mkLitString when __GLASGOW_HASKELL__...
--(I wonder, is unicode / multi-byte characters allowed in LitStrings
-- at all?)
{-# INLINE mkLitString #-}
mkLitString :: String -> LitString
mkLitString s =
 unsafePerformIO (do
   p <- mallocBytes (length s + 1)
   let
     loop :: Int -> String -> IO ()
     loop n cs | n `seq` null cs = pokeByteOff p n (0 :: Word8)
     loop n (c:cs) = do
        pokeByteOff p n (fromIntegral (ord c) :: Word8)
        loop (1+n) cs
   loop 0 s
   return p
 )

unpackLitString :: LitString -> String
unpackLitString p_ = case pUnbox p_ of
 p -> unpack (_ILIT(0))
  where
    unpack n = case indexWord8OffFastPtrAsFastChar p n of
      ch -> if ch `eqFastChar` _CLIT('\0')
            then [] else cBox ch : unpack (n +# _ILIT(1))

strLength :: LitString -> Int
strLength = ptrStrLength

-- for now, use a simple String representation
--no, let's not do that right now - it's work in other places
#if 0
type LitString = String

mkLitString :: String -> LitString
mkLitString = id

unpackLitString :: LitString -> String
unpackLitString = id

strLength :: LitString -> Int
strLength = length

#endif
551 552 553 554

-- -----------------------------------------------------------------------------
-- under the carpet

555 556
foreign import ccall unsafe "ghc_strlen" 
  ptrStrLength :: Ptr Word8 -> Int
557

558
-- NB. does *not* add a '\0'-terminator.
559 560
-- We only use CChar here to be parallel to the imported
-- peekC(A)StringLen.
561 562 563
pokeCAString :: Ptr CChar -> String -> IO ()
pokeCAString ptr str =
  let
564
	go [] n     = return ()
565 566 567 568
    	go (c:cs) n = do pokeElemOff ptr n (castCharToCChar c); go cs (n+1)
  in
  go str 0

569
#if defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ <= 602
570 571
peekCAStringLen = peekCStringLen
#endif
572
\end{code}