Commit 1e33535e authored by simonmar's avatar simonmar
Browse files

[project @ 2005-11-07 15:09:48 by simonmar]

Add Jan-Willem Maessen's hash table test
parent 5027d948
-----------------------------------------------------------------------------
-- |
-- Module : Data.HashTable
-- Copyright : (c) The University of Glasgow 2003
-- License : BSD-style (see the file libraries/base/LICENSE)
--
-- Maintainer : libraries@haskell.org
-- Stability : provisional
-- Portability : portable
--
-- An implementation of extensible hash tables, as described in
-- Per-Ake Larson, /Dynamic Hash Tables/, CACM 31(4), April 1988,
-- pp. 446--457. The implementation is also derived from the one
-- in GHC's runtime system (@ghc\/rts\/Hash.{c,h}@).
--
-----------------------------------------------------------------------------
module Data.HashTab (
-- * Basic hash table operations
HashTable, new, insert, delete, lookup, update,
-- * Converting to and from lists
fromList, toList,
-- * Hash functions
-- $hash_functions
hashInt, hashString,
prime,
-- * Diagnostics
longestChain
) where
-- This module is imported by Data.Typeable, which is pretty low down in the
-- module hierarchy, so don't import "high-level" modules
-- Right now we import high-level modules with gay abandon.
import Prelude hiding ( lookup )
import Data.Tuple ( fst )
import Data.Bits
import Data.Maybe
import Data.List ( maximumBy, partition, concat, foldl )
import Data.Int ( Int32 )
import Data.Array.Base
import Data.Array hiding (bounds)
import Data.Array.IO
import Data.Char ( ord )
import Data.IORef ( IORef, newIORef, readIORef, writeIORef )
import Control.Monad ( mapM, sequence_ )
-----------------------------------------------------------------------
readHTArray :: HTArray a -> Int32 -> IO a
readMutArray :: MutArray a -> Int32 -> IO a
writeMutArray :: MutArray a -> Int32 -> a -> IO ()
freezeArray :: MutArray a -> IO (HTArray a)
thawArray :: HTArray a -> IO (MutArray a)
newMutArray :: (Int32, Int32) -> a -> IO (MutArray a)
#if defined(DEBUG) || defined(__NHC__)
type MutArray a = IOArray Int32 a
type HTArray a = MutArray a
newMutArray = newArray
readHTArray = readArray
readMutArray = readArray
writeMutArray = writeArray
freezeArray = return
thawArray = return
#else
type MutArray a = IOArray Int32 a
type HTArray a = Array Int32 a
newMutArray = newArray
readHTArray arr i = return $! (unsafeAt arr (fromIntegral i))
readMutArray arr i = unsafeRead arr (fromIntegral i)
writeMutArray arr i x = unsafeWrite arr (fromIntegral i) x
freezeArray = unsafeFreeze
thawArray = unsafeThaw
#endif
newtype HashTable key val = HashTable (IORef (HT key val))
-- TODO: the IORef should really be an MVar.
data HT key val
= HT {
kcount :: !Int32, -- Total number of keys.
buckets :: !(HTArray [(key,val)]),
bmask :: !Int32,
hash_fn :: key -> Int32,
cmp :: key -> key -> Bool
}
-- -----------------------------------------------------------------------------
-- Sample hash functions
-- $hash_functions
--
-- This implementation of hash tables uses the low-order /n/ bits of the hash
-- value for a key, where /n/ varies as the hash table grows. A good hash
-- function therefore will give an even distribution regardless of /n/.
--
-- If your keyspace is integrals such that the low-order bits between
-- keys are highly variable, then you could get away with using 'id'
-- as the hash function.
--
-- We provide some sample hash functions for 'Int' and 'String' below.
-- | A sample hash function for 'Int', implemented as simply @(x `mod` P)@
-- where P is a suitable prime (currently 1500007). Should give
-- reasonable results for most distributions of 'Int' values, except
-- when the keys are all multiples of the prime!
--
hashInt :: Int -> Int32
hashInt = (`rem` prime) . fromIntegral
-- | A sample hash function for 'String's. The implementation is:
--
-- > hashString = fromIntegral . foldr f 0
-- > where f c m = ord c + (m * 128) `rem` 1500007
--
-- which seems to give reasonable results.
--
hashString :: String -> Int32
hashString = fromIntegral . foldl f 0
where f m c = ord c + (m * 128) `rem` fromIntegral prime
-- | A prime larger than the maximum hash table size
prime :: Int32
prime = 1500007
-- -----------------------------------------------------------------------------
-- Parameters
tABLE_MAX = 1024 * 1024 :: Int32 -- Maximum size of hash table
#if tABLE_MIN
#else
tABLE_MIN = 16 :: Int32
hLOAD = 4 :: Int32 -- Maximum average load of a single hash bucket
hYSTERESIS = 0 :: Int32 -- entries to ignore in load computation
#endif
{- Hysteresis favors long association-list-like behavior for small tables. -}
-- -----------------------------------------------------------------------------
-- Creating a new hash table
-- | Creates a new hash table. The following property should hold for the @eq@
-- and @hash@ functions passed to 'new':
--
-- > eq A B => hash A == hash B
--
new
:: (key -> key -> Bool) -- ^ @eq@: An equality comparison on keys
-> (key -> Int32) -- ^ @hash@: A hash function on keys
-> IO (HashTable key val) -- ^ Returns: an empty hash table
new cmpr hash = do
-- make a new hash table with a single, empty, segment
let mask = tABLE_MIN-1
bkts' <- newMutArray (0,mask) []
bkts <- freezeArray bkts'
let
kcnt = 0
ht = HT { buckets=bkts, kcount=kcnt, bmask=mask,
hash_fn=hash, cmp=cmpr }
table <- newIORef ht
return (HashTable table)
-- -----------------------------------------------------------------------------
-- Inserting a key\/value pair into the hash table
-- | Inserts an key\/value mapping into the hash table.
--
-- Note that 'insert' doesn't remove the old entry from the table -
-- the behaviour is like an association list, where 'lookup' returns
-- the most-recently-inserted mapping for a key in the table. The
-- reason for this is to keep 'insert' as efficient as possible. If
-- you need to update a mapping, then we provide 'update'.
--
insert :: HashTable key val -> key -> val -> IO ()
insert (HashTable ref) key val = do
table@HT{ kcount=k, buckets=bkts, bmask=b } <- readIORef ref
let table1 = table{ kcount = k+1 }
indx = bucketIndex table key
bucket <- readHTArray bkts indx
bkts' <- thawArray bkts
writeMutArray bkts' indx ((key,val):bucket)
freezeArray bkts'
table2 <-
if tooBig k b
then expandHashTable table1
else return table1
writeIORef ref table2
tooBig :: Int32 -> Int32 -> Bool
tooBig k b = k-hYSTERESIS > hLOAD * b
bucketIndex :: HT key val -> key -> Int32
bucketIndex HT{ hash_fn=hash, bmask=mask } key =
let h = hash key
in (h .&. mask)
expandHashTable :: HT key val -> IO (HT key val)
expandHashTable
table@HT{ buckets=bkts, bmask=mask } = do
let
oldsize = mask + 1
newmask = mask + mask + 1
newsize = newmask + 1
--
if newsize > tABLE_MAX
then return table
else do
--
newbkts' <- newMutArray (0,newmask) []
let
table'=table{ bmask=newmask }
splitBucket oldindex = do
bucket <- readHTArray bkts oldindex
let (oldb,newb) = partition ((oldindex==).bucketIndex table' . fst) bucket
writeMutArray newbkts' oldindex oldb
writeMutArray newbkts' (oldindex + oldsize) newb
mapM_ splitBucket [0..mask]
newbkts <- freezeArray newbkts'
return ( table'{ buckets=newbkts } )
-- -----------------------------------------------------------------------------
-- Deleting a mapping from the hash table
-- Remove a key from a bucket
deleteBucket :: (key -> Bool) -> [(key,val)] -> (Int32, [(key, val)])
deleteBucket _ [] = (0,[])
deleteBucket del (pair@(k,_):bucket) =
case deleteBucket del bucket of
(dels, bucket') | del k -> dels' `seq` (dels', bucket')
| otherwise -> (dels, pair:bucket')
where dels' = dels + 1
-- | Remove an entry from the hash table.
delete :: HashTable key val -> key -> IO ()
delete (HashTable ref) key = do
table@HT{ buckets=bkts, kcount=kcnt, cmp=cmpr } <- readIORef ref
let indx = bucketIndex table key
bkts' <- thawArray bkts
bucket <- readMutArray bkts' indx
let (removed,bucket') = deleteBucket (cmpr key) bucket
writeMutArray bkts' indx bucket'
freezeArray bkts'
writeIORef ref ( table{kcount = kcnt - removed} )
-- -----------------------------------------------------------------------------
-- Updating a mapping in the hash table
-- | Updates an entry in the hash table, returning 'True' if there was
-- already an entry for this key, or 'False' otherwise. After 'update'
-- there will always be exactly one entry for the given key in the table.
--
-- 'insert' is more efficient than 'update' if you don't care about
-- multiple entries, or you know for sure that multiple entries can't
-- occur. However, 'update' is more efficient than 'delete' followed
-- by 'insert'.
update :: HashTable key val -> key -> val -> IO Bool
update (HashTable ref) key val = do
table@HT{ kcount=k, buckets=bkts, cmp=cmpr, bmask=b } <- readIORef ref
let indx = bucketIndex table key
bkts' <- thawArray bkts
bucket <- readMutArray bkts' indx
let (deleted,bucket') = deleteBucket (cmpr key) bucket
k' = k + 1 - deleted
table1 = table{ kcount=k' }
writeMutArray bkts' indx ((key,val):bucket')
freezeArray bkts'
table2 <-
if tooBig k' b -- off by one from insert's resize heuristic.
then expandHashTable table1
else return table1
writeIORef ref table2
return (deleted>0)
-- -----------------------------------------------------------------------------
-- Looking up an entry in the hash table
-- | Looks up the value of a key in the hash table.
lookup :: HashTable key val -> key -> IO (Maybe val)
lookup (HashTable ref) key = do
table@HT{ buckets=bkts, cmp=cmpr } <- readIORef ref
let indx = bucketIndex table key
bucket <- readHTArray bkts indx
case [ val | (key',val) <- bucket, cmpr key key' ] of
[] -> return Nothing
(v:_) -> return (Just v)
-- -----------------------------------------------------------------------------
-- Converting to/from lists
-- | Convert a list of key\/value pairs into a hash table. Equality on keys
-- is taken from the Eq instance for the key type.
--
fromList :: (Eq key) => (key -> Int32) -> [(key,val)] -> IO (HashTable key val)
fromList hash list = do
table <- new (==) hash
sequence_ [ insert table k v | (k,v) <- list ]
return table
-- | Converts a hash table to a list of key\/value pairs.
--
toList :: (Ord key, Ord val) => HashTable key val -> IO [(key,val)]
toList (HashTable ref) = do
HT{ buckets=bkts, bmask=b } <- readIORef ref
fmap concat (mapM (readHTArray bkts) [0..b])
-- -----------------------------------------------------------------------------
-- Diagnostics
-- | This function is useful for determining whether your hash function
-- is working well for your data set. It returns the longest chain
-- of key\/value pairs in the hash table for which all the keys hash to
-- the same bucket. If this chain is particularly long (say, longer
-- than 10 elements), then it might be a good idea to try a different
-- hash function.
--
longestChain :: HashTable key val -> IO [(key,val)]
longestChain (HashTable ref) = do
HT{ buckets=bkts, bmask=b } <- readIORef ref
let lengthCmp (_:x)(_:y) = lengthCmp x y
lengthCmp [] [] = EQ
lengthCmp [] _ = LT
lengthCmp _ [] = GT
fmap (maximumBy lengthCmp) (mapM (readHTArray bkts) [0..b])
{- Test code for Data.HashTable -}
module Main(main) where
import Prelude hiding (lookup)
import qualified Prelude (lookup)
import Data.Maybe(isJust,isNothing)
import Data.Int(Int32)
import Test.QuickCheck
import System.IO.Unsafe(unsafePerformIO)
import Data.HashTab
import Control.Monad(liftM2, foldM)
import System.Random
import System.Environment
infixr 0 ==.
infixr 0 ==~
infixr 0 ~~
type HT = HashTable Int Int
newtype HashFun = HF {unHF :: (Int -> Int32)}
data Empty = E {e :: (IO HT), hfe :: HashFun}
data MkH = H {h :: (IO HT), hfh :: HashFun}
newtype List a = L [a]
data Action = Lookup Int
| Insert Int Int
| Delete Int
| Update Int Int
deriving (Show)
instance Arbitrary Action where
arbitrary = frequency [(10,fmap Lookup arbitrary),
(5, liftM2 Insert arbitrary arbitrary),
(3, liftM2 Update arbitrary arbitrary),
(1, fmap Delete arbitrary)]
coarbitrary = error "coarbitrary Action"
simA :: [Action] -> [Either Bool [Int]]
simA = fst . foldl sim ([],[])
where sim :: ([Either Bool [Int]], [Action]) -> Action ->
([Either Bool [Int]], [Action])
sim (res, past) (Lookup k) = (Right (lkup k past) : res, past)
sim (res, past) (Insert k v) = (res, Insert k v : past)
sim (res, past) (Delete k) = (res, Delete k : past)
sim (res, past) (Update k v) =
(Left (not (null l)) : res, Update k v : past)
where l = lkup k past
lkup _ [] = []
lkup k (Delete k' : _)
| k==k' = []
lkup k (Update k' v : _)
| k==k' = [v]
lkup k (Insert k' v : past)
| k==k' = v:lkup k past
lkup k (_ : past) = lkup k past
runA :: HashFun -> [Action] -> IO [Either Bool (Maybe Int)]
runA hf acts = do
ht <- new (==) (unHF hf)
let run res (Lookup a) = fmap (lkup res) $ lookup ht a
run res (Insert k v) = insert ht k v >> return res
run res (Delete k) = delete ht k >> return res
run res (Update k v) = fmap (upd res) $ update ht k v
lkup res m = Right m : res
upd res b = Left b : res
foldM run [] acts
(~~) :: IO [Either Bool (Maybe Int)] -> [Either Bool [Int]] -> Bool
acts ~~ sims = and $ zipWith same (unsafePerformIO acts) sims
where same (Left b) (Left b') = b==b'
same (Right Nothing) (Right []) = True
same (Right (Just a)) (Right xs) = a `elem` xs
same _ _ = False
lookups :: HT -> [Int] -> IO [Maybe Int]
lookups ht ks = mapM (lookup ht) ks
instance Show HashFun where
showsPrec _ (HF hf) r
| hf 1 == 0 = "degenerate"++r
| otherwise = "usual"++r
instance Show Empty where
showsPrec _ ee r = shows (hfe ee) r
instance Show MkH where
showsPrec _ hh r = shows (hfh hh) $
("; "++shows (unsafePerformIO (h hh >>= toList)) r)
instance Show a => Show (List a) where
showsPrec _ (L l) r = shows l r
instance Arbitrary HashFun where
arbitrary = frequency [(20,return (HF hashInt)),
(1,return (HF (const 0)))]
coarbitrary = error "coarbitrary HashFun"
instance Arbitrary Empty where
arbitrary = fmap mkE arbitrary
where mkE (HF hf) = E {e = new (==) hf, hfe=HF hf}
coarbitrary = error "coarbitrary Empty"
instance Arbitrary a => Arbitrary (List a) where
arbitrary = do
sz <- frequency [(50, sized return),
(1,return (4096*2)),
(0, return (1024*1024))]
resize sz $ fmap L $ sized vector
coarbitrary = error "coarbitrary (List a)"
instance Arbitrary MkH where
arbitrary = do
hf <- arbitrary
L list <- arbitrary
let mkH act = H { h = act, hfh = hf }
return (mkH . fromList (unHF hf) $ list)
coarbitrary = error "coarbitrary MkH"
(==~) :: (Eq a) => IO a -> IO a -> Bool
act1 ==~ act2 = unsafePerformIO act1 == unsafePerformIO act2
(==.) :: (Eq a) => IO a -> a -> Bool
act ==. v = unsafePerformIO act == v
notin :: (Testable a) => Int -> MkH -> a -> Property
k `notin` hh = \prop ->
let f = (not . isJust . unsafePerformIO) (h hh >>= flip lookup k) in
f `trivial` prop
prop_emptyLookup :: Empty -> Int -> Bool
prop_emptyLookup ee k =
isNothing . unsafePerformIO $
(do mt <- e ee
lookup mt k)
prop_emptyToList :: Empty -> Bool
prop_emptyToList ee =
(do mt <- e ee
toList mt) ==. []
prop_emptyFromList :: HashFun -> Int -> Bool
prop_emptyFromList hf k =
(do mt <- new (==) (unHF hf) :: IO HT
lookup mt k) ==~
(do mt <- fromList (unHF hf) []
lookup mt k)
prop_insert :: MkH -> Int -> Int -> Bool
prop_insert hh k v =
(do ht <- h hh
insert ht k v
lookup ht k) ==. Just v
prop_insertu :: MkH -> Int -> Int -> List Int -> Bool
prop_insertu hh k v (L ks) =
let ks' = filter (k /=) ks in
(do ht <- h hh
insert ht k v
lookups ht ks') ==~
(do ht <- h hh
lookups ht ks')
prop_delete :: MkH -> Int -> Property
prop_delete hh k =
k `notin` hh $
isNothing . unsafePerformIO $
(do ht <- h hh
delete ht k
lookup ht k)
prop_deleteu :: MkH -> Int -> List Int -> Bool
prop_deleteu hh k (L ks) =
let ks' = filter (k /=) ks in
(do ht <- h hh
delete ht k
lookups ht ks') ==~
(do ht <- h hh
lookups ht ks')
naiveUpdate :: HT -> Int -> Int -> IO ()
naiveUpdate ht k v = do
delete ht k
insert ht k v
prop_update :: MkH -> Int -> Int -> List Int -> Bool
prop_update hh k v (L ks) =
(do ht <- h hh
_ <- update ht k v
lookups ht ks) ==~
(do ht <- h hh
naiveUpdate ht k v
lookups ht ks)
prop_updatec :: MkH -> Int -> Int -> Bool
prop_updatec hh k v =
(do ht <- h hh
_ <- update ht k v
lookup ht k) ==. Just v
prop_updateLookup :: MkH -> Int -> Int -> Property
prop_updateLookup hh k v =
k `notin` hh $
(do ht <- h hh
update ht k v) ==~
(do ht <- h hh
fmap isJust (lookup ht k))
prop_simulation :: HashFun -> List Action -> Property
prop_simulation hf (L acts) =
(null acts `trivial`) $
runA hf acts ~~ simA acts
{-
For "fromList" and "toList" properties we're a bit sloppy: we perform
multiple insertions for a key (potentially) but give nor promises
about which one we will retrieve with lookup, or what order they'll be
returned by toList (or if they'll all be returned at all). Thus we
insert all occurrences of a key with the same value, and do all
checking via lookups.
-}
prop_fromList :: HashFun -> List Int -> List Int -> Property
prop_fromList hf (L l) (L ks) =
null l `trivial`
let assocs = map (\t -> (t,t)) l in
( do ht <- fromList (unHF hf) assocs
lookups ht ks) ==. (map (`Prelude.lookup` assocs) ks)
prop_fromListInsert :: HashFun -> List (Int,Int) -> Int -> Int -> List Int -> Property
prop_fromListInsert hf (L l) k v (L ks) =
null l `trivial`
(( do ht <- fromList (unHF hf) l
insert ht k v