{-# OPTIONS_GHC -XNoImplicitPrelude -funbox-strict-fields -fno-warn-name-shadowing #-}

-----------------------------------------------------------------------------
-- |
-- Module      :  Data.HashTable
-- Copyright   :  (c) The University of Glasgow 2003
-- License     :  BSD-style (see the file libraries/base/LICENSE)
--
-- Maintainer  :  libraries@haskell.org
-- Stability   :  provisional
-- Portability :  portable
--
-- An implementation of extensible hash tables, as described in
-- Per-Ake Larson, /Dynamic Hash Tables/, CACM 31(4), April 1988,
-- pp. 446--457.  The implementation is also derived from the one
-- in GHC's runtime system (@ghc\/rts\/Hash.{c,h}@).
--
-----------------------------------------------------------------------------

module Data.HashTable (
        -- * Basic hash table operations
        HashTable, new, newHint, insert, delete, lookup, update,
        -- * Converting to and from lists
        fromList, toList,
        -- * Hash functions
        -- $hash_functions
        hashInt, hashString,
        prime,
        -- * Diagnostics
        longestChain
    ) where

-- This module is imported by Data.Dynamic, which is pretty low down in the
-- module hierarchy, so don't import "high-level" modules

#ifdef __GLASGOW_HASKELL__
import GHC.Base
#else
import Prelude hiding ( lookup )
#endif
import Data.Tuple ( fst )
import Data.Bits
import Data.Maybe
import Data.List ( maximumBy, length, concat, foldl', partition )
import Data.Int ( Int32 )

#if defined(__GLASGOW_HASKELL__)
import GHC.Num
import GHC.Real ( fromIntegral )
import GHC.Show ( Show(..) )
import GHC.Int ( Int64 )

import GHC.IO
import GHC.IOArray
import GHC.IORef
#else
import Data.Char ( ord )
import Data.IORef ( IORef, newIORef, readIORef, writeIORef )
import System.IO.Unsafe ( unsafePerformIO )
import Data.Int ( Int64 )
# if defined(__HUGS__)
import Hugs.IOArray ( IOArray, newIOArray,
                      unsafeReadIOArray, unsafeWriteIOArray )
# elif defined(__NHC__)
import NHC.IOExtras ( IOArray, newIOArray, readIOArray, writeIOArray )
# endif
#endif
import Control.Monad ( mapM, mapM_, sequence_ )

-----------------------------------------------------------------------

-- | When 'True', table operations update the global 'hashData'
-- statistics record (see 'instrument').  Compiled out when 'False'.
iNSTRUMENTED :: Bool
iNSTRUMENTED = False

-----------------------------------------------------------------------

readHTArray   :: HTArray a -> Int32 -> IO a
writeMutArray :: MutArray a -> Int32 -> a -> IO ()
newMutArray   :: (Int32, Int32) -> a -> IO (MutArray a)
newMutArray = newIOArray

type MutArray a = IOArray Int32 a
type HTArray a = MutArray a

#if defined(DEBUG) || defined(__NHC__)
-- Bounds-checked array access for debugging builds (and NHC, which
-- has no unsafe primitives).
readHTArray = readIOArray
writeMutArray = writeIOArray
#else
-- Unchecked access: all indices are produced by 'bucketIndex', which
-- masks them into range, so the bounds check is redundant.
readHTArray arr i = unsafeReadIOArray arr (fromIntegral i)
writeMutArray arr i x = unsafeWriteIOArray arr (fromIntegral i) x
#endif

-- | A mutable hash table: equality and hash functions fixed at
-- creation time, plus a mutable reference to the current table state.
data HashTable key val = HashTable {
                             cmp     :: !(key -> key -> Bool),
                             hash_fn :: !(key -> Int32),
                             tab     :: !(IORef (HT key val))
                           }
-- TODO: the IORef should really be an MVar.

-- | The table state proper: bucket array plus its index mask
-- (the array size is always a power of two, so @bmask = size - 1@).
data HT key val
  = HT {
        kcount  :: !Int32,              -- Total number of keys.
        bmask   :: !Int32,
        buckets :: !(HTArray [(key,val)])
       }

-- ------------------------------------------------------------
-- Instrumentation for performance tuning

-- This ought to be roundly ignored after optimization when
-- iNSTRUMENTED=False.

-- STRICT version of modifyIORef!
modifyIORef :: IORef a -> (a -> a) -> IO ()
modifyIORef r f = do
  v <- readIORef r
  let z = f v in z `seq` writeIORef r z

data HashData = HD {
                  tables :: !Integer,
                  insertions :: !Integer,
                  lookups :: !Integer,
                  totBuckets :: !Integer,
                  maxEntries :: !Int32,
                  maxChain :: !Int,
                  maxBuckets :: !Int32
                } deriving (Eq, Show)

{-# NOINLINE hashData #-}
hashData :: IORef HashData
hashData = unsafePerformIO (newIORef (HD { tables=0, insertions=0, lookups=0,
                                           totBuckets=0, maxEntries=0,
                                           maxChain=0, maxBuckets=tABLE_MIN }))

-- Apply an update to the statistics record, but only when
-- instrumentation is enabled; otherwise a no-op.
instrument :: (HashData -> HashData) -> IO ()
instrument i | iNSTRUMENTED = modifyIORef hashData i
             | otherwise    = return ()

recordNew :: IO ()
recordNew = instrument rec
  where rec hd@HD{ tables=t, totBuckets=b } =
               hd{ tables=t+1, totBuckets=b+fromIntegral tABLE_MIN }

recordIns :: Int32 -> Int32 -> [a] -> IO ()
recordIns i sz bkt = instrument rec
  where rec hd@HD{ insertions=ins, maxEntries=mx, maxChain=mc } =
               hd{ insertions=ins+fromIntegral i, maxEntries=mx `max` sz,
                   maxChain=mc `max` length bkt }

recordResize :: Int32 -> Int32 -> IO ()
recordResize older newer = instrument rec
  where rec hd@HD{ totBuckets=b, maxBuckets=mx } =
               hd{ totBuckets=b+fromIntegral (newer-older),
                   maxBuckets=mx `max` newer }

recordLookup :: IO ()
recordLookup = instrument lkup
  where lkup hd@HD{ lookups=l } = hd{ lookups=l+1 }

-- stats :: IO String
-- stats = fmap show $ readIORef hashData

-- -----------------------------------------------------------------------------
-- Sample hash functions

-- $hash_functions
--
-- This implementation of hash tables uses the low-order /n/ bits of the hash
-- value for a key, where /n/ varies as the hash table grows.  A good hash
-- function therefore will give an even distribution regardless of /n/.
--
-- If your keyspace is integrals such that the low-order bits between
-- keys are highly variable, then you could get away with using 'fromIntegral'
-- as the hash function.
--
-- We provide some sample hash functions for 'Int' and 'String' below.

golden :: Int32
golden = 1013904242 -- = round ((sqrt 5 - 1) * 2^32) :: Int32
-- was -1640531527 = round ((sqrt 5 - 1) * 2^31) :: Int32
-- but that has bad mulHi properties (even adding 2^32 to get its inverse)
-- Whereas the above works well and contains no hash duplications for
-- [-32767..65536]

hashInt32 :: Int32 -> Int32
hashInt32 x = mulHi x golden + x

-- | A sample (and useful) hash function for Int and Int32,
-- implemented by extracting the uppermost 32 bits of the 64-bit
-- result of multiplying by a 33-bit constant.  The constant is from
-- Knuth, derived from the golden ratio:
--
-- > golden = round ((sqrt 5 - 1) * 2^32)
--
-- We get good key uniqueness on small inputs
-- (a problem with previous versions):
--  (length $ group $ sort $ map hashInt [-32767..65536]) == 65536 + 32768
--
hashInt :: Int -> Int32
hashInt x = hashInt32 (fromIntegral x)

-- hi 32 bits of a x-bit * 32 bit -> 64-bit multiply
mulHi :: Int32 -> Int32 -> Int32
mulHi a b = fromIntegral (r `shiftR` 32)
   where r :: Int64
         r = fromIntegral a * fromIntegral b

-- | A sample hash function for Strings.  We keep multiplying by the
-- golden ratio and adding.  The implementation is:
--
-- > hashString = foldl' f golden
-- >   where f m c = fromIntegral (ord c) * magic + hashInt32 m
-- >         magic = 0xdeadbeef
--
-- Where hashInt32 works just as hashInt shown above.
--
-- Knuth argues that repeated multiplication by the golden ratio
-- will minimize gaps in the hash space, and thus it's a good choice
-- for combining together multiple keys to form one.
--
-- Here we know that individual characters c are often small, and this
-- produces frequent collisions if we use ord c alone.  A
-- particular problem are the shorter low ASCII and ISO-8859-1
-- character strings.  We pre-multiply by a magic twiddle factor to
-- obtain a good distribution.  In fact, given the following test:
--
-- > testp :: Int32 -> Int
-- > testp k = (n - ) . length . group . sort . map hs . take n $ ls
-- >   where ls = [] : [c : l | l <- ls, c <- ['\0'..'\xff']]
-- >         hs = foldl' f golden
-- >         f m c = fromIntegral (ord c) * k + hashInt32 m
-- >         n = 100000
--
-- We discover that testp magic = 0.

hashString :: String -> Int32
hashString = foldl' f golden
   where f m c = fromIntegral (ord c) * magic + hashInt32 m
         magic = 0xdeadbeef

-- | A prime larger than the maximum hash table size
prime :: Int32
prime = 33554467

-- -----------------------------------------------------------------------------
-- Parameters

tABLE_MAX :: Int32
tABLE_MAX = 32 * 1024 * 1024    -- Maximum size of hash table
tABLE_MIN :: Int32
tABLE_MIN = 8

hLOAD :: Int32
hLOAD = 7                       -- Maximum average load of a single hash bucket

hYSTERESIS :: Int32
hYSTERESIS = 64                 -- entries to ignore in load computation

{- Hysteresis favors long association-list-like behavior for small tables. -}

-- -----------------------------------------------------------------------------
-- Creating a new hash table

-- | Creates a new hash table.  The following property should hold for the @eq@
-- and @hash@ functions passed to 'new':
--
-- >   eq A B  =>  hash A == hash B
--
new
  :: (key -> key -> Bool)    -- ^ @eq@: An equality comparison on keys
  -> (key -> Int32)          -- ^ @hash@: A hash function on keys
  -> IO (HashTable key val)  -- ^ Returns: an empty hash table
new cmpr hash = do
  recordNew
  -- make a new hash table with a single, empty, segment
  let mask = tABLE_MIN - 1
  bkts <- newMutArray (0, mask) []
  let kcnt = 0
      ht = HT { buckets=bkts, kcount=kcnt, bmask=mask }
  table <- newIORef ht
  return (HashTable { tab=table, hash_fn=hash, cmp=cmpr })

{-
  bitTwiddleSameAs takes as arguments positive Int32s less than maxBound/2 and
  returns the smallest power of 2 that is greater than or equal to the
  argument.
  http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
-}
bitTwiddleSameAs :: Int32 -> Int32
bitTwiddleSameAs v0 =
    -- Smear the highest set bit of (v0-1) into every lower position,
    -- then add one to round up to the next power of two.
    let v1 = v0 - 1
        v2 = v1 .|. (v1 `shiftR` 1)
        v3 = v2 .|. (v2 `shiftR` 2)
        v4 = v3 .|. (v3 `shiftR` 4)
        v5 = v4 .|. (v4 `shiftR` 8)
        v6 = v5 .|. (v5 `shiftR` 16)
    in v6 + 1

{-
  powerOver takes as arguments Int32s and returns the smallest power of 2
  that is greater than or equal to the argument if that power of 2 is
  within [tABLE_MIN,tABLE_MAX]
-}
powerOver :: Int32 -> Int32
powerOver n =
    if n <= tABLE_MIN
    then tABLE_MIN
    else if n >= tABLE_MAX
         then tABLE_MAX
         else bitTwiddleSameAs n

-- | Creates a new hash table with the given minimum size.
newHint
  :: (key -> key -> Bool)    -- ^ @eq@: An equality comparison on keys
  -> (key -> Int32)          -- ^ @hash@: A hash function on keys
  -> Int                     -- ^ @minSize@: initial table size
  -> IO (HashTable key val)  -- ^ Returns: an empty hash table
newHint cmpr hash minSize = do
  recordNew
  -- make a new hash table with a single, empty, segment
  let mask = powerOver $ fromIntegral minSize
  bkts <- newMutArray (0, mask) []
  let kcnt = 0
      ht = HT { buckets=bkts, kcount=kcnt, bmask=mask }
  table <- newIORef ht
  return (HashTable { tab=table, hash_fn=hash, cmp=cmpr })

-- -----------------------------------------------------------------------------
-- Inserting a key\/value pair into the hash table

-- | Inserts a key\/value mapping into the hash table.
--
-- Note that 'insert' doesn't remove the old entry from the table -
-- the behaviour is like an association list, where 'lookup' returns
-- the most-recently-inserted mapping for a key in the table.  The
-- reason for this is to keep 'insert' as efficient as possible.  If
-- you need to update a mapping, then we provide 'update'.
--
insert :: HashTable key val -> key -> val -> IO ()
insert ht key val =
  updatingBucket CanInsert (\bucket -> ((key,val):bucket, 1, ())) ht key

-- ------------------------------------------------------------
-- The core of the implementation is lurking down here, in findBucket,
-- updatingBucket, and expandHashTable.

-- Should the table grow?  Ignore the first hYSTERESIS entries so
-- small tables behave like cheap association lists.
tooBig :: Int32 -> Int32 -> Bool
tooBig k b = k - hYSTERESIS > hLOAD * b

-- index of bucket within table.
bucketIndex :: Int32 -> Int32 -> Int32
bucketIndex mask h = h .&. mask

-- find the bucket in which the key belongs.
-- returns (key equality, bucket index, bucket)
--
-- This rather grab-bag approach gives enough power to do pretty much
-- any bucket-finding thing you might want to do.  We rely on inlining
-- to throw away the stuff we don't want.  I'm proud to say that this
-- plus updatingBucket below reduce most of the other definitions to a
-- few lines of code, while actually speeding up the hashtable
-- implementation when compared with a version which does everything
-- from scratch.
{-# INLINE findBucket #-}
findBucket :: HashTable key val -> key -> IO (HT key val, Int32, [(key,val)])
findBucket HashTable{ tab=ref, hash_fn=hash } key = do
  table@HT{ buckets=bkts, bmask=b } <- readIORef ref
  let indx = bucketIndex b (hash key)
  bucket <- readHTArray bkts indx
  return (table, indx, bucket)

data Inserts = CanInsert
             | Can'tInsert
             deriving (Eq)

-- updatingBucket is the real workhorse of all single-element table
-- updates.  It takes a hashtable and a key, along with a function
-- describing what to do with the bucket in which that key belongs.  A
-- flag indicates whether this function may perform table insertions.
-- The function returns the new contents of the bucket, the number of
-- bucket entries inserted (negative if entries were deleted), and a
-- value which becomes the return value for the function as a whole.
-- The table sizing is enforced here, calling out to expandSubTable as
-- necessary.

-- This function is intended to be inlined and specialized for every
-- calling context (eg every provided bucketFn).
{-# INLINE updatingBucket #-}
updatingBucket :: Inserts -> ([(key,val)] -> ([(key,val)], Int32, a)) ->
                  HashTable key val -> key ->
                  IO a
updatingBucket canEnlarge bucketFn
               ht@HashTable{ tab=ref, hash_fn=hash } key = do
  (table@HT{ kcount=k, buckets=bkts, bmask=b },
   indx, bckt) <- findBucket ht key
  (bckt', inserts, result) <- return $ bucketFn bckt
  let k' = k + inserts
      table1 = table { kcount=k' }
  writeMutArray bkts indx bckt'
  table2 <- if canEnlarge == CanInsert && inserts > 0 then do
               recordIns inserts k' bckt'
               if tooBig k' b
                  then expandHashTable hash table1
                  else return table1
            else return table1
  writeIORef ref table2
  return result

-- Double the table size, redistributing each old bucket's entries
-- between its old slot and the corresponding new slot (its index
-- differs only in the new top bit).  Returns the table unchanged if
-- it is already at tABLE_MAX.
expandHashTable :: (key -> Int32) -> HT key val -> IO (HT key val)
expandHashTable hash table@HT{ buckets=bkts, bmask=mask } = do
   let oldsize = mask + 1
       newmask = mask + mask + 1
   recordResize oldsize (newmask + 1)
   --
   if newmask > tABLE_MAX - 1
      then return table
      else do
   --
    newbkts <- newMutArray (0, newmask) []
    let
     splitBucket oldindex = do
       bucket <- readHTArray bkts oldindex
       let (oldb, newb) =
              partition ((oldindex==) . bucketIndex newmask . hash . fst) bucket
       writeMutArray newbkts oldindex oldb
       writeMutArray newbkts (oldindex + oldsize) newb
    mapM_ splitBucket [0..mask]
    return (table{ buckets=newbkts, bmask=newmask })

-- -----------------------------------------------------------------------------
-- Deleting a mapping from the hash table

-- Remove a key from a bucket
deleteBucket :: (key -> Bool) -> [(key,val)] -> ([(key,val)], Int32, ())
deleteBucket _   [] = ([], 0, ())
deleteBucket del (pair@(k,_):bucket) =
  case deleteBucket del bucket of
    (bucket', dels, _) | del k     -> dels' `seq` (bucket', dels', ())
                       | otherwise -> (pair:bucket', dels, ())
      where dels' = dels - 1

-- | Remove an entry from the hash table.
delete :: HashTable key val -> key -> IO ()
delete ht@HashTable{ cmp=eq } key =
  updatingBucket Can'tInsert (deleteBucket (eq key)) ht key

-- -----------------------------------------------------------------------------
-- Updating a mapping in the hash table

-- | Updates an entry in the hash table, returning 'True' if there was
-- already an entry for this key, or 'False' otherwise.  After 'update'
-- there will always be exactly one entry for the given key in the table.
--
-- 'insert' is more efficient than 'update' if you don't care about
-- multiple entries, or you know for sure that multiple entries can't
-- occur.  However, 'update' is more efficient than 'delete' followed
-- by 'insert'.
update :: HashTable key val -> key -> val -> IO Bool
update ht@HashTable{ cmp=eq } key val =
  updatingBucket CanInsert
    (\bucket -> let (bucket', dels, _) = deleteBucket (eq key) bucket
                in  ((key,val):bucket', 1 + dels, dels /= 0))
    ht key

-- -----------------------------------------------------------------------------
-- Looking up an entry in the hash table

-- | Looks up the value of a key in the hash table.
lookup :: HashTable key val -> key -> IO (Maybe val)
lookup ht@HashTable{ cmp=eq } key = do
  recordLookup
  (_, _, bucket) <- findBucket ht key
  -- foldr with firstHit returns the first (most recent) match lazily.
  let firstHit (k,v) r | eq key k  = Just v
                       | otherwise = r
  return (foldr firstHit Nothing bucket)

-- -----------------------------------------------------------------------------
-- Converting to/from lists

-- | Convert a list of key\/value pairs into a hash table.  Equality on keys
-- is taken from the Eq instance for the key type.
--
fromList :: (Eq key) => (key -> Int32) -> [(key,val)] -> IO (HashTable key val)
fromList hash list = do
  table <- new (==) hash
  sequence_ [ insert table k v | (k,v) <- list ]
  return table

-- | Converts a hash table to a list of key\/value pairs.
--
toList :: HashTable key val -> IO [(key,val)]
toList = mapReduce id concat

{-# INLINE mapReduce #-}
mapReduce :: ([(key,val)] -> r) -> ([r] -> r) -> HashTable key val -> IO r
mapReduce m r HashTable{ tab=ref } = do
  HT{ buckets=bckts, bmask=b } <- readIORef ref
  fmap r (mapM (fmap m . readHTArray bckts) [0..b])

-- -----------------------------------------------------------------------------
-- Diagnostics

-- | This function is useful for determining whether your hash
-- function is working well for your data set.  It returns the longest
-- chain of key\/value pairs in the hash table for which all the keys
-- hash to the same bucket.  If this chain is particularly long (say,
-- longer than 14 elements or so), then it might be a good idea to try
-- a different hash function.
--
longestChain :: HashTable key val -> IO [(key,val)]
longestChain = mapReduce id (maximumBy lengthCmp)
  where -- Compare list lengths without computing them: walk both
        -- spines together and stop at the shorter one.
        lengthCmp (_:x) (_:y) = lengthCmp x y
        lengthCmp []    []    = EQ
        lengthCmp []    _     = LT
        lengthCmp _     []    = GT