module Database.TxtSushi.ExternalSort (
    externalSort,
    externalSortBy,
    externalSortByConstrained,
    defaultByteQuota,
    defaultMaxOpenFiles) where

import Control.Monad
import Data.Binary
import Data.Binary.Get
import Data.Int
import qualified Data.ByteString.Lazy as BS
import Data.List
import System.IO
import System.IO.Unsafe
import System.Directory

-- | performs an external sort on the given list using the default resource
--   constraints
externalSort :: (Binary b, Ord b) => [b] -> [b]
externalSort = externalSortBy compare

-- | performs an external sort on the given list using the given comparison
--   function and the default resource constraints
externalSortBy :: (Binary b) => (b -> b -> Ordering) -> [b] -> [b]
externalSortBy = externalSortByConstrained defaultByteQuota defaultMaxOpenFiles

-- | Currently 16 MB. Don't rely on this value staying the same in future
--   releases!
defaultByteQuota :: Int
defaultByteQuota = 16 * 1024 * 1024

-- | Currently 17 files. Don't rely on this value staying the same in future
--   releases!
defaultMaxOpenFiles :: Int
defaultMaxOpenFiles = 17

-- | performs an external sort on the given list using the given resource
--   constraints
-- NOINLINE keeps GHC from floating/duplicating the unsafePerformIO thunk
{-# NOINLINE externalSortByConstrained #-}
externalSortByConstrained :: (Binary b, Integral i) => i -> i -> (b -> b -> Ordering) -> [b] -> [b]
externalSortByConstrained byteQuota maxOpenFiles cmp xs = unsafePerformIO $ do
    sortedChunkFiles <- bufferPartialSortsBy (fromIntegral byteQuota) cmp xs

    -- now we must merge together the partial sorts
    externalMergeAllBy (fromIntegral maxOpenFiles) cmp sortedChunkFiles

-- | merge a list of sorted lists into a single sorted list
mergeAllBy :: (a -> a -> Ordering) -> [[a]] -> [a]
mergeAllBy _   []         = []
mergeAllBy _   [onlyList] = onlyList
mergeAllBy cmp [xs, ys]   = mergeBy cmp xs ys
mergeAllBy cmp sortedLists =
    -- recurse after breaking the list down to about 1/2 the size
    mergeAllBy cmp (partitionAndMerge 2 cmp sortedLists)
    -- TODO add a smart adjustment so that the last partition will not ever
    -- be more than 1 element different than the others
-- | partitions the given sorted lists into groupings containing
--   `partitionSize` or fewer lists then merges each of those partitions.
--   So the returned list should normally be shorter than the given list
partitionAndMerge :: Int -> (a -> a -> Ordering) -> [[a]] -> [[a]]
partitionAndMerge _ _ [] = []
partitionAndMerge partitionSize cmp listList =
    map (mergeAllBy cmp) (regularPartitions partitionSize listList)

-- | chops up the given list at regular intervals
regularPartitions :: Int -> [a] -> [[a]]
regularPartitions _ [] = []
regularPartitions partitionSize xs
    -- BUG FIX: for a non-positive size, `splitAt` would make no progress and
    -- this function would loop forever; fall back to a single partition
    | partitionSize <= 0 = [xs]
    | otherwise =
        let (currPartition, otherXs) = splitAt partitionSize xs
        in  currPartition : regularPartitions partitionSize otherXs

-- | merge two sorted lists into a single sorted list. The `GT`-vs-`_` split
--   makes the merge stable: on ties the element from the first list wins
mergeBy :: (a -> a -> Ordering) -> [a] -> [a] -> [a]
mergeBy _ [] list2 = list2
mergeBy _ list1 [] = list1
mergeBy comparisonFunction list1@(head1:tail1) list2@(head2:tail2) =
    case head1 `comparisonFunction` head2 of
        GT -> head2 : mergeBy comparisonFunction list1 tail2
        _  -> head1 : mergeBy comparisonFunction tail1 list2

-- | performs one merge pass: groups the given sorted temp files, merges each
--   group into a fresh temp file (deleting the inputs as they are consumed)
--   and returns the names of the new files
externalMergePass :: Binary b => Int -> (b -> b -> Ordering) -> [String] -> IO [String]
externalMergePass _ _ [] = return []
externalMergePass maxOpenFiles cmp files = do
    -- we use (maxOpenFiles - 1) because we need to account for the extra
    -- file handle used during the merge.
    -- BUG FIX: the fan-in is clamped to at least 2. With the original
    -- unclamped (maxOpenFiles - 1), any maxOpenFiles <= 2 gave a group size
    -- of 1 or 0, so every pass emitted at least as many files as it
    -- consumed and externalMergeAllBy never terminated. Clamping may
    -- briefly exceed a degenerate open-file cap but guarantees progress
    let fanIn = max 2 (maxOpenFiles - 1)
        (mergeNowFiles, mergeLaterFiles) = splitAt fanIn files
    mergeNowBinStrs <- readThenDelBinFiles mergeNowFiles

    let mergeNowBinaries = map decodeAll mergeNowBinStrs
    mergedNowFile <- bufferToTempFile $ mergeAllBy cmp mergeNowBinaries

    mergedLaterFiles <- externalMergePass maxOpenFiles cmp mergeLaterFiles
    return $ mergedNowFile : mergedLaterFiles

-- | repeatedly runs merge passes until a single sorted stream remains, then
--   lazily decodes that stream
externalMergeAllBy :: Binary b => Int -> (b -> b -> Ordering) -> [String] -> IO [b]
externalMergeAllBy _ _ [] = return []
-- TODO do i need to write singleton lists to file in order to keep the max open file promise??
externalMergeAllBy _ _ [singletonFile] =
    readThenDelBinFile singletonFile >>= return . decodeAll
externalMergeAllBy maxOpenFiles cmp files = do
    partiallyMergedFiles <- externalMergePass maxOpenFiles cmp files
    externalMergeAllBy maxOpenFiles cmp partiallyMergedFiles
-- | create a list of partial sorts: each in-memory chunk (bounded by the
--   byte quota) is sorted with the given comparison and flushed to its own
--   temp file; returns the temp file names
bufferPartialSortsBy :: (Binary b) => Int64 -> (b -> b -> Ordering) -> [b] -> IO [String]
bufferPartialSortsBy _ _ [] = return []
bufferPartialSortsBy byteQuota cmp xs = do
    let (sortNowList, sortLaterList) = splitAfterQuota byteQuota xs
        sortedRows = sortBy cmp sortNowList
    sortBuffer <- bufferToTempFile sortedRows
    otherSortBuffers <- bufferPartialSortsBy byteQuota cmp sortLaterList
    return (sortBuffer : otherSortBuffers)

-- TODO not efficient! we're converting to binary twice so that we don't have
-- the bytestrings buffered to memory during the sort (that would about double
-- our mem usage). I think the right answer is to add a class extending binary
-- that has a sizeOf function
-- | splits the list at the first element whose encoded size pushes the
--   running total past the byte quota (that element is included in the
--   first half, so the first half is never empty for non-empty input)
splitAfterQuota :: (Binary b) => Int64 -> [b] -> ([b], [b])
splitAfterQuota _ [] = ([], [])
splitAfterQuota quotaInBytes (binaryHead:binaryTail) =
    let quotaRemaining = quotaInBytes - BS.length (encode binaryHead)
        -- lazily bound: only forced when the quota is not yet exhausted
        (fstBinsTail, sndBins) = splitAfterQuota quotaRemaining binaryTail
    in
        if quotaRemaining <= 0
            then ([binaryHead], binaryTail)
            else (binaryHead : fstBinsTail, sndBins)

-- | lazily reads then deletes the given files
readThenDelBinFiles :: [String] -> IO [BS.ByteString]
readThenDelBinFiles = sequence . map readThenDelBinFile

-- | lazily reads then deletes the given file after the last byte is read
--   (relies on lazy ByteString I/O: the unsafeInterleaveIO'd removeFile only
--   fires when the appended empty tail is demanded)
readThenDelBinFile :: String -> IO BS.ByteString
readThenDelBinFile fileName = do
    binStr <- BS.readFile fileName
    emptyStr <- unsafeInterleaveIO $ removeFile fileName >> return BS.empty
    return $ binStr `BS.append` emptyStr

-- | buffer the binaries to a temporary file and return that file's path.
--   BUG FIX: the original special-cased the empty list with `return []`,
--   handing callers the empty string as a "path"; externalMergePass could
--   reach that case (all-empty inputs) and a later readThenDelBinFile ""
--   would throw. We now always create a real temp file — an empty file
--   decodes back to [] via decodeAll, so behavior is otherwise unchanged
bufferToTempFile :: (Binary b) => [b] -> IO String
bufferToTempFile xs = do
    tempDir <- getTemporaryDirectory
    (tempFilePath, tempFileHandle) <- openBinaryTempFile tempDir "sort.txt"
    BS.hPut tempFileHandle (encodeAll xs)
    hClose tempFileHandle
    return tempFilePath

-- | encode every element and concatenate the results into one lazy stream
encodeAll :: (Binary b) => [b] -> BS.ByteString
encodeAll = BS.concat . map encode

-- | lazily decode a stream of back-to-back encoded values
--   NOTE(review): runGetState is from older binary APIs (deprecated in
--   binary >= 0.6) — confirm against the package version this builds with
decodeAll :: (Binary b) => BS.ByteString -> [b]
decodeAll bs
    | BS.null bs = []
    | otherwise =
        let (decodedBin, remainingBs, _) = runGetState get bs 0
        in  decodedBin : decodeAll remainingBs