# Navigation

"""Basic indexing and searching example adapted from http://lucene.apache.org/core/4_10_1/core/index.html"""importlucenelucene.initVM()fromorg.apache.luceneimportanalysis,document,index,queryparser,search,store,utilfromlupyneimportengine### lucene ###analyzer=analysis.standard.StandardAnalyzer(util.Version.LUCENE_CURRENT)# Store the index in memory:directory=store.RAMDirectory()# To store an index on disk, use this instead:# Directory directory = FSDirectory.open(File("/tmp/testindex"))config=index.IndexWriterConfig(util.Version.LUCENE_CURRENT,analyzer)iwriter=index.IndexWriter(directory,config)doc=document.Document()text="This is the text to be indexed."doc.add(document.Field("fieldname",text,document.TextField.TYPE_STORED))iwriter.addDocument(doc)iwriter.close()# Now search the index:ireader=index.IndexReader.open(directory)isearcher=search.IndexSearcher(ireader)# Parse a simple query that searches for "text":parser=queryparser.classic.QueryParser(util.Version.LUCENE_CURRENT,"fieldname",analyzer)query=parser.parse("text")hits=isearcher.search(query,None,1000).scoreDocsassertlen(hits)==1# Iterate through the results:forhitinhits:hitDoc=isearcher.doc(hit.doc)asserthitDoc['fieldname']==textireader.close()directory.close()### lupyne #### Store the index in memory:indexer=engine.Indexer()# Indexer combines Writer and Searcher; RAMDirectory and StandardAnalyzer are defaultsindexer.set('fieldname',stored=True)# settings for all documents of indexer; indexed and tokenized is the defaultindexer.add(fieldname=text)# add documentindexer.commit()# commit changes and refresh searcher# Now search the index:hits=indexer.search('text',field='fieldname')# parsing handled if necessaryassertlen(hits)==1forhitinhits:# hits support mapping interfaceasserthit['fieldname']==text# closing is handled automatically

"""Advanced searching with custom fields.Prefix and Range queries are a potential pitfall in Lucene.As the queries expand to more terms, the performance drops off precipitously.A common example is where datetimes are indexed, but a large span of date ranges are being searched.The usual workaround is to only index the amount of granularity needed, e.g., just the dates.But this may not be sufficient, or the datetimes may be necessary for other searches.The general solution is to index the term values into a prefix tree.Then each query can expand to only values of the appropriate granularity.Lucene's NumericFields encode numbers to be sortable, so it is also able to cluster prefixes into the same field.Whereas Lupyne's NestedField assumes the value is already a sortable string, so different fields must be used to cluster the prefixes.There are trade-offs to each approach: * NumericFields support range queries natively, but must translate prefix queries. * NestedFields support prefix queries optimally, but must translate range queries. * NumericFields only support numbers, and result in unreadable values in the index. 
* NestedFields support any searchable values, but pollute the field namespace.Lupyne PointFields and DateTimeFields are now implemented as NumericFields since both are easily encoded as numbers.NestedFields could still be used however, as demonstrated on dates below."""fromdatetimeimportdateimportlucenelucene.initVM()fromorg.apache.luceneimportsearchfromlupyneimportenginedocs=[{'city':'San Francisco','state':'CA','incorporated':'1850-04-15','population':808976,'longitude':-122.4192,'latitude':37.7752},{'city':'Los Angeles','state':'CA','incorporated':'1850-04-04','population':3849378,'longitude':-118.2434,'latitude':34.0521},{'city':'Portland','state':'OR','incorporated':'1851-02-08','population':575930,'longitude':-122.6703,'latitude':45.5238},]indexer=engine.Indexer()indexer.set('city',stored=True,indexed=False)indexer.set('state',stored=True,indexed=False)# set method supports custom field types inheriting their default settingsindexer.set('incorporated',engine.DateTimeField)indexer.set('year-month-day',engine.NestedField,sep='-')indexer.set('population',engine.NumericField,type=int)indexer.set('point',engine.PointField,precision=10)# assigned fields can have a different key from their underlying field nameindexer.fields['location']=engine.NestedField('state.city')fordocindocs:doc['year-month-day']=doc['incorporated']point=doc.pop('longitude'),doc.pop('latitude')location=doc['state']+'.'+doc['city']incorporated=map(int,doc.pop('incorporated').split('-'))indexer.add(doc,location=location,incorporated=date(*incorporated),point=[point])indexer.commit()query=indexer.fields['incorporated'].prefix([1850])assertquery.max.doubleValue()-query.min.doubleValue()==60*60*24*365assert[hit['city']forhitinindexer.search(query)]==['San Francisco','Los Angeles']query=indexer.fields['incorporated'].range(date(1850,4,10),None)assertquery.maxisNoneassert[hit['city']forhitinindexer.search(query)]==['San 
Francisco','Portland']query=indexer.fields['year-month-day'].prefix('1850')assertstr(query)=='year:1850*'assert[hit['city']forhitinindexer.search(query)]==['San Francisco','Los Angeles']query=indexer.fields['year-month-day'].range('1850-04-10',None)assertstr(query)=='year-month-day:[1850-04-10 TO *}'assert[hit['city']forhitinindexer.search(query)]==['San Francisco','Portland']query=indexer.fields['population'].range(0,1000000)assertstr(query)=='population:[0 TO 1000000}'assert[hit['city']forhitinindexer.search(query)]==['San Francisco','Portland']cities=['San Francisco','Los Angeles','Portland']forindex,distanceinenumerate([1e3,1e5,2e5,1e6]):query=indexer.fields['point'].within(-122.4,37.7,distance=distance)assertisinstance(query,search.BooleanQuery)andlen(query)<=4assertset(hit['city']forhitinindexer.search(query))==set(cities[:index])query=indexer.fields['location'].prefix('CA.San')# works like any prefix queryassertstr(query)=='state.city:CA.San*'assert[hit['city']forhitinindexer.search(query)]==['San Francisco']query=indexer.fields['location'].prefix('CA')# optimized to search the best fieldassertstr(query)=='state:CA*'assert[hit['city']forhitinindexer.search(query)]==['San Francisco','Los Angeles']

"""PyLucene has several pitfalls when collecting or sorting a large query result.Generally they involve the overhead of traversing the VM in an internal loop.Lucene also requires supplying a maximum doc count for searches,and supplying an excessively large count is a poor workaround because the collection heap is pre-allocated.Finally the custom sorting interface, although well-supported in PyLucene, has horrible performance.The sort key of every potential doc must realistically be cached,but the overhead of O(n log n) comparison calls dispatched through the VM is far worse than iterating ScoreDocs.To mitigate all these problems, Lupyne first provides a unified search interface.The same Hits type is returned regardless of optional doc count or sorting parameters.As with lucene, the result is fully evaluated but each individual Hit object will only be loaded on demand.Internally a CachingCollector is used when all docs are requested.The search method allows lucene Sort parameters to be passed through, since that's still optimal.Additionally the hits themselves can be sorted afterwards with any python callable key.The IndexSearcher.comparator method is convenient for creating a sort key table from indexed fields.The upshot is custom sorting and sorting large results are both easier and faster.Custom sorting isn't necessary in the below example of course, just there for demonstration."""importlucenelucene.initVM()fromorg.apache.luceneimportsearch,utilfromorg.apache.pylucene.searchimportPythonFieldComparator,PythonFieldComparatorSourcefromlupyneimportenginecolors='red','green','blue','cyan','magenta','yellow'indexer=engine.Indexer()indexer.set('color',stored=True,tokenized=False)forcolorincolors:indexer.add(color=color)indexer.commit()### lucene 
###searcher=search.IndexSearcher(indexer.indexReader)sorter=search.Sort(search.SortField('color',search.SortField.Type.STRING))topdocs=searcher.search(search.MatchAllDocsQuery(),None,10,sorter)assert[searcher.doc(scoredoc.doc)['color']forscoredocintopdocs.scoreDocs]==sorted(colors)classComparatorSource(PythonFieldComparatorSource):classnewComparator(PythonFieldComparator):def__init__(self,name,numHits,sortPos,reversed):PythonFieldComparator.__init__(self)self.name=nameself.values=[None]*numHitsdefsetNextReader(self,reader,*args):ifnotargs:reader=reader.reader()comparator=search.FieldCache.DEFAULT.getTermsIndex(reader,self.name)iflucene.VERSION.startswith('4.8'):br=util.BytesRef()self.comparator=[comparator.get(id,br)orbr.utf8ToString()foridinrange(reader.maxDoc())]else:self.comparator=[comparator.get(id).utf8ToString()foridinrange(reader.maxDoc())]returnselfdefcompare(self,slot1,slot2):returncmp(self.values[slot1],self.values[slot2])defsetBottom(self,slot):self._bottom=self.values[slot]defcompareBottom(self,doc):returncmp(self._bottom,self.comparator[doc])defcopy(self,slot,doc):self.values[slot]=self.comparator[doc]defvalue(self,slot):passsorter=search.Sort(search.SortField('color',ComparatorSource()))# still must supply excessive doc count to use the sortertopdocs=searcher.search(search.MatchAllDocsQuery(),None,10,sorter)assert[searcher.doc(scoredoc.doc)['color']forscoredocintopdocs.scoreDocs]==sorted(colors)### lupyne ###hits=indexer.search(sort='color')assert[hit['color']forhitinhits]==sorted(colors)comparator=indexer.comparator('color')assertlist(comparator)==list(colors)hits=indexer.search().sorted(comparator.__getitem__)assert[hit['color']forhitinhits]==sorted(colors)

"""Grouping and facets.Lupyne supports lucene's contrib grouping.GroupingSearch interface, but it has some limitations.GroupingSearch objects only support single-valued strings, and won't find zero-valued facets.Lupyne also supports grouping hits by an arbitrary function after the original search,Similar to sorting, the native approach is generally more efficient, proportional to the number of documents culled.Lupyne also supports using cached filters to compute facet counts.Although seemingly less efficient, it is significantly faster with small numbers of terms.It also has no limitations on multiple values, and can be fully customized without reindexing."""importitertoolsimportlucenelucene.initVM()fromlupyneimportenginecolors='red','green','blue','cyan','magenta','yellow'facets=dict(zip(colors,itertools.count(1)))indexer=engine.Indexer()indexer.set('color',stored=True,tokenized=False)forcolorinfacets:forindexinrange(facets[color]):indexer.add(color=color)indexer.commit()query=engine.Query.alldocs()# group using native GroupingSearchforhitsinindexer.groupby('color',query):assertfacets[hits.value]==hits.counthit,=hitsasserthit['color']==hits.value# group using Hits interfaceforhitsinindexer.search(query).groupby(indexer.comparator('color').__getitem__,docs=1):assertfacets[hits.value]==hits.counthit,=hitsasserthit['color']==hits.value# facets use a GroupingSearch if no filters are registeredassertindexer.facets(query,'color')['color']==facets# filters allow flexible customizations without any indexing changesindexer.filters['color']={'additive':engine.Query.any(color=colors[:3]).filter(),'subtractive':engine.Query.any(color=colors[3:]).filter(),}assertindexer.facets(query,'color')['color']=={'additive':6,'subtractive':15}

"""Parallel indexing.One of Lucene's shortcomings as a general purpose database is the lack of atomic partial updates.IndexWriter.updateDocument merely deletes and adds a document in a transaction.The burden is on the application to handle both the inefficiency and concurrency issues of updating unchanged fields.This is poorly suited for many scenarios, where there are large static fields (e.g. text) and small volatile fields (e.g. tags).Thus many applications must keep volatile data in an external database, with poor performance when joining searches across vertical partitions.Solutions have been discussed for years (https://issues.apache.org/jira/browse/LUCENE-1879) with little progress.IndexWriters can now update DocValues in-place, but that's only a partial workaround since DocValues aren't indexed.ParallelReaders allow keeping the volatile fields in a separate index, but require syncing the ephemeral doc nums.This is essentially useless, as the whole point is that the indices wouldn't be updated with the same frequency.Lupyne provides another solution: parallel indexing with syncing on a unique indexed field.The most efficient way to intersect a search with outside data is to use a cached TermsFilter.Lupyne's TermsFilter provides a set-like interface for managing which unique terms should match.For simplicity and efficiency a searcher must also be registered with the filter before using it in a search.The TermsFilter instance manages the thread-safe cache, with optimal incremental updates of both terms and searchers.Additionally TermsFilters can be registered with IndexSearchers, such that reopening keeps the filter updated.Finally, for applications which can also keep the volatile data in a separate Lucene index,a ParallelIndexer will manage the matching terms by mapping the real underlying filters into terms,keeping the registered TermsFilters updated with every commit."""importlucenelucene.initVM()fromlupyneimportengine# setup main index with unique name 
fieldprimary=engine.Indexer()primary.set('name',stored=True,tokenized=False)primary.set('text')fornamein('alpha','bravo'):primary.add(name=name,text='large body of text')primary.commit()# setup parallel index with matching unique field and additional volatile fieldsecondary=engine.ParallelIndexer('name')secondary.set('votes',engine.NumericField)secondary.add(name='alpha',votes=1)secondary.add(name='bravo',votes=0)secondary.add(name='charlie',votes=1)secondary.commit()# automatically create and register TermsFilter, which matches positive votesreal_filter=secondary.fields['votes'].filter(1,None)assertstr(real_filter)=="votes:[1 TO *}"auto_filter=secondary.termsfilter(real_filter,primary)# instead of using parallel index, manually create and register TermsFilterman_filter=primary.termsfilter('name',['alpha','charlie'])# in either case: alpha matches, bravo doesn't, charlie doesn't exist (yet)forfilterin(man_filter,auto_filter):assert[hit['name']forhitinprimary.search(filter=filter)]==['alpha']# update vote countssecondary.update('alpha',votes=0)secondary.update('bravo',votes=1)secondary.commit()# instead of using parallel index, simulate the updates manuallyman_filter.discard('alpha')man_filter.add('bravo')# add missing document to main indexprimary.add(name='charlie')primary.commit()# in either case: alpha no longer matches, bravo now does, charlie now existsforfilterin(man_filter,auto_filter):assert[hit['name']forhitinprimary.search(filter=filter)]==['bravo','charlie']

"""Custom server.Fields settings are assigned directly to the root.Indexing is done here just to populate the example.A custom filter and sorter are demonstrated by transforming a date field into a year field.Filters are also used for faceting; sorters are also used for grouping.Example queries: * http://localhost:8080/search?q=date:17*&group=year * http://localhost:8080/search?q=date:17*&group=year&sort=-year * http://localhost:8080/search?count=0&facets=year * http://localhost:8080/search?q=text:right&count=3&facets=year"""importlucenefromlupyneimportengine,serverfromtestimportfixturedefparse(date):returnint(date.utf8ToString().split('-')[0])if__name__=='__main__':lucene.initVM(vmargs='-Xrs')root=server.WebIndexer()# assign field settingsroot.indexer.set('amendment',stored=True,tokenized=False)root.indexer.set('date',stored=True,tokenized=False)root.indexer.set('text')# populate indexfordocinfixture.constitution.docs():if'amendment'indoc:root.indexer.add(doc)root.update()# assign custom filter and sorter based on yearroot.searcher.sorters['year']=engine.SortField('date',int,parse)years=set(date.split('-')[0]fordateinroot.searcher.terms('date'))root.searcher.filters['year']=dict((year,engine.Query.prefix('date',year).filter())foryearinyears)# start with pretty-printingserver.start(root,config={'global':{'tools.json_out.indent':2}})