Skip to content

Commit

Permalink
Ideas for lucene query perf
Browse files Browse the repository at this point in the history
  • Loading branch information
bbimber committed Feb 4, 2024
1 parent 262b703 commit 6690bda
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 14 deletions.
26 changes: 25 additions & 1 deletion jbrowse/src/org/labkey/jbrowse/JBrowseController.java
Original file line number Diff line number Diff line change
Expand Up @@ -910,7 +910,7 @@ public ApiResponse execute(LuceneQueryForm form, BindException errors)

try
{
return new ApiSimpleResponse(searcher.doSearch(getUser(), PageFlowUtil.decode(form.getSearchString()), form.getPageSize(), form.getOffset()));
return new ApiSimpleResponse(searcher.doSearch(getUser(), PageFlowUtil.decode(form.getSearchString()), form.getPageSize(), form.getOffset(), form.getLastDoc(), form.getLastScore()));
}
catch (Exception e)
{
Expand Down Expand Up @@ -947,6 +947,10 @@ public static class LuceneQueryForm

private int _offset = 0;

private int _lastDoc = -1;

private int _lastScore = -1;

public String getSearchString()
{
return _searchString;
Expand Down Expand Up @@ -996,6 +1000,26 @@ public void setTrackId(String trackId)
{
_trackId = trackId;
}

/**
 * Lucene doc id of the final ScoreDoc returned by the previous page, used to
 * resume pagination via IndexSearcher.searchAfter(). A value of -1 signals
 * that no previous page exists and a fresh search should be performed.
 *
 * @return the last doc id from the prior page, or -1 for a first-page request
 */
public int getLastDoc()
{
    return this._lastDoc;
}

/**
 * Records the Lucene doc id at which the previous results page ended, so the
 * next query can continue with IndexSearcher.searchAfter().
 *
 * @param docId last doc id of the prior page; pass -1 to request the first page
 */
public void setLastDoc(int docId)
{
    this._lastDoc = docId;
}

/**
 * Score of the final ScoreDoc returned by the previous page, paired with
 * {@link #getLastDoc()} to reconstruct the ScoreDoc handed to searchAfter().
 * A value of -1 signals a first-page request.
 *
 * NOTE(review): this is declared int, but Lucene's ScoreDoc.score is a float
 * and the search response emits the raw float score — the int round-trip
 * truncates. Consider widening the field/accessors to float; TODO confirm
 * against the client code that posts this form.
 *
 * @return the last score from the prior page, or -1 for a first-page request
 */
public int getLastScore()
{
    return this._lastScore;
}

/**
 * Records the score of the document at which the previous results page ended,
 * for use with IndexSearcher.searchAfter() pagination.
 *
 * NOTE(review): Lucene scores are floats; storing them in an int loses
 * precision — verify whether this field should be widened to float.
 *
 * @param score last score of the prior page; pass -1 to request the first page
 */
public void setLastScore(int score)
{
    this._lastScore = score;
}
}

public static class ResolveVcfFieldsForm
Expand Down
52 changes: 39 additions & 13 deletions jbrowse/src/org/labkey/jbrowse/JBrowseLuceneSearch.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.jetbrains.annotations.Nullable;
Expand Down Expand Up @@ -57,6 +58,8 @@

public class JBrowseLuceneSearch
{
private static final String ALL_DOCS = "all";

private final JBrowseSession _session;
private final JsonFile _jsonFile;
private final User _user;
Expand Down Expand Up @@ -130,7 +133,7 @@ public String extractFieldName(String queryString) {
return parts.length > 0 ? parts[0].trim() : null;
}

public JSONObject doSearch(User u, String searchString, final int pageSize, final int offset) throws IOException, ParseException
public JSONObject doSearch(User u, String searchString, final int pageSize, final int offset, final int lastDoc, final int lastScore) throws IOException, ParseException
{
searchString = tryUrlDecode(searchString);
File indexPath = _jsonFile.getExpectedLocationOfLuceneIndex(true);
Expand Down Expand Up @@ -182,14 +185,14 @@ public JSONObject doSearch(User u, String searchString, final int pageSize, fina

BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder();

if (searchString.equals("all")) {
if (searchString.equals(ALL_DOCS)) {
booleanQueryBuilder.add(new MatchAllDocsQuery(), BooleanClause.Occur.MUST);
}

// Split input into tokens, 1 token per query separated by &
StringTokenizer tokenizer = new StringTokenizer(searchString, "&");

while (tokenizer.hasMoreTokens() && !searchString.equals("all"))
while (tokenizer.hasMoreTokens() && !searchString.equals(ALL_DOCS))
{
String queryString = tokenizer.nextToken();
Query query = null;
Expand Down Expand Up @@ -226,18 +229,38 @@ else if (numericQueryParserFields.contains(fieldName))

BooleanQuery query = booleanQueryBuilder.build();

// TODO: if the index is generated with a sort on genomicPosition, maybe we just use Sort.INDEXORDER?
// Sort sort = Sort.INDEXORDER;
Sort sort = new Sort(new SortField("genomicPosition", SortField.Type.INT, false));

// Get chunks of size {pageSize}. Default to 1 chunk -- add to the offset to get more.
// We then iterate over the range of documents we want based on the offset. This does grow in memory
// linearly with the number of documents, but my understanding is that these are just score,id pairs
// Get chunks of size {pageSize}. Default to 1 chunk -- add to the offset to get more.
// We then iterate over the range of documents we want based on the offset. This does grow in memory
// linearly with the number of documents, but my understanding is that these are just score,id pairs
// rather than full documents, so mem usage *should* still be pretty low.
//TopDocs topDocs = indexSearcher.search(query, pageSize * (offset + 1));

// Define sort field
SortField sortField = new SortField("pos", SortField.Type.INT, false);
Sort sort = new Sort(sortField);

// Perform the search with sorting
TopFieldDocs topDocs = indexSearcher.search(query, pageSize * (offset + 1), sort);
// TODO: rather than a simple offset, the client should be able to provide the max(genomicPosition).
// We could add this as a lucene filter (i.e., get the first pageSize docs above that value),
// which might really reduce what we need to scan though.
// Note that filter is different than query string (https://javaranch.com/journal/2009/02/filtering-a-lucene-search.html)
// A second idea is to use searchAfter(). If we knew the max(genomicPosition) of the last query, we could directly add it to that method
TopDocs topDocs;
//if (searchString.equals(ALL_DOCS))
//{
// // TODO: since this is loaded on each page, consider special casing it to grab the first 100 records as fast as possible.
// // this will return the index size
// indexSearcher.getIndexReader().numDocs()
//}

if (lastDoc > -1)
{
topDocs = indexSearcher.searchAfter(new ScoreDoc(lastDoc, lastScore), query, pageSize, sort);
}
else
{
topDocs = indexSearcher.search(query, pageSize * (offset + 1), sort);
}

JSONObject results = new JSONObject();

Expand All @@ -247,7 +270,8 @@ else if (numericQueryParserFields.contains(fieldName))
for (int i = pageSize * offset; i < Math.min(pageSize * (offset + 1), topDocs.scoreDocs.length); i++)
{
JSONObject elem = new JSONObject();
Document doc = indexSearcher.doc(topDocs.scoreDocs[i].doc);
ScoreDoc sd = topDocs.scoreDocs[i];
Document doc = indexSearcher.doc(sd.doc);

for (IndexableField field : doc.getFields()) {
String fieldName = field.name();
Expand All @@ -266,6 +290,8 @@ else if (numericQueryParserFields.contains(fieldName))

results.put("data", data);
results.put("totalHits", topDocs.totalHits.value);
results.put("lastDoc", topDocs.scoreDocs[topDocs.scoreDocs.length - 1].doc);
results.put("lastScore", topDocs.scoreDocs[topDocs.scoreDocs.length - 1].score);

//TODO: we should probably stream this
return results;
Expand Down

0 comments on commit 6690bda

Please sign in to comment.