Skip to content

Improve execution of terms queries over wildcard fields #128986

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jun 6, 2025
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/changelog/128986.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 128986
summary: Improve execution of terms queries over wildcard fields
area: Search
type: bug
issues:
- 128201
1 change: 1 addition & 0 deletions x-pack/plugin/wildcard/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

apply plugin: 'elasticsearch.internal-es-plugin'
apply plugin: 'elasticsearch.internal-yaml-rest-test'
apply plugin: 'elasticsearch.internal-cluster-test'

esplugin {
name = 'wildcard'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

package org.elasticsearch.xpack.wildcard.search;

import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.index.query.TermQueryBuilder;
import org.elasticsearch.index.query.TermsQueryBuilder;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESIntegTestCase;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentFactory;
import org.elasticsearch.xcontent.XContentType;
import org.elasticsearch.xpack.wildcard.Wildcard;
import org.hamcrest.Matchers;
import org.junit.Before;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;

import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertResponse;

public class WildcardSearchIT extends ESIntegTestCase {

private List<String> terms = null;

/**
 * Returns a single-element list containing the {@link Wildcard} plugin class.
 */
@Override
protected Collection<Class<? extends Plugin>> nodePlugins() {
    final List<Class<? extends Plugin>> plugins = List.of(Wildcard.class);
    return plugins;
}

/**
 * Creates the {@code test} index with a {@code wildcard} field (type wildcard) and a
 * {@code keyword} field (type keyword), then bulk-indexes between 100 and 1000 random
 * documents and refreshes the index. Every indexed value is recorded in {@link #terms}.
 *
 * @throws IOException if building the mapping fails
 */
@Before
public void setup() throws IOException {
    terms = new ArrayList<>();

    // Both fields receive identical values so their query results can be compared.
    final XContentBuilder mapping = XContentFactory.jsonBuilder()
        .startObject()
        .startObject("properties")
        .startObject("wildcard")
        .field("type", "wildcard")
        .endObject()
        .startObject("keyword")
        .field("type", "keyword")
        .endObject()
        .endObject()
        .endObject();
    indicesAdmin().prepareCreate("test").setMapping(mapping).get();

    final int docCount = randomIntBetween(100, 1000);
    final BulkRequestBuilder bulk = client().prepareBulk();
    for (int doc = 0; doc < docCount; doc++) {
        // Most documents carry a single value; only rarely a multi-valued one is added.
        if (rarely() == false) {
            indexSingleValue(bulk);
        } else {
            indexMultiValue(bulk);
        }
    }

    final boolean anyFailure = bulk.get().hasFailures();
    assertFalse(anyFailure);
    indicesAdmin().prepareRefresh("test").get();
}

/**
 * Adds to the bulk request one document whose {@code wildcard} and {@code keyword}
 * fields hold the same random term, and records that term in {@link #terms}.
 *
 * @param builder the bulk request the document is appended to
 */
private void indexSingleValue(BulkRequestBuilder builder) {
    final String value = randomIndexString();
    final String json = "{\"wildcard\" : \"" + value + "\", \"keyword\" : \"" + value + "\"}";
    final IndexRequest request = new IndexRequest("test").source(json, XContentType.JSON);
    builder.add(request);
    terms.add(value);
}

/**
 * Adds to the bulk request one document whose {@code wildcard} and {@code keyword}
 * fields hold the same array of 1 to 10 random terms, recording each term in
 * {@link #terms}.
 *
 * @param builder the bulk request the document is appended to
 */
private void indexMultiValue(BulkRequestBuilder builder) {
    final int valueCount = randomIntBetween(1, 10);
    final String[] quoted = new String[valueCount];
    int slot = 0;
    while (slot < valueCount) {
        final String value = randomIndexString();
        terms.add(value);
        quoted[slot] = "\"" + value + "\"";
        slot++;
    }
    // Arrays.toString renders "[a, b, c]"; with every element already quoted this is a JSON array.
    final String jsonArray = Arrays.toString(quoted);
    final String json = "{\"wildcard\" : " + jsonArray + ", \"keyword\" : " + jsonArray + "}";
    builder.add(new IndexRequest("test").source(json, XContentType.JSON));
}

/**
 * Runs 50 rounds in which the same random term is queried with a term query against
 * the {@code wildcard} field and against the {@code keyword} field, asserting that
 * both searches report the same total hit count.
 */
public void testTermQueryDuel() {
    for (int round = 0; round < 50; round++) {
        final String queryTerm = randomQueryString(terms);
        final TermQueryBuilder onWildcard = new TermQueryBuilder("wildcard", queryTerm);
        final TermQueryBuilder onKeyword = new TermQueryBuilder("keyword", queryTerm);
        assertResponse(client().prepareSearch("test").setQuery(onWildcard), wildcardResponse -> {
            assertResponse(client().prepareSearch("test").setQuery(onKeyword), keywordResponse -> {
                assertThat(
                    wildcardResponse.getHits().getTotalHits().value(),
                    Matchers.equalTo(keywordResponse.getHits().getTotalHits().value())
                );
            });
        });
    }
}

public void testTermsQueryDuel() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How long will this test run if the number of terms is 8192? Is the performance acceptable for it to be included as a regular test?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is very fast; the full suite takes a couple of seconds.

for (int i = 0; i < 10; i++) {
String[] terms = new String[randomIntBetween(2, 8192)];
for (int j = 0; j < terms.length; j++) {
terms[j] = randomQueryString(this.terms);
}
TermsQueryBuilder termsQueryBuilder1 = new TermsQueryBuilder("wildcard", terms);
TermsQueryBuilder termsQueryBuilder2 = new TermsQueryBuilder("keyword", terms);
assertResponse(
client().prepareSearch("test").setQuery(termsQueryBuilder1),
response -> assertResponse(
client().prepareSearch("test").setQuery(termsQueryBuilder2),
response2 -> assertThat(
response.getHits().getTotalHits().value(),
Matchers.equalTo(response2.getHits().getTotalHits().value())
)
)
);
}
}

/**
 * Produces a random alphabetic string of length 0 to 30 which is rarely decorated
 * with a literal {@code *} as suffix, as prefix, or on both sides.
 *
 * @return the value to index
 */
private static String randomIndexString() {
    final String base = randomAlphaOfLength(randomIntBetween(0, 30));
    // The checks are evaluated in this order, each one only if the previous did not fire.
    if (rarely()) {
        return base + "*";
    }
    if (rarely()) {
        return "*" + base;
    }
    if (rarely()) {
        return "*" + base + "*";
    }
    return base;
}

/**
 * Picks a string to query with: rarely one of the supplied terms, otherwise either a
 * random alphabetic string of length 0 to 30 or a single random letter followed by
 * a literal {@code *}.
 *
 * @param terms candidate terms to pick from
 * @return the string to use in a query
 */
private static String randomQueryString(List<String> terms) {
    if (rarely()) {
        final int pick = randomIntBetween(0, terms.size() - 1);
        return terms.get(pick);
    }
    if (randomBoolean()) {
        return randomAlphaOfLength(randomIntBetween(0, 30));
    }
    return randomAlphaOfLength(1) + "*";
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.FieldExistsQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
Expand All @@ -33,6 +34,7 @@
import org.apache.lucene.search.MultiTermQuery.RewriteMethod;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.WildcardQuery;
Expand Down Expand Up @@ -80,11 +82,14 @@
import java.nio.charset.StandardCharsets;
import java.time.ZoneId;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import static org.elasticsearch.index.IndexSettings.IGNORE_ABOVE_SETTING;

Expand All @@ -95,6 +100,7 @@ public class WildcardFieldMapper extends FieldMapper {

public static final String CONTENT_TYPE = "wildcard";
public static final short MAX_CLAUSES_IN_APPROXIMATION_QUERY = 10;
private static final int WILDCARD_TERMS_EXPANSION_LIMIT = 16;
public static final int NGRAM_SIZE = 3;

static final NamedAnalyzer WILDCARD_ANALYZER_7_10 = new NamedAnalyzer("_wildcard_7_10", AnalyzerScope.GLOBAL, new Analyzer() {
Expand Down Expand Up @@ -859,6 +865,63 @@ public Query termQuery(Object value, SearchExecutionContext context) {
}
}

/**
 * Builds a terms query as an approximation query over the field's tokens, wrapped in a
 * {@link ConstantScoreQuery} and passed to {@code BinaryDvConfirmedQuery.fromTerms}
 * together with the field name and the de-duplicated terms.
 *
 * @param values  the values to match; duplicates are removed before the query is built
 * @param context the search execution context; not used by this method
 * @return the query produced by {@code BinaryDvConfirmedQuery.fromTerms}
 */
@Override
public Query termsQuery(Collection<?> values, @Nullable SearchExecutionContext context) {
    final BytesRef[] terms = buildTerms(values);
    final Query approximation;
    if (terms.length >= WILDCARD_TERMS_EXPANSION_LIMIT) {
        // With many terms, expanding every term into its own BooleanQuery would use too much memory.
        // A TermInSetQuery is generated instead. To match the necessary documents at least one token
        // per term is needed; ideally the token that makes the term most different from the others.
        final Set<String> termTokens = new LinkedHashSet<>();
        final Set<BytesRef> representatives = new TreeSet<>();
        for (BytesRef term : terms) {
            // Split the escaped search term into its tokens.
            final boolean matchAll = breakIntoTokens(escapeWildcardSyntax(term.utf8ToString()), termTokens);
            assert matchAll == false;
            if (termTokens.isEmpty() == false) {
                // The middle token stands in for the term, as it is probably the most distinctive one.
                representatives.add(getMiddleToken(termTokens));
                // Reset the scratch set before the next term.
                termTokens.clear();
            }
        }
        approximation = new TermInSetQuery(name(), representatives);
    } else {
        // With few terms, each term is approximated by its own BooleanQuery.
        final BooleanQuery.Builder disjunction = new BooleanQuery.Builder();
        for (BytesRef term : terms) {
            final BooleanQuery.Builder perTerm = new BooleanQuery.Builder();
            final Integer clauseCount = getApproxWildCardQuery(escapeWildcardSyntax(term.utf8ToString()), perTerm);
            if (clauseCount == null || clauseCount <= 0) {
                // No clauses were produced for this term, so it adds nothing to the approximation.
                continue;
            }
            disjunction.add(perTerm.build(), Occur.SHOULD);
        }
        approximation = disjunction.build();
    }
    return BinaryDvConfirmedQuery.fromTerms(new ConstantScoreQuery(approximation), name(), terms);
}

/**
 * Returns the middle element of the given set, in iteration order, converted with
 * {@code BytesRefs.toBytesRef}. For an even number of elements the earlier of the two
 * central elements is returned.
 *
 * @param tokens the tokens to choose from; asserted to contain an element at the chosen position
 * @return the chosen token as a {@link BytesRef}
 */
private static BytesRef getMiddleToken(Set<String> tokens) {
    final Iterator<String> cursor = tokens.iterator();
    // Number of leading elements to step over before reaching the middle one.
    int remainingToSkip = (tokens.size() + 1) / 2 - 1;
    while (remainingToSkip > 0) {
        cursor.next();
        remainingToSkip--;
    }
    assert cursor.hasNext();
    final String middle = cursor.next();
    return BytesRefs.toBytesRef(middle);
}

/**
 * Removes duplicate values (by {@code equals}/{@code hashCode} of the raw values) and
 * converts each remaining value with {@code BytesRefs.toBytesRef}.
 *
 * @param values the raw values
 * @return one {@link BytesRef} per distinct value, in the iteration order of the de-duplicating set
 */
private static BytesRef[] buildTerms(Collection<?> values) {
    final Set<?> distinct = new HashSet<>(values);
    final BytesRef[] converted = new BytesRef[distinct.size()];
    int slot = 0;
    for (Object value : distinct) {
        converted[slot] = BytesRefs.toBytesRef(value);
        slot++;
    }
    return converted;
}

private static String escapeWildcardSyntax(String term) {
StringBuilder result = new StringBuilder();
for (int i = 0; i < term.length();) {
Expand Down
Loading