Improve execution of terms queries over wildcard fields (elastic#128986)

iverase · valeriy42 · commit e31f5edbd313 · 2025-06-12T13:27:36.000+02:00
This commit implements termsQuery(Collection<?> values, @Nullable SearchExecutionContext context) in the WildcardFieldMapper to avoid memory pressure when building the query.
diff --git a/docs/changelog/128986.yaml b/docs/changelog/128986.yaml
@@ -0,0 +1,6 @@
+pr: 128986
+summary: Improve execution of terms queries over wildcard fields
+area: Search
+type: bug
+issues:
+ - 128201
diff --git a/x-pack/plugin/wildcard/build.gradle b/x-pack/plugin/wildcard/build.gradle
@@ -7,6 +7,7 @@
 
 apply plugin: 'elasticsearch.internal-es-plugin'
 apply plugin: 'elasticsearch.internal-yaml-rest-test'
+apply plugin: 'elasticsearch.internal-cluster-test'
 
 esplugin {
   name = 'wildcard'
diff --git a/x-pack/plugin/wildcard/src/internalClusterTest/java/org/elasticsearch/xpack/wildcard/search/WildcardSearchIT.java b/x-pack/plugin/wildcard/src/internalClusterTest/java/org/elasticsearch/xpack/wildcard/search/WildcardSearchIT.java
@@ -0,0 +1,153 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.wildcard.search;
+
+import org.elasticsearch.action.bulk.BulkRequestBuilder;
+import org.elasticsearch.action.index.IndexRequest;
+import org.elasticsearch.index.query.TermQueryBuilder;
+import org.elasticsearch.index.query.TermsQueryBuilder;
+import org.elasticsearch.plugins.Plugin;
+import org.elasticsearch.test.ESIntegTestCase;
+import org.elasticsearch.xcontent.XContentBuilder;
+import org.elasticsearch.xcontent.XContentFactory;
+import org.elasticsearch.xcontent.XContentType;
+import org.elasticsearch.xpack.wildcard.Wildcard;
+import org.hamcrest.Matchers;
+import org.junit.Before;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+
+import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertResponse;
+
+/**
+ * Integration test that indexes the same randomly generated values into a {@code wildcard} field and a
+ * {@code keyword} field of one index, and then checks that term and terms queries report the same total
+ * hit count on both fields.
+ */
+public class WildcardSearchIT extends ESIntegTestCase {
+
+    /** Every value indexed by {@link #setup()}; used as a pool of query strings by the tests. */
+    private List<String> terms = null;
+
+    /** Registers the {@link Wildcard} plugin on the nodes of the test cluster. */
+    @Override
+    protected Collection<Class<? extends Plugin>> nodePlugins() {
+        return List.of(Wildcard.class);
+    }
+
+    /**
+     * Creates the {@code test} index with a {@code wildcard} and a {@code keyword} field, bulk-indexes
+     * between 100 and 1000 documents (multi-valued whenever {@code rarely()} returns true, single-valued
+     * otherwise) and refreshes the index.
+     *
+     * @throws IOException if building the mapping fails
+     */
+    @Before
+    public void setup() throws IOException {
+        terms = new ArrayList<>();
+        XContentBuilder xcb = XContentFactory.jsonBuilder()
+            .startObject()
+            .startObject("properties")
+            .startObject("wildcard")
+            .field("type", "wildcard")
+            .endObject()
+            .startObject("keyword")
+            .field("type", "keyword")
+            .endObject()
+            .endObject()
+            .endObject();
+        indicesAdmin().prepareCreate("test").setMapping(xcb).get();
+        final int numDocs = randomIntBetween(100, 1000);
+        final BulkRequestBuilder builder = client().prepareBulk();
+        for (int i = 0; i < numDocs; i++) {
+            if (rarely()) {
+                indexMultiValue(builder);
+            } else {
+                indexSingleValue(builder);
+            }
+        }
+        assertFalse(builder.get().hasFailures());
+        indicesAdmin().prepareRefresh("test").get();
+    }
+
+    /**
+     * Adds one document to the bulk request that holds the same random value in both fields, and records
+     * that value in {@link #terms}.
+     *
+     * @param builder the bulk request the document is added to
+     */
+    private void indexSingleValue(BulkRequestBuilder builder) {
+        String term = randomIndexString();
+        builder.add(
+            new IndexRequest("test").source("{\"wildcard\" : \"" + term + "\", \"keyword\" : \"" + term + "\"}", XContentType.JSON)
+        );
+        terms.add(term);
+    }
+
+    /**
+     * Adds one document to the bulk request that holds the same array of 1 to 10 random values in both
+     * fields, and records every value in {@link #terms}.
+     *
+     * @param builder the bulk request the document is added to
+     */
+    private void indexMultiValue(BulkRequestBuilder builder) {
+        int docSize = randomIntBetween(1, 10);
+        String[] docTerms = new String[docSize];
+        for (int i = 0; i < docSize; i++) {
+            String term = randomIndexString();
+            terms.add(term);
+            // Each element is stored already wrapped in double quotes so the array can be printed as JSON below.
+            docTerms[i] = "\"" + term + "\"";
+        }
+        builder.add(
+            new IndexRequest("test").source(
+                // Arrays.toString renders "[a, b]", which forms a JSON array because the elements are pre-quoted.
+                "{\"wildcard\" : " + Arrays.toString(docTerms) + ", \"keyword\" : " + Arrays.toString(docTerms) + "}",
+                XContentType.JSON
+            )
+        );
+    }
+
+    /**
+     * Runs 50 random single-term queries against both fields and asserts that the total hit counts match.
+     */
+    public void testTermQueryDuel() {
+        for (int i = 0; i < 50; i++) {
+            String term = randomQueryString(terms);
+            TermQueryBuilder termQueryBuilder1 = new TermQueryBuilder("wildcard", term);
+            TermQueryBuilder termQueryBuilder2 = new TermQueryBuilder("keyword", term);
+            assertResponse(
+                client().prepareSearch("test").setQuery(termQueryBuilder1),
+                response -> assertResponse(
+                    client().prepareSearch("test").setQuery(termQueryBuilder2),
+                    response2 -> assertThat(
+                        response.getHits().getTotalHits().value(),
+                        Matchers.equalTo(response2.getHits().getTotalHits().value())
+                    )
+                )
+            );
+        }
+    }
+
+    /**
+     * Runs 10 random terms queries, each with between 2 and 8192 values, against both fields and asserts
+     * that the total hit counts match.
+     */
+    public void testTermsQueryDuel() {
+        for (int i = 0; i < 10; i++) {
+            // This local array shadows the terms field, which is why the field is read through this.terms below.
+            String[] terms = new String[randomIntBetween(2, 8192)];
+            for (int j = 0; j < terms.length; j++) {
+                terms[j] = randomQueryString(this.terms);
+            }
+            TermsQueryBuilder termsQueryBuilder1 = new TermsQueryBuilder("wildcard", terms);
+            TermsQueryBuilder termsQueryBuilder2 = new TermsQueryBuilder("keyword", terms);
+            assertResponse(
+                client().prepareSearch("test").setQuery(termsQueryBuilder1),
+                response -> assertResponse(
+                    client().prepareSearch("test").setQuery(termsQueryBuilder2),
+                    response2 -> assertThat(
+                        response.getHits().getTotalHits().value(),
+                        Matchers.equalTo(response2.getHits().getTotalHits().value())
+                    )
+                )
+            );
+        }
+    }
+
+    /**
+     * Produces a value to index: a random alphabetic string of 0 to 30 characters that, depending on
+     * successive {@code rarely()} draws, gets a literal {@code *} appended, prepended, or both.
+     *
+     * @return the value to index
+     */
+    private static String randomIndexString() {
+        String string = randomAlphaOfLength(randomIntBetween(0, 30));
+        if (rarely()) {
+            return string + "*";
+        } else if (rarely()) {
+            return "*" + string;
+        } else if (rarely()) {
+            return "*" + string + "*";
+        } else {
+            return string;
+        }
+    }
+
+    /**
+     * Produces a value to query for: when {@code rarely()} returns true a randomly picked entry of
+     * {@code terms}, otherwise either a fresh random alphabetic string of 0 to 30 characters or a single
+     * random letter followed by {@code *}.
+     *
+     * @param terms the pool of indexed values to pick from; must not be empty when an entry is picked
+     * @return the value to query for
+     */
+    private static String randomQueryString(List<String> terms) {
+        if (rarely()) {
+            return terms.get(randomIntBetween(0, terms.size() - 1));
+        } else if (randomBoolean()) {
+            return randomAlphaOfLength(randomIntBetween(0, 30));
+        } else {
+            return randomAlphaOfLength(1) + "*";
+        }
+    }
+}
diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java
@@ -25,6 +25,7 @@
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.ConstantScoreQuery;
 import org.apache.lucene.search.FieldExistsQuery;
 import org.apache.lucene.search.FuzzyQuery;
 import org.apache.lucene.search.MatchAllDocsQuery;
@@ -33,6 +34,7 @@
 import org.apache.lucene.search.MultiTermQuery.RewriteMethod;
 import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermInSetQuery;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TermRangeQuery;
 import org.apache.lucene.search.WildcardQuery;
@@ -80,11 +82,14 @@
 import java.nio.charset.StandardCharsets;
 import java.time.ZoneId;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.TreeSet;
 
 import static org.elasticsearch.index.IndexSettings.IGNORE_ABOVE_SETTING;
 
@@ -95,6 +100,7 @@ public class WildcardFieldMapper extends FieldMapper {
 
     public static final String CONTENT_TYPE = "wildcard";
     public static final short MAX_CLAUSES_IN_APPROXIMATION_QUERY = 10;
+    private static final int WILDCARD_TERMS_EXPANSION_LIMIT = 16;
     public static final int NGRAM_SIZE = 3;
 
     static final NamedAnalyzer WILDCARD_ANALYZER_7_10 = new NamedAnalyzer("_wildcard_7_10", AnalyzerScope.GLOBAL, new Analyzer() {
@@ -859,6 +865,63 @@ public Query termQuery(Object value, SearchExecutionContext context) {
             }
         }
 
+        /**
+         * Builds a query for documents holding any of the given values. The values are de-duplicated, an
+         * n-gram based approximation query is built from them, and that approximation is wrapped in a
+         * {@link ConstantScoreQuery} and handed to {@code BinaryDvConfirmedQuery.fromTerms} together with
+         * the exact terms.
+         *
+         * @param values  the values to search for; wildcard syntax inside them is escaped before tokenizing
+         * @param context the search context; not used by this implementation
+         * @return the approximation query combined with the exact-term confirmation
+         */
+        @Override
+        public Query termsQuery(Collection<?> values, @Nullable SearchExecutionContext context) {
+            final BytesRef[] uniqueTerms = buildTerms(values);
+            final Query approximation;
+            if (uniqueTerms.length >= WILDCARD_TERMS_EXPANSION_LIMIT) {
+                // Many terms: a per-term BooleanQuery approximation would use too much memory, so a single
+                // TermInSetQuery is built instead. To still match every necessary document, at least one token
+                // per term is required; ideally the one that best tells the term apart from the others.
+                final Set<String> scratchTokens = new LinkedHashSet<>();
+                final Set<BytesRef> representatives = new TreeSet<>();
+                for (BytesRef uniqueTerm : uniqueTerms) {
+                    final String escaped = escapeWildcardSyntax(uniqueTerm.utf8ToString());
+                    // Split the escaped term into its tokens.
+                    final boolean matchAll = breakIntoTokens(escaped, scratchTokens);
+                    assert matchAll == false;
+                    if (scratchTokens.isEmpty() == false) {
+                        // The middle token stands in for the whole term, as it is probably the most distinctive one.
+                        representatives.add(getMiddleToken(scratchTokens));
+                    }
+                    // The scratch set is reused for the next term.
+                    scratchTokens.clear();
+                }
+                approximation = new TermInSetQuery(name(), representatives);
+            } else {
+                // Few terms: each term gets its own approximation and the approximations are OR-ed together.
+                final BooleanQuery.Builder disjunction = new BooleanQuery.Builder();
+                for (BytesRef uniqueTerm : uniqueTerms) {
+                    final BooleanQuery.Builder perTerm = new BooleanQuery.Builder();
+                    final String escaped = escapeWildcardSyntax(uniqueTerm.utf8ToString());
+                    final Integer clauseCount = getApproxWildCardQuery(escaped, perTerm);
+                    if (clauseCount == null || clauseCount <= 0) {
+                        // Nothing usable was produced for this term, so it adds no clause.
+                        continue;
+                    }
+                    disjunction.add(perTerm.build(), Occur.SHOULD);
+                }
+                approximation = disjunction.build();
+            }
+            return BinaryDvConfirmedQuery.fromTerms(new ConstantScoreQuery(approximation), name(), uniqueTerms);
+        }
+
+        private static BytesRef getMiddleToken(Set<String> tokens) {
+            int mid = (tokens.size() + 1) / 2;
+            Iterator<String> iterator = tokens.iterator();
+            for (int i = 0; i < mid - 1; i++) {
+                iterator.next();
+            }
+            assert iterator.hasNext();
+            return BytesRefs.toBytesRef(iterator.next());
+        }
+
+        /**
+         * Removes duplicate values and converts each remaining one to a {@link BytesRef}.
+         *
+         * @param values the raw query values
+         * @return one {@link BytesRef} per distinct value, in the iteration order of a {@link HashSet}
+         */
+        private static BytesRef[] buildTerms(Collection<?> values) {
+            final Set<?> distinctValues = new HashSet<>(values);
+            final BytesRef[] converted = new BytesRef[distinctValues.size()];
+            int slot = 0;
+            for (Object distinctValue : distinctValues) {
+                converted[slot++] = BytesRefs.toBytesRef(distinctValue);
+            }
+            return converted;
+        }
+
         private static String escapeWildcardSyntax(String term) {
             StringBuilder result = new StringBuilder();
             for (int i = 0; i < term.length();) {