From 2ec253406a0a6db85ebe5ea521e2626522b827af Mon Sep 17 00:00:00 2001 From: Ignacio Vera Date: Thu, 5 Jun 2025 14:46:40 +0200 Subject: [PATCH 1/2] Improve execution of terms queries over wildcard fields --- x-pack/plugin/wildcard/build.gradle | 1 + .../wildcard/search/WildcardSearchIT.java | 153 ++++++++++++++++++ .../wildcard/mapper/WildcardFieldMapper.java | 63 ++++++++ 3 files changed, 217 insertions(+) create mode 100644 x-pack/plugin/wildcard/src/internalClusterTest/java/org/elasticsearch/xpack/wildcard/search/WildcardSearchIT.java diff --git a/x-pack/plugin/wildcard/build.gradle b/x-pack/plugin/wildcard/build.gradle index d87a461eaf858..a78bc548d7275 100644 --- a/x-pack/plugin/wildcard/build.gradle +++ b/x-pack/plugin/wildcard/build.gradle @@ -7,6 +7,7 @@ apply plugin: 'elasticsearch.internal-es-plugin' apply plugin: 'elasticsearch.internal-yaml-rest-test' +apply plugin: 'elasticsearch.internal-cluster-test' esplugin { name = 'wildcard' diff --git a/x-pack/plugin/wildcard/src/internalClusterTest/java/org/elasticsearch/xpack/wildcard/search/WildcardSearchIT.java b/x-pack/plugin/wildcard/src/internalClusterTest/java/org/elasticsearch/xpack/wildcard/search/WildcardSearchIT.java new file mode 100644 index 0000000000000..832f38d404ec3 --- /dev/null +++ b/x-pack/plugin/wildcard/src/internalClusterTest/java/org/elasticsearch/xpack/wildcard/search/WildcardSearchIT.java @@ -0,0 +1,153 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */
+
+package org.elasticsearch.xpack.wildcard.search;
+
+import org.elasticsearch.action.bulk.BulkRequestBuilder;
+import org.elasticsearch.action.index.IndexRequest;
+import org.elasticsearch.index.query.QueryBuilder;
+import org.elasticsearch.index.query.TermQueryBuilder;
+import org.elasticsearch.index.query.TermsQueryBuilder;
+import org.elasticsearch.plugins.Plugin;
+import org.elasticsearch.test.ESIntegTestCase;
+import org.elasticsearch.xcontent.XContentBuilder;
+import org.elasticsearch.xcontent.XContentFactory;
+import org.elasticsearch.xcontent.XContentType;
+import org.elasticsearch.xpack.wildcard.Wildcard;
+import org.hamcrest.Matchers;
+import org.junit.Before;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+
+import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertResponse;
+
+/** Duels term and terms queries on a {@code wildcard} field against a {@code keyword} field holding the same values. */
+public class WildcardSearchIT extends ESIntegTestCase {
+
+    /** Every value indexed by {@link #setup()}; sampled to build queries that can match. */
+    private List<String> indexedTerms = null;
+
+    @Override
+    protected Collection<Class<? extends Plugin>> nodePlugins() {
+        return List.of(Wildcard.class);
+    }
+
+    @Before
+    public void setup() throws IOException {
+        indexedTerms = new ArrayList<>();
+        XContentBuilder xcb = XContentFactory.jsonBuilder()
+            .startObject()
+            .startObject("properties")
+            .startObject("wildcard")
+            .field("type", "wildcard")
+            .endObject()
+            .startObject("keyword")
+            .field("type", "keyword")
+            .endObject()
+            .endObject()
+            .endObject();
+        indicesAdmin().prepareCreate("test").setMapping(xcb).get();
+        final int numDocs = randomIntBetween(100, 1000);
+        final BulkRequestBuilder builder = client().prepareBulk();
+        for (int i = 0; i < numDocs; i++) {
+            if (rarely()) {
+                indexMultiValue(builder);
+            } else {
+                indexSingleValue(builder);
+            }
+        }
+        assertFalse(builder.get().hasFailures());
+        indicesAdmin().prepareRefresh("test").get();
+    }
+
+    /** Adds a document whose {@code wildcard} and {@code keyword} fields hold the same single random value. */
+    private void indexSingleValue(BulkRequestBuilder builder) {
+        String term = randomIndexString();
+        builder.add(
+            new IndexRequest("test").source("{\"wildcard\" : \"" + term + "\", \"keyword\" : \"" + term + "\"}", XContentType.JSON)
+        );
+        indexedTerms.add(term);
+    }
+
+    /** Adds a document whose {@code wildcard} and {@code keyword} fields hold the same array of 1 to 10 random values. */
+    private void indexMultiValue(BulkRequestBuilder builder) {
+        int docSize = randomIntBetween(1, 10);
+        String[] docTerms = new String[docSize];
+        for (int i = 0; i < docSize; i++) {
+            String term = randomIndexString();
+            indexedTerms.add(term);
+            docTerms[i] = "\"" + term + "\"";
+        }
+        builder.add(
+            new IndexRequest("test").source(
+                "{\"wildcard\" : " + Arrays.toString(docTerms) + ", \"keyword\" : " + Arrays.toString(docTerms) + "}",
+                XContentType.JSON
+            )
+        );
+    }
+
+    public void testTermQueryDuel() {
+        for (int i = 0; i < 50; i++) {
+            String term = randomQueryString(indexedTerms);
+            assertSameHitCount(new TermQueryBuilder("wildcard", term), new TermQueryBuilder("keyword", term));
+        }
+    }
+
+    public void testTermsQueryDuel() {
+        for (int i = 0; i < 10; i++) {
+            // Straddle the 16-term limit at which the wildcard field type switches its terms query execution strategy.
+            String[] queryTerms = new String[randomBoolean() ? randomIntBetween(2, 15) : randomIntBetween(16, 8192)];
+            for (int j = 0; j < queryTerms.length; j++) {
+                queryTerms[j] = randomQueryString(indexedTerms);
+            }
+            assertSameHitCount(new TermsQueryBuilder("wildcard", queryTerms), new TermsQueryBuilder("keyword", queryTerms));
+        }
+    }
+
+    /** Runs both queries against the {@code test} index and asserts that they report the same total hit count. */
+    private void assertSameHitCount(QueryBuilder wildcardQuery, QueryBuilder keywordQuery) {
+        assertResponse(
+            client().prepareSearch("test").setQuery(wildcardQuery),
+            wildcardResponse -> assertResponse(
+                client().prepareSearch("test").setQuery(keywordQuery),
+                keywordResponse -> assertThat(
+                    wildcardResponse.getHits().getTotalHits().value(),
+                    Matchers.equalTo(keywordResponse.getHits().getTotalHits().value())
+                )
+            )
+        );
+    }
+
+    /** Returns a random alphabetic value of up to 30 characters, occasionally with a literal {@code *} at either or both ends. */
+    private static String randomIndexString() {
+        String string = randomAlphaOfLength(randomIntBetween(0, 30));
+        if (rarely()) {
+            return string + "*";
+        } else if (rarely()) {
+            return "*" + string;
+        } else if (rarely()) {
+            return "*" + string + "*";
+        } else {
+            return string;
+        }
+    }
+
+    /** Returns, rarely, one of the given indexed terms; otherwise a random string or a single letter followed by {@code *}. */
+    private static String randomQueryString(List<String> terms) {
+        if (rarely()) {
+            return terms.get(randomIntBetween(0, terms.size() - 1));
+        } else if (randomBoolean()) {
+            return randomAlphaOfLength(randomIntBetween(0, 30));
+        } else {
+            return randomAlphaOfLength(1) + "*";
+        }
+    }
+}
diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java
index 393938637cde5..3aa43fa925747 100644
--- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java
+++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java
@@ -25,6 +25,7 @@
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.ConstantScoreQuery;
 import org.apache.lucene.search.FieldExistsQuery;
 import org.apache.lucene.search.FuzzyQuery;
 import org.apache.lucene.search.MatchAllDocsQuery;
@@ -33,6 +34,7 @@
 import org.apache.lucene.search.MultiTermQuery.RewriteMethod;
 import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermInSetQuery;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TermRangeQuery;
 import org.apache.lucene.search.WildcardQuery;
@@ -80,11 +82,14 @@
 import java.nio.charset.StandardCharsets;
 import java.time.ZoneId;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.TreeSet;
 
 import static
org.elasticsearch.index.IndexSettings.IGNORE_ABOVE_SETTING;
 
@@ -95,6 +100,7 @@ public class WildcardFieldMapper extends FieldMapper {
 
     public static final String CONTENT_TYPE = "wildcard";
     public static final short MAX_CLAUSES_IN_APPROXIMATION_QUERY = 10;
+    private static final int WILDCARD_TERMS_EXPANSION_LIMIT = 16;
     public static final int NGRAM_SIZE = 3;
 
     static final NamedAnalyzer WILDCARD_ANALYZER_7_10 = new NamedAnalyzer("_wildcard_7_10", AnalyzerScope.GLOBAL, new Analyzer() {
@@ -859,6 +865,79 @@ public Query termQuery(Object value, SearchExecutionContext context) {
             }
         }
 
+        /**
+         * Builds a terms query from a cheap approximation of the terms, which is handed to
+         * {@code BinaryDvConfirmedQuery.fromTerms} together with the exact terms.
+         * <p>
+         * With fewer than {@code WILDCARD_TERMS_EXPANSION_LIMIT} distinct terms, each term contributes its own approximation
+         * as an optional clause of a boolean query. With more terms that would use too much memory, so a single
+         * {@link TermInSetQuery} holding one token per term is used instead.
+         *
+         * @param values the terms to match; duplicates are removed
+         * @param context the search execution context; not read by this method
+         * @return the query produced by {@code BinaryDvConfirmedQuery.fromTerms}
+         */
+        @Override
+        public Query termsQuery(Collection<?> values, @Nullable SearchExecutionContext context) {
+            final BytesRef[] terms = buildTerms(values);
+            final Query approxQuery;
+            // NOTE(review): in both branches a term that yields no clause or token is left out of the approximation, so documents
+            // holding only that term would never be selected; presumably escaped literal terms always yield one - confirm.
+            if (terms.length < WILDCARD_TERMS_EXPANSION_LIMIT) {
+                // If there are few terms, we can approximate each term using a BooleanQuery.
+                final BooleanQuery.Builder builder = new BooleanQuery.Builder();
+                for (BytesRef term : terms) {
+                    final BooleanQuery.Builder rewritten = new BooleanQuery.Builder();
+                    final Integer numClauses = getApproxWildCardQuery(escapeWildcardSyntax(term.utf8ToString()), rewritten);
+                    if (numClauses != null && numClauses > 0) {
+                        builder.add(rewritten.build(), Occur.SHOULD);
+                    }
+                }
+                approxQuery = builder.build();
+            } else {
+                // If there are too many terms, we cannot rewrite the approximation into a BooleanQuery as it would use too much
+                // memory. Instead, we generate a TermInSetQuery. In order to match the necessary documents we need to add at least
+                // one token per term; ideally the token that makes the term most different from the others.
+                final Set<String> tokens = new LinkedHashSet<>();
+                final Set<BytesRef> representativeTokens = new TreeSet<>();
+                for (BytesRef term : terms) {
+                    // Break search term into tokens
+                    final boolean matchAll = breakIntoTokens(escapeWildcardSyntax(term.utf8ToString()), tokens);
+                    assert matchAll == false;
+                    if (tokens.isEmpty() == false) {
+                        // If there are tokens, we take the middle one to represent the term
+                        // which is probably the most different one.
+                        representativeTokens.add(getMiddleToken(tokens));
+                    }
+                    tokens.clear();
+                }
+                approxQuery = new TermInSetQuery(name(), representativeTokens);
+            }
+            return BinaryDvConfirmedQuery.fromTerms(new ConstantScoreQuery(approxQuery), name(), terms);
+        }
+
+        /** Returns the token in the middle of the iteration order of {@code tokens} (the first of the two for an even size). */
+        private static BytesRef getMiddleToken(Set<String> tokens) {
+            int mid = (tokens.size() + 1) / 2;
+            Iterator<String> iterator = tokens.iterator();
+            for (int i = 0; i < mid - 1; i++) {
+                iterator.next();
+            }
+            assert iterator.hasNext();
+            return BytesRefs.toBytesRef(iterator.next());
+        }
+
+        /** Converts the distinct elements of {@code values} into {@link BytesRef}s. */
+        private static BytesRef[] buildTerms(Collection<?> values) {
+            final Set<?> dedupe = new HashSet<>(values);
+            final BytesRef[] terms = new BytesRef[dedupe.size()];
+            int i = 0;
+            for (Object value : dedupe) {
+                terms[i++] = BytesRefs.toBytesRef(value);
+            }
+            return terms;
+        }
+
         private static String escapeWildcardSyntax(String term) {
             StringBuilder result = new StringBuilder();
             for (int i = 0; i < term.length();) {

From c7b8d23f9368a9e2d08c2aa9bd901ce08c0e42ab Mon Sep 17 00:00:00 2001
From: Ignacio Vera
Date: Thu, 5 Jun 2025 14:57:44 +0200
Subject: [PATCH 2/2] Update docs/changelog/128986.yaml

---
 docs/changelog/128986.yaml | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 docs/changelog/128986.yaml

diff --git a/docs/changelog/128986.yaml b/docs/changelog/128986.yaml
new file mode 100644
index 0000000000000..3b3e734536817
--- /dev/null
+++ b/docs/changelog/128986.yaml
@@ -0,0 +1,6 @@
+pr: 128986
+summary: Improve execution of terms queries over wildcard fields
+area: Search
+type: bug
+issues:
+ - 128201