Skip to content

Commit e31f5ed

Browse files
iverasevaleriy42
authored and committed
Improve execution of terms queries over wildcard fields (elastic#128986)
This commit implements termsQuery(Collection<?> values, @Nullable SearchExecutionContext context) in the WildcardFieldMapper to avoid memory pressure when building the query.
1 parent b48d827 commit e31f5ed

File tree

4 files changed

+223
-0
lines changed

4 files changed

+223
-0
lines changed

docs/changelog/128986.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 128986
2+
summary: Improve execution of terms queries over wildcard fields
3+
area: Search
4+
type: bug
5+
issues:
6+
- 128201

x-pack/plugin/wildcard/build.gradle

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
apply plugin: 'elasticsearch.internal-es-plugin'
99
apply plugin: 'elasticsearch.internal-yaml-rest-test'
10+
apply plugin: 'elasticsearch.internal-cluster-test'
1011

1112
esplugin {
1213
name = 'wildcard'
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License
4+
* 2.0; you may not use this file except in compliance with the Elastic License
5+
* 2.0.
6+
*/
7+
8+
package org.elasticsearch.xpack.wildcard.search;
9+
10+
import org.elasticsearch.action.bulk.BulkRequestBuilder;
11+
import org.elasticsearch.action.index.IndexRequest;
12+
import org.elasticsearch.index.query.TermQueryBuilder;
13+
import org.elasticsearch.index.query.TermsQueryBuilder;
14+
import org.elasticsearch.plugins.Plugin;
15+
import org.elasticsearch.test.ESIntegTestCase;
16+
import org.elasticsearch.xcontent.XContentBuilder;
17+
import org.elasticsearch.xcontent.XContentFactory;
18+
import org.elasticsearch.xcontent.XContentType;
19+
import org.elasticsearch.xpack.wildcard.Wildcard;
20+
import org.hamcrest.Matchers;
21+
import org.junit.Before;
22+
23+
import java.io.IOException;
24+
import java.util.ArrayList;
25+
import java.util.Arrays;
26+
import java.util.Collection;
27+
import java.util.List;
28+
29+
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertResponse;
30+
31+
public class WildcardSearchIT extends ESIntegTestCase {
32+
33+
private List<String> terms = null;
34+
35+
@Override
36+
protected Collection<Class<? extends Plugin>> nodePlugins() {
37+
return List.of(Wildcard.class);
38+
}
39+
40+
@Before
41+
public void setup() throws IOException {
42+
terms = new ArrayList<>();
43+
XContentBuilder xcb = XContentFactory.jsonBuilder()
44+
.startObject()
45+
.startObject("properties")
46+
.startObject("wildcard")
47+
.field("type", "wildcard")
48+
.endObject()
49+
.startObject("keyword")
50+
.field("type", "keyword")
51+
.endObject()
52+
.endObject()
53+
.endObject();
54+
indicesAdmin().prepareCreate("test").setMapping(xcb).get();
55+
final int numDocs = randomIntBetween(100, 1000);
56+
final BulkRequestBuilder builder = client().prepareBulk();
57+
for (int i = 0; i < numDocs; i++) {
58+
if (rarely()) {
59+
indexMultiValue(builder);
60+
} else {
61+
indexSingleValue(builder);
62+
}
63+
}
64+
assertFalse(builder.get().hasFailures());
65+
indicesAdmin().prepareRefresh("test").get();
66+
}
67+
68+
private void indexSingleValue(BulkRequestBuilder builder) {
69+
String term = randomIndexString();
70+
builder.add(
71+
new IndexRequest("test").source("{\"wildcard\" : \"" + term + "\", \"keyword\" : \"" + term + "\"}", XContentType.JSON)
72+
);
73+
terms.add(term);
74+
}
75+
76+
private void indexMultiValue(BulkRequestBuilder builder) {
77+
int docSize = randomIntBetween(1, 10);
78+
String[] docTerms = new String[docSize];
79+
for (int i = 0; i < docSize; i++) {
80+
String term = randomIndexString();
81+
terms.add(term);
82+
docTerms[i] = "\"" + term + "\"";
83+
}
84+
builder.add(
85+
new IndexRequest("test").source(
86+
"{\"wildcard\" : " + Arrays.toString(docTerms) + ", \"keyword\" : " + Arrays.toString(docTerms) + "}",
87+
XContentType.JSON
88+
)
89+
);
90+
}
91+
92+
public void testTermQueryDuel() {
93+
for (int i = 0; i < 50; i++) {
94+
String term = randomQueryString(terms);
95+
TermQueryBuilder termQueryBuilder1 = new TermQueryBuilder("wildcard", term);
96+
TermQueryBuilder termQueryBuilder2 = new TermQueryBuilder("keyword", term);
97+
assertResponse(
98+
client().prepareSearch("test").setQuery(termQueryBuilder1),
99+
response -> assertResponse(
100+
client().prepareSearch("test").setQuery(termQueryBuilder2),
101+
response2 -> assertThat(
102+
response.getHits().getTotalHits().value(),
103+
Matchers.equalTo(response2.getHits().getTotalHits().value())
104+
)
105+
)
106+
);
107+
}
108+
}
109+
110+
public void testTermsQueryDuel() {
111+
for (int i = 0; i < 10; i++) {
112+
String[] terms = new String[randomIntBetween(2, 8192)];
113+
for (int j = 0; j < terms.length; j++) {
114+
terms[j] = randomQueryString(this.terms);
115+
}
116+
TermsQueryBuilder termsQueryBuilder1 = new TermsQueryBuilder("wildcard", terms);
117+
TermsQueryBuilder termsQueryBuilder2 = new TermsQueryBuilder("keyword", terms);
118+
assertResponse(
119+
client().prepareSearch("test").setQuery(termsQueryBuilder1),
120+
response -> assertResponse(
121+
client().prepareSearch("test").setQuery(termsQueryBuilder2),
122+
response2 -> assertThat(
123+
response.getHits().getTotalHits().value(),
124+
Matchers.equalTo(response2.getHits().getTotalHits().value())
125+
)
126+
)
127+
);
128+
}
129+
}
130+
131+
private static String randomIndexString() {
132+
String string = randomAlphaOfLength(randomIntBetween(0, 30));
133+
if (rarely()) {
134+
return string + "*";
135+
} else if (rarely()) {
136+
return "*" + string;
137+
} else if (rarely()) {
138+
return "*" + string + "*";
139+
} else {
140+
return string;
141+
}
142+
}
143+
144+
private static String randomQueryString(List<String> terms) {
145+
if (rarely()) {
146+
return terms.get(randomIntBetween(0, terms.size() - 1));
147+
} else if (randomBoolean()) {
148+
return randomAlphaOfLength(randomIntBetween(0, 30));
149+
} else {
150+
return randomAlphaOfLength(1) + "*";
151+
}
152+
}
153+
}

x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import org.apache.lucene.search.BooleanClause;
2626
import org.apache.lucene.search.BooleanClause.Occur;
2727
import org.apache.lucene.search.BooleanQuery;
28+
import org.apache.lucene.search.ConstantScoreQuery;
2829
import org.apache.lucene.search.FieldExistsQuery;
2930
import org.apache.lucene.search.FuzzyQuery;
3031
import org.apache.lucene.search.MatchAllDocsQuery;
@@ -33,6 +34,7 @@
3334
import org.apache.lucene.search.MultiTermQuery.RewriteMethod;
3435
import org.apache.lucene.search.PrefixQuery;
3536
import org.apache.lucene.search.Query;
37+
import org.apache.lucene.search.TermInSetQuery;
3638
import org.apache.lucene.search.TermQuery;
3739
import org.apache.lucene.search.TermRangeQuery;
3840
import org.apache.lucene.search.WildcardQuery;
@@ -80,11 +82,14 @@
8082
import java.nio.charset.StandardCharsets;
8183
import java.time.ZoneId;
8284
import java.util.ArrayList;
85+
import java.util.Collection;
8386
import java.util.HashSet;
87+
import java.util.Iterator;
8488
import java.util.LinkedHashSet;
8589
import java.util.List;
8690
import java.util.Map;
8791
import java.util.Set;
92+
import java.util.TreeSet;
8893

8994
import static org.elasticsearch.index.IndexSettings.IGNORE_ABOVE_SETTING;
9095

@@ -95,6 +100,7 @@ public class WildcardFieldMapper extends FieldMapper {
95100

96101
public static final String CONTENT_TYPE = "wildcard";
97102
public static final short MAX_CLAUSES_IN_APPROXIMATION_QUERY = 10;
103+
// Threshold on the number of de-duplicated terms in termsQuery: below it every term gets its own
// approximation BooleanQuery; at or above it a single TermInSetQuery is built instead.
private static final int WILDCARD_TERMS_EXPANSION_LIMIT = 16;
98104
public static final int NGRAM_SIZE = 3;
99105

100106
static final NamedAnalyzer WILDCARD_ANALYZER_7_10 = new NamedAnalyzer("_wildcard_7_10", AnalyzerScope.GLOBAL, new Analyzer() {
@@ -859,6 +865,63 @@ public Query termQuery(Object value, SearchExecutionContext context) {
859865
}
860866
}
861867

868+
@Override
869+
public Query termsQuery(Collection<?> values, @Nullable SearchExecutionContext context) {
870+
final BytesRef[] terms = buildTerms(values);
871+
final Query aproxQuery;
872+
if (terms.length < WILDCARD_TERMS_EXPANSION_LIMIT) {
873+
// If there are few terms, we can approximate each term using a BooleanQuery.
874+
final BooleanQuery.Builder builder = new BooleanQuery.Builder();
875+
for (BytesRef term : terms) {
876+
final BooleanQuery.Builder rewritten = new BooleanQuery.Builder();
877+
final Integer numClauses = getApproxWildCardQuery(escapeWildcardSyntax(term.utf8ToString()), rewritten);
878+
if (numClauses != null && numClauses > 0) {
879+
builder.add(rewritten.build(), Occur.SHOULD);
880+
}
881+
}
882+
aproxQuery = builder.build();
883+
} else {
884+
// If there are too many terms, we cannot rewrite approximate into a BooleanQuery as it will use too much memory.
885+
// Instead, we generate a TermInSetQuery. In order to match the necessary documents we need to add at least one token
886+
// per term, ideally we should add the token that makes the term most different from the others.
887+
final Set<String> tokens = new LinkedHashSet<>();
888+
final Set<BytesRef> tokenList = new TreeSet<>();
889+
for (BytesRef term : terms) {
890+
// Break search term into tokens
891+
final boolean matchAll = breakIntoTokens(escapeWildcardSyntax(term.utf8ToString()), tokens);
892+
assert matchAll == false;
893+
if (tokens.isEmpty() == false) {
894+
// If there are tokens, we take the middle one to represent the term
895+
// which is probably the most different one.
896+
tokenList.add(getMiddleToken(tokens));
897+
}
898+
tokens.clear();
899+
}
900+
aproxQuery = new TermInSetQuery(name(), tokenList);
901+
}
902+
return BinaryDvConfirmedQuery.fromTerms(new ConstantScoreQuery(aproxQuery), name(), terms);
903+
}
904+
905+
private static BytesRef getMiddleToken(Set<String> tokens) {
906+
int mid = (tokens.size() + 1) / 2;
907+
Iterator<String> iterator = tokens.iterator();
908+
for (int i = 0; i < mid - 1; i++) {
909+
iterator.next();
910+
}
911+
assert iterator.hasNext();
912+
return BytesRefs.toBytesRef(iterator.next());
913+
}
914+
915+
private static BytesRef[] buildTerms(Collection<?> values) {
916+
final Set<?> dedupe = new HashSet<>(values);
917+
final BytesRef[] terms = new BytesRef[dedupe.size()];
918+
final Iterator<?> iterator = dedupe.iterator();
919+
for (int i = 0; i < dedupe.size(); i++) {
920+
terms[i] = BytesRefs.toBytesRef(iterator.next());
921+
}
922+
return terms;
923+
}
924+
862925
private static String escapeWildcardSyntax(String term) {
863926
StringBuilder result = new StringBuilder();
864927
for (int i = 0; i < term.length();) {

0 commit comments

Comments
 (0)