Skip to content

Commit 66d800b

Browse files
nik9000jfreden
authored and committed
ESQL: text == and text != pushdown (elastic#127355)
Reenables `text ==` pushdown and adds support for `text !=` pushdown. It does so by making `TranslationAware#translatable` return something we can turn into a tri-valued function. It has these values: * `YES` * `NO` * `RECHECK` `YES` means the `Expression` is entirely pushable into Lucene. They will be pushed into Lucene and removed from the plan. `NO` means the `Expression` can't be pushed to Lucene at all and will stay in the plan. `RECHECK` means the `Expression` can push a query that makes *candidate* matches but must be rechecked. Documents that don't match the query won't match the expression, but documents that match the query might not match the expression. These are pushed to Lucene *and* left in the plan. This is required because `txt != "b"` can build a *candidate* query against the `txt.keyword` subfield but it can't be sure of the match without loading the `_source` - which we do in the compute engine. I haven't plugged rally into this, but here are some basic performance tests: ``` Before: not text eq {"took":460,"documents_found":1000000} text eq {"took":432,"documents_found":1000000} After: text eq {"took":5,"documents_found":1} not text eq {"took":351,"documents_found":800000} ``` This comes from: ``` rm -f /tmp/bulk* for a in {1..1000}; do echo '{"index":{}}' >> /tmp/bulk echo '{"text":"text '$(printf $(($a % 5)))'"}' >> /tmp/bulk done ls -l /tmp/bulk* passwd="redacted" curl -sk -uelastic:$passwd -HContent-Type:application/json -XDELETE https://localhost:9200/test curl -sk -uelastic:$passwd -HContent-Type:application/json -XPUT https://localhost:9200/test -d'{ "settings": { "index.codec": "best_compression", "index.refresh_interval": -1 }, "mappings": { "properties": { "many": { "enabled": false } } } }' for a in {1..1000}; do printf %04d: $a curl -sk -uelastic:$passwd -HContent-Type:application/json -XPOST https://localhost:9200/test/_bulk?pretty --data-binary @/tmp/bulk | grep errors done curl -sk -uelastic:$passwd -HContent-Type:application/json 
-XPOST https://localhost:9200/test/_forcemerge?max_num_segments=1 curl -sk -uelastic:$passwd -HContent-Type:application/json -XPOST https://localhost:9200/test/_refresh echo curl -sk -uelastic:$passwd https://localhost:9200/_cat/indices?v text_eq() { echo -n " text eq " curl -sk -uelastic:$passwd -HContent-Type:application/json -XPOST 'https://localhost:9200/_query?pretty' -d'{ "query": "FROM test | WHERE text == \"text 1\" | STATS COUNT(*)", "pragma": { "data_partitioning": "shard" } }' | jq -c '{took, documents_found}' } not_text_eq() { echo -n "not text eq " curl -sk -uelastic:$passwd -HContent-Type:application/json -XPOST 'https://localhost:9200/_query?pretty' -d'{ "query": "FROM test | WHERE NOT text == \"text 1\" | STATS COUNT(*)", "pragma": { "data_partitioning": "shard" } }' | jq -c '{took, documents_found}' } for a in {1..100}; do text_eq not_text_eq done ```
1 parent a6e1d52 commit 66d800b

File tree

30 files changed

+559
-123
lines changed

30 files changed

+559
-123
lines changed

docs/changelog/127355.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 127355
2+
summary: '`text ==` and `text !=` pushdown'
3+
area: ES|QL
4+
type: enhancement
5+
issues: []

x-pack/plugin/esql/qa/server/single-node/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/single_node/PushQueriesIT.java

Lines changed: 168 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,12 @@
77

88
package org.elasticsearch.xpack.esql.qa.single_node;
99

10+
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
1011
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters;
1112

1213
import org.elasticsearch.client.Request;
1314
import org.elasticsearch.client.Response;
15+
import org.elasticsearch.client.ResponseException;
1416
import org.elasticsearch.test.ListMatcher;
1517
import org.elasticsearch.test.MapMatcher;
1618
import org.elasticsearch.test.TestClustersThreadFilter;
@@ -27,6 +29,7 @@
2729
import java.util.Locale;
2830
import java.util.Map;
2931
import java.util.regex.Pattern;
32+
import java.util.stream.Stream;
3033

3134
import static org.elasticsearch.test.ListMatcher.matchesList;
3235
import static org.elasticsearch.test.MapMatcher.assertMap;
@@ -48,50 +51,161 @@ public class PushQueriesIT extends ESRestTestCase {
4851
@ClassRule
4952
public static ElasticsearchCluster cluster = Clusters.testCluster();
5053

51-
public void testPushEqualityOnDefaults() throws IOException {
54+
@ParametersFactory(argumentFormatting = "%1s")
55+
public static List<Object[]> args() {
56+
return Stream.of("auto", "text", "match_only_text", "semantic_text").map(s -> new Object[] { s }).toList();
57+
}
58+
59+
private final String type;
60+
61+
public PushQueriesIT(String type) {
62+
this.type = type;
63+
}
64+
65+
public void testEquality() throws IOException {
5266
String value = "v".repeat(between(0, 256));
53-
testPushQuery(value, """
67+
String esqlQuery = """
5468
FROM test
5569
| WHERE test == "%value"
56-
""", "*:*", true, true);
70+
""";
71+
String luceneQuery = switch (type) {
72+
case "text", "auto" -> "#test.keyword:%value -_ignored:test.keyword";
73+
case "match_only_text" -> "*:*";
74+
case "semantic_text" -> "FieldExistsQuery [field=_primary_term]";
75+
default -> throw new UnsupportedOperationException("unknown type [" + type + "]");
76+
};
77+
boolean filterInCompute = switch (type) {
78+
case "text", "auto" -> false;
79+
case "match_only_text", "semantic_text" -> true;
80+
default -> throw new UnsupportedOperationException("unknown type [" + type + "]");
81+
};
82+
testPushQuery(value, esqlQuery, luceneQuery, filterInCompute, true);
5783
}
5884

59-
public void testPushEqualityOnDefaultsTooBigToPush() throws IOException {
85+
public void testEqualityTooBigToPush() throws IOException {
6086
String value = "a".repeat(between(257, 1000));
61-
testPushQuery(value, """
87+
String esqlQuery = """
6288
FROM test
6389
| WHERE test == "%value"
64-
""", "*:*", true, true);
90+
""";
91+
String luceneQuery = switch (type) {
92+
case "text", "auto", "match_only_text" -> "*:*";
93+
case "semantic_text" -> "FieldExistsQuery [field=_primary_term]";
94+
default -> throw new UnsupportedOperationException("unknown type [" + type + "]");
95+
};
96+
testPushQuery(value, esqlQuery, luceneQuery, true, true);
97+
}
98+
99+
/**
100+
* Turns into an {@code IN} which isn't currently pushed.
101+
*/
102+
public void testEqualityOrTooBig() throws IOException {
103+
String value = "v".repeat(between(0, 256));
104+
String tooBig = "a".repeat(between(257, 1000));
105+
String esqlQuery = """
106+
FROM test
107+
| WHERE test == "%value" OR test == "%tooBig"
108+
""".replace("%tooBig", tooBig);
109+
String luceneQuery = switch (type) {
110+
case "text", "auto", "match_only_text" -> "*:*";
111+
case "semantic_text" -> "FieldExistsQuery [field=_primary_term]";
112+
default -> throw new UnsupportedOperationException("unknown type [" + type + "]");
113+
};
114+
testPushQuery(value, esqlQuery, luceneQuery, true, true);
115+
}
116+
117+
public void testEqualityOrOther() throws IOException {
118+
String value = "v".repeat(between(0, 256));
119+
String esqlQuery = """
120+
FROM test
121+
| WHERE test == "%value" OR foo == 2
122+
""";
123+
String luceneQuery = switch (type) {
124+
case "text", "auto" -> "(#test.keyword:%value -_ignored:test.keyword) foo:[2 TO 2]";
125+
case "match_only_text" -> "*:*";
126+
case "semantic_text" -> "FieldExistsQuery [field=_primary_term]";
127+
default -> throw new UnsupportedOperationException("unknown type [" + type + "]");
128+
};
129+
boolean filterInCompute = switch (type) {
130+
case "text", "auto" -> false;
131+
case "match_only_text", "semantic_text" -> true;
132+
default -> throw new UnsupportedOperationException("unknown type [" + type + "]");
133+
};
134+
testPushQuery(value, esqlQuery, luceneQuery, filterInCompute, true);
65135
}
66136

67-
public void testPushInequalityOnDefaults() throws IOException {
137+
public void testEqualityAndOther() throws IOException {
68138
String value = "v".repeat(between(0, 256));
69-
testPushQuery(value, """
139+
String esqlQuery = """
140+
FROM test
141+
| WHERE test == "%value" AND foo == 1
142+
""";
143+
String luceneQuery = switch (type) {
144+
case "text", "auto" -> "#test.keyword:%value -_ignored:test.keyword #foo:[1 TO 1]";
145+
case "match_only_text" -> "foo:[1 TO 1]";
146+
case "semantic_text" ->
147+
/*
148+
* single_value_match is here because there are extra documents hiding in the index
149+
* that don't have the `foo` field.
150+
*/
151+
"#foo:[1 TO 1] #single_value_match(foo)";
152+
default -> throw new UnsupportedOperationException("unknown type [" + type + "]");
153+
};
154+
boolean filterInCompute = switch (type) {
155+
case "text", "auto" -> false;
156+
case "match_only_text", "semantic_text" -> true;
157+
default -> throw new UnsupportedOperationException("unknown type [" + type + "]");
158+
};
159+
testPushQuery(value, esqlQuery, luceneQuery, filterInCompute, true);
160+
}
161+
162+
public void testInequality() throws IOException {
163+
String value = "v".repeat(between(0, 256));
164+
String esqlQuery = """
70165
FROM test
71166
| WHERE test != "%different_value"
72-
""", "*:*", true, true);
167+
""";
168+
String luceneQuery = switch (type) {
169+
case "text", "auto" -> "(-test.keyword:%different_value #*:*) _ignored:test.keyword";
170+
case "match_only_text" -> "*:*";
171+
case "semantic_text" -> "FieldExistsQuery [field=_primary_term]";
172+
default -> throw new UnsupportedOperationException("unknown type [" + type + "]");
173+
};
174+
testPushQuery(value, esqlQuery, luceneQuery, true, true);
73175
}
74176

75-
public void testPushInequalityOnDefaultsTooBigToPush() throws IOException {
177+
public void testInequalityTooBigToPush() throws IOException {
76178
String value = "a".repeat(between(257, 1000));
77-
testPushQuery(value, """
179+
String esqlQuery = """
78180
FROM test
79181
| WHERE test != "%value"
80-
""", "*:*", true, false);
182+
""";
183+
String luceneQuery = switch (type) {
184+
case "text", "auto", "match_only_text" -> "*:*";
185+
case "semantic_text" -> "FieldExistsQuery [field=_primary_term]";
186+
default -> throw new UnsupportedOperationException("unknown type [" + type + "]");
187+
};
188+
testPushQuery(value, esqlQuery, luceneQuery, true, false);
81189
}
82190

83-
public void testPushCaseInsensitiveEqualityOnDefaults() throws IOException {
191+
public void testCaseInsensitiveEquality() throws IOException {
84192
String value = "a".repeat(between(0, 256));
85-
testPushQuery(value, """
193+
String esqlQuery = """
86194
FROM test
87195
| WHERE TO_LOWER(test) == "%value"
88-
""", "*:*", true, true);
196+
""";
197+
String luceneQuery = switch (type) {
198+
case "text", "auto", "match_only_text" -> "*:*";
199+
case "semantic_text" -> "FieldExistsQuery [field=_primary_term]";
200+
default -> throw new UnsupportedOperationException("unknown type [" + type + "]");
201+
};
202+
testPushQuery(value, esqlQuery, luceneQuery, true, true);
89203
}
90204

91205
private void testPushQuery(String value, String esqlQuery, String luceneQuery, boolean filterInCompute, boolean found)
92206
throws IOException {
93207
indexValue(value);
94-
String differentValue = randomValueOtherThan(value, () -> randomAlphaOfLength(value.length() == 0 ? 1 : value.length()));
208+
String differentValue = randomValueOtherThan(value, () -> randomAlphaOfLength(value.isEmpty() ? 1 : value.length()));
95209

96210
String replacedQuery = esqlQuery.replaceAll("%value", value).replaceAll("%different_value", differentValue);
97211
RestEsqlTestCase.RequestObjectBuilder builder = requestObjectBuilder().query(replacedQuery + "\n| KEEP test");
@@ -148,15 +262,43 @@ private void testPushQuery(String value, String esqlQuery, String luceneQuery, b
148262
}
149263

150264
private void indexValue(String value) throws IOException {
265+
try {
266+
// Delete the index if it has already been created.
267+
client().performRequest(new Request("DELETE", "test"));
268+
} catch (ResponseException e) {
269+
if (e.getResponse().getStatusLine().getStatusCode() != 404) {
270+
throw e;
271+
}
272+
}
273+
151274
Request createIndex = new Request("PUT", "test");
152-
createIndex.setJsonEntity("""
275+
String json = """
153276
{
154277
"settings": {
155278
"index": {
156279
"number_of_shards": 1
157280
}
158-
}
159-
}""");
281+
}""";
282+
if (false == "auto".equals(type)) {
283+
json += """
284+
,
285+
"mappings": {
286+
"properties": {
287+
"test": {
288+
"type": "%type",
289+
"fields": {
290+
"keyword": {
291+
"type": "keyword",
292+
"ignore_above": 256
293+
}
294+
}
295+
}
296+
}
297+
}
298+
}""".replace("%type", type);
299+
}
300+
json += "}";
301+
createIndex.setJsonEntity(json);
160302
Response createResponse = client().performRequest(createIndex);
161303
assertThat(
162304
entityToMap(createResponse.getEntity(), XContentType.JSON),
@@ -167,7 +309,7 @@ private void indexValue(String value) throws IOException {
167309
bulk.addParameter("refresh", "");
168310
bulk.setJsonEntity(String.format(Locale.ROOT, """
169311
{"create":{"_index":"test"}}
170-
{"test":"%s"}
312+
{"test":"%s","foo":1}
171313
""", value));
172314
Response bulkResponse = client().performRequest(bulk);
173315
assertThat(entityToMap(bulkResponse.getEntity(), XContentType.JSON), matchesMap().entry("errors", false).extraOk());
@@ -190,4 +332,10 @@ private static String checkOperatorProfile(Map<String, Object> o, String query)
190332
protected String getTestRestCluster() {
191333
return cluster.getHttpAddresses();
192334
}
335+
336+
@Override
337+
protected boolean preserveClusterUponCompletion() {
338+
// Preserve the cluster to speed up the semantic_text tests
339+
return true;
340+
}
193341
}

x-pack/plugin/esql/qa/testFixtures/src/main/resources/data/mv_text.csv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@
33
2023-10-23T13:55:01.544Z,Connected to 10.1.0.1
44
2023-10-23T13:55:01.545Z,[Connected to 10.1.0.1, More than one hundred characters long so it isn't indexed by the sub keyword field with ignore_above:100]
55
2023-10-23T13:55:01.546Z,More than one hundred characters long so it isn't indexed by the sub keyword field with ignore_above:100
6+
2023-10-23T13:55:01.547Z,[More than one hundred characters long so it isn't indexed by the sub keyword field with ignore_above:100,Second than one hundred characters long so it isn't indexed by the sub keyword field with ignore_above:100]

0 commit comments

Comments
 (0)