Skip to content

Commit 8514c0d

Browse files
Optimized text for full unicode and some escape sequences (#129169) (#129360)
Follow-up to #126492 to apply the JSON parsing optimization to strings containing Unicode characters and some backslash-escaped characters. Supporting backslash-escaped strings is tricky because it requires modifying the string. There are two types of modification: some escapes only need the backslash removed (e.g. \", \\, \/), while others replace the whole escape sequence with a different character (e.g. \n, \r, \u00e5). In this implementation, the optimization supports only the first case — removing the backslash. This is done by making a copy of the data that skips the backslashes. It should still be cheaper than full String decoding, but it won't be as fast as for strings without backslashes, where we can reference the input bytes directly. Relates to #129072.
1 parent d7bcda3 commit 8514c0d

File tree

2 files changed

+168
-43
lines changed

2 files changed

+168
-43
lines changed

libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParser.java

Lines changed: 67 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,14 @@
2121

2222
import java.io.IOException;
2323
import java.io.InputStream;
24+
import java.util.ArrayList;
25+
import java.util.List;
2426

2527
public class ESUTF8StreamJsonParser extends UTF8StreamJsonParser {
2628
protected int stringEnd = -1;
29+
protected int stringLength;
30+
31+
private final List<Integer> backslashes = new ArrayList<>();
2732

2833
public ESUTF8StreamJsonParser(
2934
IOContext ctxt,
@@ -43,15 +48,12 @@ public ESUTF8StreamJsonParser(
4348
/**
4449
* Method that will try to get underlying UTF-8 encoded bytes of the current string token.
4550
* This is only a best-effort attempt; if there is some reason the bytes cannot be retrieved, this method will return null.
46-
* Currently, this is only implemented for ascii-only strings that do not contain escaped characters.
4751
*/
4852
public Text getValueAsText() throws IOException {
4953
if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete) {
5054
if (stringEnd > 0) {
5155
final int len = stringEnd - 1 - _inputPtr;
52-
// For now, we can use `len` for `stringLength` because we only support ascii-encoded unescaped strings,
53-
// which means each character uses exactly 1 byte.
54-
return new Text(new XContentString.UTF8Bytes(_inputBuffer, _inputPtr, len), len);
56+
return new Text(new XContentString.UTF8Bytes(_inputBuffer, _inputPtr, len), stringLength);
5557
}
5658
return _finishAndReturnText();
5759
}
@@ -69,21 +71,71 @@ protected Text _finishAndReturnText() throws IOException {
6971
final int[] codes = INPUT_CODES_UTF8;
7072
final int max = _inputEnd;
7173
final byte[] inputBuffer = _inputBuffer;
72-
while (ptr < max) {
74+
stringLength = 0;
75+
backslashes.clear();
76+
77+
loop: while (true) {
78+
if (ptr >= max) {
79+
return null;
80+
}
7381
int c = inputBuffer[ptr] & 0xFF;
74-
if (codes[c] != 0) {
75-
if (c == INT_QUOTE) {
76-
stringEnd = ptr + 1;
77-
final int len = ptr - startPtr;
78-
// For now, we can use `len` for `stringLength` because we only support ascii-encoded unescaped strings,
79-
// which means each character uses exactly 1 byte.
80-
return new Text(new XContentString.UTF8Bytes(inputBuffer, startPtr, len), len);
82+
switch (codes[c]) {
83+
case 0 -> {
84+
++ptr;
85+
++stringLength;
86+
}
87+
case 1 -> {
88+
if (c == INT_QUOTE) {
89+
// End of the string
90+
break loop;
91+
}
92+
assert c == INT_BACKSLASH;
93+
backslashes.add(ptr);
94+
++ptr;
95+
if (ptr >= max) {
96+
// Backslash at end of file
97+
return null;
98+
}
99+
c = inputBuffer[ptr] & 0xFF;
100+
if (c == '"' || c == '/' || c == '\\') {
101+
ptr += 1;
102+
stringLength += 1;
103+
} else {
104+
// Any other escaped sequence requires replacing the sequence with
105+
// a new character, which we don't support in the optimized path
106+
return null;
107+
}
108+
}
109+
case 2, 3, 4 -> {
110+
int bytesToSkip = codes[c];
111+
if (ptr + bytesToSkip > max) {
112+
return null;
113+
}
114+
ptr += bytesToSkip;
115+
++stringLength;
116+
}
117+
default -> {
118+
return null;
81119
}
82-
return null;
83120
}
84-
++ptr;
85121
}
86-
return null;
122+
123+
stringEnd = ptr + 1;
124+
if (backslashes.isEmpty()) {
125+
return new Text(new XContentString.UTF8Bytes(inputBuffer, startPtr, ptr - startPtr), stringLength);
126+
} else {
127+
byte[] buff = new byte[ptr - startPtr - backslashes.size()];
128+
int copyPtr = startPtr;
129+
int destPtr = 0;
130+
for (Integer backslash : backslashes) {
131+
int length = backslash - copyPtr;
132+
System.arraycopy(inputBuffer, copyPtr, buff, destPtr, length);
133+
destPtr += length;
134+
copyPtr = backslash + 1;
135+
}
136+
System.arraycopy(inputBuffer, copyPtr, buff, destPtr, ptr - copyPtr);
137+
return new Text(new XContentString.UTF8Bytes(buff), stringLength);
138+
}
87139
}
88140

89141
@Override

libs/x-content/impl/src/test/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParserTests.java

Lines changed: 101 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,14 @@
1313
import com.fasterxml.jackson.core.JsonParser;
1414
import com.fasterxml.jackson.core.JsonToken;
1515

16-
import org.elasticsearch.common.Strings;
1716
import org.elasticsearch.core.CheckedConsumer;
1817
import org.elasticsearch.test.ESTestCase;
19-
import org.elasticsearch.xcontent.XContentBuilder;
2018
import org.elasticsearch.xcontent.XContentString;
21-
import org.elasticsearch.xcontent.json.JsonXContent;
2219
import org.hamcrest.Matchers;
2320

2421
import java.io.IOException;
2522
import java.nio.charset.StandardCharsets;
23+
import java.util.Locale;
2624

2725
public class ESUTF8StreamJsonParserTests extends ESTestCase {
2826

@@ -45,11 +43,13 @@ public void testGetValueAsText() throws IOException {
4543
assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
4644
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
4745

48-
var textRef = parser.getValueAsText().bytes();
49-
assertThat(textRef, Matchers.notNullValue());
50-
assertThat(textRef.offset(), Matchers.equalTo(9));
51-
assertThat(textRef.offset() + textRef.length(), Matchers.equalTo(12));
52-
assertTextRef(textRef, "bar");
46+
var text = parser.getValueAsText();
47+
assertThat(text, Matchers.notNullValue());
48+
49+
var bytes = text.bytes();
50+
assertThat(bytes.offset(), Matchers.equalTo(9));
51+
assertThat(bytes.offset() + bytes.length(), Matchers.equalTo(12));
52+
assertTextRef(bytes, "bar");
5353

5454
assertThat(parser.getValueAsString(), Matchers.equalTo("bar"));
5555
assertThat(parser.getValueAsText(), Matchers.nullValue());
@@ -62,17 +62,36 @@ public void testGetValueAsText() throws IOException {
6262
assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
6363
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
6464

65+
var text = parser.getValueAsText();
66+
assertThat(text, Matchers.notNullValue());
67+
assertTextRef(text.bytes(), "bar\"baz\"");
68+
});
69+
70+
testParseJson("{\"foo\": \"b\\u00e5r\"}", parser -> {
71+
assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT));
72+
assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
73+
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
74+
6575
assertThat(parser.getValueAsText(), Matchers.nullValue());
66-
assertThat(parser.getValueAsString(), Matchers.equalTo("bar\"baz\""));
76+
assertThat(parser.getValueAsString(), Matchers.equalTo("bår"));
6777
});
6878

6979
testParseJson("{\"foo\": \"bår\"}", parser -> {
7080
assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT));
7181
assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
7282
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
7383

74-
assertThat(parser.getValueAsText(), Matchers.nullValue());
84+
var text = parser.getValueAsText();
85+
assertThat(text, Matchers.notNullValue());
86+
87+
var bytes = text.bytes();
88+
assertThat(bytes.offset(), Matchers.equalTo(9));
89+
assertThat(bytes.offset() + bytes.length(), Matchers.equalTo(13));
90+
assertTextRef(bytes, "bår");
91+
7592
assertThat(parser.getValueAsString(), Matchers.equalTo("bår"));
93+
94+
assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.END_OBJECT));
7695
});
7796

7897
testParseJson("{\"foo\": [\"lorem\", \"ipsum\", \"dolor\"]}", parser -> {
@@ -112,43 +131,97 @@ public void testGetValueAsText() throws IOException {
112131
});
113132
}
114133

115-
private boolean validForTextRef(String value) {
116-
for (char c : value.toCharArray()) {
117-
if (c == '"') {
118-
return false;
134+
private record TestInput(String input, String result, boolean supportsOptimized) {}
135+
136+
private static final TestInput[] ESCAPE_SEQUENCES = {
137+
new TestInput("\\b", "\b", false),
138+
new TestInput("\\t", "\t", false),
139+
new TestInput("\\n", "\n", false),
140+
new TestInput("\\f", "\f", false),
141+
new TestInput("\\r", "\r", false),
142+
new TestInput("\\\"", "\"", true),
143+
new TestInput("\\/", "/", true),
144+
new TestInput("\\\\", "\\", true) };
145+
146+
private int randomCodepoint(boolean includeAscii) {
147+
while (true) {
148+
char val = Character.toChars(randomInt(0xFFFF))[0];
149+
if (val <= 0x7f && includeAscii == false) {
150+
continue;
119151
}
120-
if (c == '\\') {
121-
return false;
152+
if (val >= Character.MIN_SURROGATE && val <= Character.MAX_SURROGATE) {
153+
continue;
122154
}
123-
if ((int) c < 32 || (int) c >= 128) {
124-
return false;
155+
return val;
156+
}
157+
}
158+
159+
private TestInput buildRandomInput(int length) {
160+
StringBuilder input = new StringBuilder(length);
161+
StringBuilder result = new StringBuilder(length);
162+
boolean forceSupportOptimized = randomBoolean();
163+
boolean doesSupportOptimized = true;
164+
for (int i = 0; i < length; ++i) {
165+
if (forceSupportOptimized == false && randomBoolean()) {
166+
switch (randomInt(9)) {
167+
case 0 -> {
168+
var escape = randomFrom(ESCAPE_SEQUENCES);
169+
input.append(escape.input());
170+
result.append(escape.result());
171+
doesSupportOptimized = doesSupportOptimized && escape.supportsOptimized();
172+
}
173+
case 1 -> {
174+
int value = randomCodepoint(true);
175+
input.append(String.format(Locale.ENGLISH, "\\u%04x", value));
176+
result.append(Character.toChars(value));
177+
doesSupportOptimized = false;
178+
}
179+
default -> {
180+
var value = Character.toChars(randomCodepoint(false));
181+
input.append(value);
182+
result.append(value);
183+
}
184+
}
185+
} else {
186+
var value = randomAlphanumericOfLength(1);
187+
input.append(value);
188+
result.append(value);
125189
}
126190
}
127-
return true;
191+
return new TestInput(input.toString(), result.toString(), doesSupportOptimized);
128192
}
129193

130194
public void testGetValueRandomized() throws IOException {
131-
XContentBuilder jsonBuilder = JsonXContent.contentBuilder().startObject();
195+
StringBuilder inputBuilder = new StringBuilder();
196+
inputBuilder.append('{');
197+
132198
final int numKeys = 128;
133199
String[] keys = new String[numKeys];
134-
String[] values = new String[numKeys];
200+
TestInput[] inputs = new TestInput[numKeys];
135201
for (int i = 0; i < numKeys; i++) {
136202
String currKey = randomAlphanumericOfLength(6);
137-
String currVal = randomUnicodeOfLengthBetween(0, 512);
138-
jsonBuilder.field(currKey, currVal);
203+
var currVal = buildRandomInput(randomInt(512));
204+
inputBuilder.append('"');
205+
inputBuilder.append(currKey);
206+
inputBuilder.append("\":\"");
207+
inputBuilder.append(currVal.input());
208+
inputBuilder.append('"');
209+
if (i < numKeys - 1) {
210+
inputBuilder.append(',');
211+
}
139212
keys[i] = currKey;
140-
values[i] = currVal;
213+
inputs[i] = currVal;
141214
}
142215

143-
jsonBuilder.endObject();
144-
testParseJson(Strings.toString(jsonBuilder), parser -> {
216+
inputBuilder.append('}');
217+
testParseJson(inputBuilder.toString(), parser -> {
145218
assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT));
146219
for (int i = 0; i < numKeys; i++) {
147220
assertThat(parser.nextFieldName(), Matchers.equalTo(keys[i]));
148221
assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
149222

150-
String currVal = values[i];
151-
if (validForTextRef(currVal)) {
223+
String currVal = inputs[i].result();
224+
if (inputs[i].supportsOptimized()) {
152225
assertTextRef(parser.getValueAsText().bytes(), currVal);
153226
} else {
154227
assertThat(parser.getValueAsText(), Matchers.nullValue());

0 commit comments

Comments
 (0)