diff --git a/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParser.java b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParser.java
index 5f91036dee96e..0491efcea0a0f 100644
--- a/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParser.java
+++ b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParser.java
@@ -21,9 +21,15 @@
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
 
 public class ESUTF8StreamJsonParser extends UTF8StreamJsonParser {
     protected int stringEnd = -1;
+    protected int stringLength;
+
+    // Input-buffer offsets of the backslashes seen in the current string; they are dropped when the value is unescaped.
+    private final List<Integer> backslashes = new ArrayList<>();
 
     public ESUTF8StreamJsonParser(
         IOContext ctxt,
@@ -43,15 +49,14 @@ public ESUTF8StreamJsonParser(
     /**
      * Method that will try to get underlying UTF-8 encoded bytes of the current string token.
      * This is only a best-effort attempt; if there is some reason the bytes cannot be retrieved, this method will return null.
-     * Currently, this is only implemented for ascii-only strings that do not contain escaped characters.
      */
     public Text getValueAsText() throws IOException {
         if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete) {
-            if (stringEnd > 0) {
+            // The raw input bytes can only be handed out again when the previous scan found no escape sequences;
+            // otherwise they still contain the backslashes, so fall through and rebuild the unescaped copy.
+            if (stringEnd > 0 && backslashes.isEmpty()) {
                 final int len = stringEnd - 1 - _inputPtr;
-                // For now, we can use `len` for `stringLength` because we only support ascii-encoded unescaped strings,
-                // which means each character uses exactly 1 byte.
-                return new Text(new XContentString.UTF8Bytes(_inputBuffer, _inputPtr, len), len);
+                return new Text(new XContentString.UTF8Bytes(_inputBuffer, _inputPtr, len), stringLength);
             }
             return _finishAndReturnText();
         }
@@ -69,21 +74,72 @@ protected Text _finishAndReturnText() throws IOException {
         final int[] codes = INPUT_CODES_UTF8;
         final int max = _inputEnd;
         final byte[] inputBuffer = _inputBuffer;
-        while (ptr < max) {
+        stringLength = 0;
+        backslashes.clear();
+
+        loop: while (true) {
+            if (ptr >= max) {
+                return null;
+            }
             int c = inputBuffer[ptr] & 0xFF;
-            if (codes[c] != 0) {
-                if (c == INT_QUOTE) {
-                    stringEnd = ptr + 1;
-                    final int len = ptr - startPtr;
-                    // For now, we can use `len` for `stringLength` because we only support ascii-encoded unescaped strings,
-                    // which means each character uses exactly 1 byte.
-                    return new Text(new XContentString.UTF8Bytes(inputBuffer, startPtr, len), len);
+            switch (codes[c]) {
+                case 0 -> {
+                    ++ptr;
+                    ++stringLength;
+                }
+                case 1 -> {
+                    if (c == INT_QUOTE) {
+                        // End of the string
+                        break loop;
+                    }
+                    assert c == INT_BACKSLASH;
+                    backslashes.add(ptr);
+                    ++ptr;
+                    if (ptr >= max) {
+                        // Backslash at end of file
+                        return null;
+                    }
+                    c = inputBuffer[ptr] & 0xFF;
+                    if (c == '"' || c == '/' || c == '\\') {
+                        ptr += 1;
+                        stringLength += 1;
+                    } else {
+                        // Any other escaped sequence requires replacing the sequence with
+                        // a new character, which we don't support in the optimized path
+                        return null;
+                    }
+                }
+                case 2, 3, 4 -> {
+                    int bytesToSkip = codes[c];
+                    if (ptr + bytesToSkip > max) {
+                        return null;
+                    }
+                    ptr += bytesToSkip;
+                    // Code points that take 4 bytes in UTF-8 are surrogate pairs, i.e. 2 chars, in a Java String.
+                    stringLength += (bytesToSkip == 4 ? 2 : 1);
+                }
+                default -> {
+                    return null;
                 }
-                return null;
             }
-            ++ptr;
         }
-        return null;
+
+        stringEnd = ptr + 1;
+        if (backslashes.isEmpty()) {
+            return new Text(new XContentString.UTF8Bytes(inputBuffer, startPtr, ptr - startPtr), stringLength);
+        } else {
+            byte[] buff = new byte[ptr - startPtr - backslashes.size()];
+            int copyPtr = startPtr;
+            int destPtr = 0;
+            for (Integer backslash : backslashes) {
+                int length = backslash - copyPtr;
+                System.arraycopy(inputBuffer, copyPtr, buff, destPtr, length);
+                destPtr += length;
+                copyPtr = backslash + 1;
+            }
+            System.arraycopy(inputBuffer, copyPtr, buff, destPtr, ptr - copyPtr);
+            return new Text(new XContentString.UTF8Bytes(buff), stringLength);
+        }
     }
 
     @Override
diff --git a/libs/x-content/impl/src/test/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParserTests.java b/libs/x-content/impl/src/test/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParserTests.java
index 411f77be8b1e6..df52eeb3632ad 100644
--- a/libs/x-content/impl/src/test/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParserTests.java
+++ b/libs/x-content/impl/src/test/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParserTests.java
@@ -13,16 +13,14 @@
 import com.fasterxml.jackson.core.JsonParser;
 import com.fasterxml.jackson.core.JsonToken;
 
-import org.elasticsearch.common.Strings;
 import org.elasticsearch.core.CheckedConsumer;
 import org.elasticsearch.test.ESTestCase;
-import org.elasticsearch.xcontent.XContentBuilder;
 import org.elasticsearch.xcontent.XContentString;
-import org.elasticsearch.xcontent.json.JsonXContent;
 import org.hamcrest.Matchers;
 
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
+import java.util.Locale;
 
 public class ESUTF8StreamJsonParserTests extends ESTestCase {
 
@@ -45,11 +43,13 @@ public void testGetValueAsText() throws IOException {
             assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
             assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
 
-            var textRef = parser.getValueAsText().bytes();
-            assertThat(textRef, Matchers.notNullValue());
-            assertThat(textRef.offset(), Matchers.equalTo(9));
-            assertThat(textRef.offset() + textRef.length(), Matchers.equalTo(12));
-            assertTextRef(textRef, "bar");
+            var text = parser.getValueAsText();
+            assertThat(text, Matchers.notNullValue());
+
+            var bytes = text.bytes();
+            assertThat(bytes.offset(), Matchers.equalTo(9));
+            assertThat(bytes.offset() + bytes.length(), Matchers.equalTo(12));
+            assertTextRef(bytes, "bar");
 
             assertThat(parser.getValueAsString(), Matchers.equalTo("bar"));
             assertThat(parser.getValueAsText(), Matchers.nullValue());
@@ -62,8 +62,20 @@ public void testGetValueAsText() throws IOException {
             assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
             assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
 
+            var text = parser.getValueAsText();
+            assertThat(text, Matchers.notNullValue());
+            assertTextRef(text.bytes(), "bar\"baz\"");
+            // A repeated call must return the unescaped bytes again, not the raw escaped input
+            assertTextRef(parser.getValueAsText().bytes(), "bar\"baz\"");
+        });
+
+        testParseJson("{\"foo\": \"b\\u00e5r\"}", parser -> {
+            assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT));
+            assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
+            assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
+
             assertThat(parser.getValueAsText(), Matchers.nullValue());
-            assertThat(parser.getValueAsString(), Matchers.equalTo("bar\"baz\""));
+            assertThat(parser.getValueAsString(), Matchers.equalTo("bår"));
         });
 
         testParseJson("{\"foo\": \"bår\"}", parser -> {
@@ -71,8 +83,17 @@ public void testGetValueAsText() throws IOException {
             assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
             assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
 
-            assertThat(parser.getValueAsText(), Matchers.nullValue());
+            var text = parser.getValueAsText();
+            assertThat(text, Matchers.notNullValue());
+
+            var bytes = text.bytes();
+            assertThat(bytes.offset(), Matchers.equalTo(9));
+            assertThat(bytes.offset() + bytes.length(), Matchers.equalTo(13));
+            assertTextRef(bytes, "bår");
+
             assertThat(parser.getValueAsString(), Matchers.equalTo("bår"));
+
+            assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.END_OBJECT));
         });
 
         testParseJson("{\"foo\": [\"lorem\", \"ipsum\", \"dolor\"]}", parser -> {
@@ -112,43 +133,97 @@ public void testGetValueAsText() throws IOException {
         });
     }
 
-    private boolean validForTextRef(String value) {
-        for (char c : value.toCharArray()) {
-            if (c == '"') {
-                return false;
+    private record TestInput(String input, String result, boolean supportsOptimized) {}
+
+    private static final TestInput[] ESCAPE_SEQUENCES = {
+        new TestInput("\\b", "\b", false),
+        new TestInput("\\t", "\t", false),
+        new TestInput("\\n", "\n", false),
+        new TestInput("\\f", "\f", false),
+        new TestInput("\\r", "\r", false),
+        new TestInput("\\\"", "\"", true),
+        new TestInput("\\/", "/", true),
+        new TestInput("\\\\", "\\", true) };
+
+    private int randomCodepoint(boolean includeAscii) {
+        while (true) {
+            char val = Character.toChars(randomInt(0xFFFF))[0];
+            if (val <= 0x7f && includeAscii == false) {
+                continue;
             }
-            if (c == '\\') {
-                return false;
+            if (val >= Character.MIN_SURROGATE && val <= Character.MAX_SURROGATE) {
+                continue;
             }
-            if ((int) c < 32 || (int) c >= 128) {
-                return false;
+            return val;
+        }
+    }
+
+    private TestInput buildRandomInput(int length) {
+        StringBuilder input = new StringBuilder(length);
+        StringBuilder result = new StringBuilder(length);
+        boolean forceSupportOptimized = randomBoolean();
+        boolean doesSupportOptimized = true;
+        for (int i = 0; i < length; ++i) {
+            if (forceSupportOptimized == false && randomBoolean()) {
+                switch (randomInt(9)) {
+                    case 0 -> {
+                        var escape = randomFrom(ESCAPE_SEQUENCES);
+                        input.append(escape.input());
+                        result.append(escape.result());
+                        doesSupportOptimized = doesSupportOptimized && escape.supportsOptimized();
+                    }
+                    case 1 -> {
+                        int value = randomCodepoint(true);
+                        input.append(String.format(Locale.ENGLISH, "\\u%04x", value));
+                        result.append(Character.toChars(value));
+                        doesSupportOptimized = false;
+                    }
+                    default -> {
+                        var value = Character.toChars(randomCodepoint(false));
+                        input.append(value);
+                        result.append(value);
+                    }
+                }
+            } else {
+                var value = randomAlphanumericOfLength(1);
+                input.append(value);
+                result.append(value);
             }
         }
-        return true;
+        return new TestInput(input.toString(), result.toString(), doesSupportOptimized);
     }
 
     public void testGetValueRandomized() throws IOException {
-        XContentBuilder jsonBuilder = JsonXContent.contentBuilder().startObject();
+        StringBuilder inputBuilder = new StringBuilder();
+        inputBuilder.append('{');
+
         final int numKeys = 128;
         String[] keys = new String[numKeys];
-        String[] values = new String[numKeys];
+        TestInput[] inputs = new TestInput[numKeys];
         for (int i = 0; i < numKeys; i++) {
             String currKey = randomAlphanumericOfLength(6);
-            String currVal = randomUnicodeOfLengthBetween(0, 512);
-            jsonBuilder.field(currKey, currVal);
+            var currVal = buildRandomInput(randomInt(512));
+            inputBuilder.append('"');
+            inputBuilder.append(currKey);
+            inputBuilder.append("\":\"");
+            inputBuilder.append(currVal.input());
+            inputBuilder.append('"');
+            if (i < numKeys - 1) {
+                inputBuilder.append(',');
+            }
             keys[i] = currKey;
-            values[i] = currVal;
+            inputs[i] = currVal;
         }
 
-        jsonBuilder.endObject();
-        testParseJson(Strings.toString(jsonBuilder), parser -> {
+        inputBuilder.append('}');
+        testParseJson(inputBuilder.toString(), parser -> {
             assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT));
             for (int i = 0; i < numKeys; i++) {
                 assertThat(parser.nextFieldName(), Matchers.equalTo(keys[i]));
                 assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
 
-                String currVal = values[i];
-                if (validForTextRef(currVal)) {
+                String currVal = inputs[i].result();
+                if (inputs[i].supportsOptimized()) {
                     assertTextRef(parser.getValueAsText().bytes(), currVal);
                 } else {
                     assertThat(parser.getValueAsText(), Matchers.nullValue());