elastic · jordan-powers · Jun 12, 2025 · Jun 9, 2025 · Jun 9, 2025 · Jun 9, 2025
@@ -21,9 +21,14 @@
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
 
 public class ESUTF8StreamJsonParser extends UTF8StreamJsonParser {
     protected int stringEnd = -1;
+    protected int stringLength;
+
+    private final List<Integer> backslashes = new ArrayList<>();
 
     public ESUTF8StreamJsonParser(
         IOContext ctxt,
@@ -43,15 +48,12 @@ public ESUTF8StreamJsonParser(
     /**
      * Method that will try to get underlying UTF-8 encoded bytes of the current string token.
      * This is only a best-effort attempt; if there is some reason the bytes cannot be retrieved, this method will return null.
-     * Currently, this is only implemented for ascii-only strings that do not contain escaped characters.
      */
     public Text getValueAsText() throws IOException {
         if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete) {
-            if (stringEnd > 0) {
+            if (stringEnd > 0 && backslashes.isEmpty()) { // cached span is raw input; unusable once escapes were stripped
                 final int len = stringEnd - 1 - _inputPtr;
-                // For now, we can use `len` for `stringLength` because we only support ascii-encoded unescaped strings,
-                // which means each character uses exactly 1 byte.
-                return new Text(new XContentString.UTF8Bytes(_inputBuffer, _inputPtr, len), len);
+                return new Text(new XContentString.UTF8Bytes(_inputBuffer, _inputPtr, len), stringLength);
             }
             return _finishAndReturnText();
         }
@@ -69,21 +71,71 @@ protected Text _finishAndReturnText() throws IOException {
         final int[] codes = INPUT_CODES_UTF8;
         final int max = _inputEnd;
         final byte[] inputBuffer = _inputBuffer;
-        while (ptr < max) {
+        stringLength = 0;
+        backslashes.clear();
+
+        loop: while (true) {
+            if (ptr >= max) {
+                return null;
+            }
             int c = inputBuffer[ptr] & 0xFF;
-            if (codes[c] != 0) {
-                if (c == INT_QUOTE) {
-                    stringEnd = ptr + 1;
-                    final int len = ptr - startPtr;
-                    // For now, we can use `len` for `stringLength` because we only support ascii-encoded unescaped strings,
-                    // which means each character uses exactly 1 byte.
-                    return new Text(new XContentString.UTF8Bytes(inputBuffer, startPtr, len), len);
+            switch (codes[c]) {
+                case 0 -> {
+                    ++ptr;
+                    ++stringLength;
+                }
+                case 1 -> {
+                    if (c == INT_QUOTE) {
+                        // End of the string
+                        break loop;
+                    }
+                    assert c == INT_BACKSLASH;
+                    backslashes.add(ptr);
+                    ++ptr;
+                    if (ptr >= max) {
+                        // Backslash is the last byte before _inputEnd (not necessarily end of file), so the escaped character is not available here
+                        return null;
+                    }
+                    c = inputBuffer[ptr] & 0xFF;
+                    if (c == '"' || c == '/' || c == '\\') {
+                        ptr += 1;
+                        stringLength += 1;
+                    } else {
+                        // Any other escape sequence decodes to a character different from the byte
+                        // that follows the backslash, which the optimized path does not support
+                        return null;
+                    }
+                }
+                case 2, 3, 4 -> {
+                    int bytesToSkip = codes[c];
+                    if (ptr + bytesToSkip > max) {
+                        return null;
+                    }
+                    ptr += bytesToSkip;
+                    ++stringLength;
+                }
+                default -> {
+                    return null;
                 }
-                return null;
             }
-            ++ptr;
         }
-        return null;
+
+        stringEnd = ptr + 1;
+        if (backslashes.isEmpty()) {
+            return new Text(new XContentString.UTF8Bytes(inputBuffer, startPtr, ptr - startPtr), stringLength);
+        } else {
+            byte[] buff = new byte[ptr - startPtr - backslashes.size()];
+            int copyPtr = startPtr;
+            int destPtr = 0;
+            for (Integer backslash : backslashes) {
+                int length = backslash - copyPtr;
+                System.arraycopy(inputBuffer, copyPtr, buff, destPtr, length);
+                destPtr += length;
+                copyPtr = backslash + 1;
+            }
+            System.arraycopy(inputBuffer, copyPtr, buff, destPtr, ptr - copyPtr);
+            return new Text(new XContentString.UTF8Bytes(buff), stringLength);
+        }
     }
 
     @Override

@@ -13,16 +13,14 @@
 import com.fasterxml.jackson.core.JsonParser;
 import com.fasterxml.jackson.core.JsonToken;
 
-import org.elasticsearch.common.Strings;
 import org.elasticsearch.core.CheckedConsumer;
 import org.elasticsearch.test.ESTestCase;
-import org.elasticsearch.xcontent.XContentBuilder;
 import org.elasticsearch.xcontent.XContentString;
-import org.elasticsearch.xcontent.json.JsonXContent;
 import org.hamcrest.Matchers;
 
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
+import java.util.Locale;
 
 public class ESUTF8StreamJsonParserTests extends ESTestCase {
 
@@ -45,11 +43,13 @@ public void testGetValueAsText() throws IOException {
             assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
             assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
 
-            var textRef = parser.getValueAsText().bytes();
-            assertThat(textRef, Matchers.notNullValue());
-            assertThat(textRef.offset(), Matchers.equalTo(9));
-            assertThat(textRef.offset() + textRef.length(), Matchers.equalTo(12));
-            assertTextRef(textRef, "bar");
+            var text = parser.getValueAsText();
+            assertThat(text, Matchers.notNullValue());
+
+            var bytes = text.bytes();
+            assertThat(bytes.offset(), Matchers.equalTo(9));
+            assertThat(bytes.offset() + bytes.length(), Matchers.equalTo(12));
+            assertTextRef(bytes, "bar");
 
             assertThat(parser.getValueAsString(), Matchers.equalTo("bar"));
             assertThat(parser.getValueAsText(), Matchers.nullValue());
@@ -62,17 +62,36 @@ public void testGetValueAsText() throws IOException {
             assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
             assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
 
+            var text = parser.getValueAsText();
+            assertThat(text, Matchers.notNullValue());
+            assertTextRef(text.bytes(), "bar\"baz\"");
+        });
+
+        testParseJson("{\"foo\": \"b\\u00e5r\"}", parser -> {
+            assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT));
+            assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
+            assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
+
             assertThat(parser.getValueAsText(), Matchers.nullValue());
-            assertThat(parser.getValueAsString(), Matchers.equalTo("bar\"baz\""));
+            assertThat(parser.getValueAsString(), Matchers.equalTo("bår"));
         });
 
         testParseJson("{\"foo\": \"bår\"}", parser -> {
             assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT));
             assertThat(parser.nextFieldName(), Matchers.equalTo("foo"));
             assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
 
-            assertThat(parser.getValueAsText(), Matchers.nullValue());
+            var text = parser.getValueAsText();
+            assertThat(text, Matchers.notNullValue());
+
+            var bytes = text.bytes();
+            assertThat(bytes.offset(), Matchers.equalTo(9));
+            assertThat(bytes.offset() + bytes.length(), Matchers.equalTo(13));
+            assertTextRef(bytes, "bår");
+
             assertThat(parser.getValueAsString(), Matchers.equalTo("bår"));
+
+            assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.END_OBJECT));
         });
 
         testParseJson("{\"foo\": [\"lorem\", \"ipsum\", \"dolor\"]}", parser -> {
@@ -112,43 +131,97 @@ public void testGetValueAsText() throws IOException {
         });
     }
 
-    private boolean validForTextRef(String value) {
-        for (char c : value.toCharArray()) {
-            if (c == '"') {
-                return false;
+    private record TestInput(String input, String result, boolean supportsOptimized) {}
+
+    private static final TestInput[] ESCAPE_SEQUENCES = {
+        new TestInput("\\b", "\b", false),
+        new TestInput("\\t", "\t", false),
+        new TestInput("\\n", "\n", false),
+        new TestInput("\\f", "\f", false),
+        new TestInput("\\r", "\r", false),
+        new TestInput("\\\"", "\"", true),
+        new TestInput("\\/", "/", true),
+        new TestInput("\\\\", "\\", true) };
+
+    private int randomCodepoint(boolean includeAscii) {
+        while (true) {
+            char val = Character.toChars(randomInt(0xFFFF))[0];
+            if (val <= 0x7f && includeAscii == false) {
+                continue;
             }
-            if (c == '\\') {
-                return false;
+            if (val >= Character.MIN_SURROGATE && val <= Character.MAX_SURROGATE) {
+                continue;
             }
-            if ((int) c < 32 || (int) c >= 128) {
-                return false;
+            return val;
+        }
+    }
+
+    private TestInput buildRandomInput(int length) {
+        StringBuilder input = new StringBuilder(length);
+        StringBuilder result = new StringBuilder(length);
+        boolean forceSupportOptimized = randomBoolean();
+        boolean doesSupportOptimized = true;
+        for (int i = 0; i < length; ++i) {
+            if (forceSupportOptimized == false && randomBoolean()) {
+                switch (randomInt(9)) {
+                    case 0 -> {
+                        var escape = randomFrom(ESCAPE_SEQUENCES);
+                        input.append(escape.input());
+                        result.append(escape.result());
+                        doesSupportOptimized = doesSupportOptimized && escape.supportsOptimized();
+                    }
+                    case 1 -> {
+                        int value = randomCodepoint(true);
+                        input.append(String.format(Locale.ENGLISH, "\\u%04x", value));
+                        result.append(Character.toChars(value));
+                        doesSupportOptimized = false;
+                    }
+                    default -> {
+                        var value = Character.toChars(randomCodepoint(false));
+                        input.append(value);
+                        result.append(value);
+                    }
+                }
+            } else {
+                var value = randomAlphanumericOfLength(1);
+                input.append(value);
+                result.append(value);
             }
         }
-        return true;
+        return new TestInput(input.toString(), result.toString(), doesSupportOptimized);
     }
 
     public void testGetValueRandomized() throws IOException {
-        XContentBuilder jsonBuilder = JsonXContent.contentBuilder().startObject();
+        StringBuilder inputBuilder = new StringBuilder();
+        inputBuilder.append('{');
+
         final int numKeys = 128;
         String[] keys = new String[numKeys];
-        String[] values = new String[numKeys];
+        TestInput[] inputs = new TestInput[numKeys];
         for (int i = 0; i < numKeys; i++) {
             String currKey = randomAlphanumericOfLength(6);
-            String currVal = randomUnicodeOfLengthBetween(0, 512);
-            jsonBuilder.field(currKey, currVal);
+            var currVal = buildRandomInput(randomInt(512));
+            inputBuilder.append('"');
+            inputBuilder.append(currKey);
+            inputBuilder.append("\":\"");
+            inputBuilder.append(currVal.input());
+            inputBuilder.append('"');
+            if (i < numKeys - 1) {
+                inputBuilder.append(',');
+            }
             keys[i] = currKey;
-            values[i] = currVal;
+            inputs[i] = currVal;
         }
 
-        jsonBuilder.endObject();
-        testParseJson(Strings.toString(jsonBuilder), parser -> {
+        inputBuilder.append('}');
+        testParseJson(inputBuilder.toString(), parser -> {
             assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT));
             for (int i = 0; i < numKeys; i++) {
                 assertThat(parser.nextFieldName(), Matchers.equalTo(keys[i]));
                 assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING));
 
-                String currVal = values[i];
-                if (validForTextRef(currVal)) {
+                String currVal = inputs[i].result();
+                if (inputs[i].supportsOptimized()) {
                     assertTextRef(parser.getValueAsText().bytes(), currVal);
                 } else {
                     assertThat(parser.getValueAsText(), Matchers.nullValue());