Improve ASCII performance (#568)

sugmanue · web-flow · commit 107c6fb3845e · 2025-04-06T18:07:46.000-07:00
diff --git a/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java b/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java
@@ -5,15 +5,19 @@
 import java.math.BigInteger;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Stack;
 
 import com.fasterxml.jackson.core.*;
 import com.fasterxml.jackson.core.base.ParserMinimalBase;
 import com.fasterxml.jackson.core.io.IOContext;
 import com.fasterxml.jackson.core.io.NumberInput;
 import com.fasterxml.jackson.core.json.DupDetector;
 import com.fasterxml.jackson.core.sym.ByteQuadsCanonicalizer;
-import com.fasterxml.jackson.core.util.*;
+import com.fasterxml.jackson.core.util.ByteArrayBuilder;
+import com.fasterxml.jackson.core.util.JacksonFeatureSet;
+import com.fasterxml.jackson.core.util.TextBuffer;
 
 import static com.fasterxml.jackson.dataformat.cbor.CBORConstants.*;
 
@@ -2289,10 +2293,9 @@ protected void _finishToken() throws IOException
 
         if ((available >= len)
                 // if not, could we read? NOTE: we do not require it, just attempt to read
-                    || ((_inputBuffer.length >= len)
-                            && _tryToLoadToHaveAtLeast(len))) {
-                _finishShortText(len);
-                return;
+                || _tryToLoadToHaveAtLeast(len)) {
+            _finishShortText(len);
+            return;
         }
         // If not enough space, need handling similar to chunked
         _finishLongText(len);
@@ -2331,11 +2334,9 @@ protected String _finishTextToken(int ch) throws IOException
         //    due to inputBuffer never being even close to that big).
 
         final int available = _inputEnd - _inputPtr;
-
         if ((available >= len)
             // if not, could we read? NOTE: we do not require it, just attempt to read
-                || ((_inputBuffer.length >= len)
-                        && _tryToLoadToHaveAtLeast(len))) {
+                || _tryToLoadToHaveAtLeast(len)) {
             return _finishShortText(len);
         }
         // If not enough space, need handling similar to chunked
@@ -2364,19 +2365,22 @@ private final String _finishShortText(int len) throws IOException
 
         // Let's actually do a tight loop for ASCII first:
         final int end = _inputPtr;
-
-        int i;
-        while ((i = inputBuf[inPtr]) >= 0) {
+        int i = 0;
+        while (inPtr < end && i >= 0) {
+            i = inputBuf[inPtr++];
             outBuf[outPtr++] = (char) i;
-            if (++inPtr == end) {
-                String str = _textBuffer.setCurrentAndReturn(outPtr);
-                if (stringRefs != null) {
-                    stringRefs.stringRefs.add(str);
-                    _sharedString = str;
-                }
-                return str;
+        }
+        if (inPtr == end && i >= 0) {
+            String str = _textBuffer.setCurrentAndReturn(outPtr);
+            if (stringRefs != null) {
+                stringRefs.stringRefs.add(str);
+                _sharedString = str;
             }
+            return str;
         }
+        // Correct extra increments
+        outPtr -= 1;
+        inPtr -= 1;
         final int[] codes = UTF8_UNIT_CODES;
         do {
             i = inputBuf[inPtr++] & 0xFF;
@@ -2443,10 +2447,17 @@ private final String _finishShortText(int len) throws IOException
 
     private final String _finishLongText(int len) throws IOException
     {
-        char[] outBuf = _textBuffer.emptyAndGetCurrentSegment();
-        int outPtr = 0;
-        final int[] codes = UTF8_UNIT_CODES;
+        StringRefList stringRefs = null;
+        if (!_stringRefs.empty() &&
+                shouldReferenceString(_stringRefs.peek().stringRefs.size(), len)) {
+            stringRefs = _stringRefs.peek();
+        }
+        // First a tight loop for ASCII.
+        len = _finishLongTextAscii(len);
+        char[] outBuf = _textBuffer.getBufferWithoutReset();
+        int outPtr = _textBuffer.getCurrentSegmentSize();
         int outEnd = outBuf.length;
+        final int[] codes = UTF8_UNIT_CODES;
 
         while (--len >= 0) {
             int c = _nextByte() & 0xFF;
@@ -2500,14 +2511,51 @@ private final String _finishLongText(int len) throws IOException
             outBuf[outPtr++] = (char) c;
         }
         String str = _textBuffer.setCurrentAndReturn(outPtr);
-        if (!_stringRefs.empty() &&
-                shouldReferenceString(_stringRefs.peek().stringRefs.size(), len)) {
-            _stringRefs.peek().stringRefs.add(str);
+        if (stringRefs != null) {
+            stringRefs.stringRefs.add(str);
             _sharedString = str;
         }
         return str;
     }
 
+    /**
+     * Consumes as many ascii chars as possible in a tight loop. Returns the amount of bytes remaining.
+     */
+    private final int _finishLongTextAscii(int len) throws IOException
+    {
+        char[] outBuf = _textBuffer.emptyAndGetCurrentSegment();
+        final byte[] input = _inputBuffer;
+        while (len > 0) {
+            // load as much input as possible
+            int size = Math.min(len, Math.min(outBuf.length, input.length));
+            if (!_tryToLoadToHaveAtLeast(size)) {
+                return len;
+            }
+            int outEnd = size;
+            int outPtr = 0;
+            int inPtr = _inputPtr;
+            int i = 0;
+            // Tight loop to copy into the output buffer, bail if a non-ascii char is found
+            while (outPtr < outEnd && i >= 0) {
+                i = input[inPtr++];
+                outBuf[outPtr++] = (char) i;
+            }
+            // Found a non-ascii char, correct pointers and return to the caller.
+            if (i < 0) {
+                --outPtr;
+                _inputPtr = inPtr - 1;
+                _textBuffer.setCurrentLength(outPtr);
+                return len - outPtr;
+            }
+            _inputPtr = inPtr;
+            if (outPtr >= outBuf.length) {
+                outBuf = _textBuffer.finishCurrentSegment();
+            }
+            len -= size;
+        }
+        return len;
+    }
+
     private final void _finishChunkedText() throws IOException
     {
         char[] outBuf = _textBuffer.emptyAndGetCurrentSegment();
@@ -2532,7 +2580,6 @@ private final void _finishChunkedText() throws IOException
                         }
                         break;
                     }
-                    _chunkLeft = len;
                     int end = _inputPtr + len;
                     if (end <= _inputEnd) { // all within buffer
                         _chunkLeft = 0;
@@ -2541,19 +2588,22 @@ private final void _finishChunkedText() throws IOException
                         _chunkLeft = (end - _inputEnd);
                         _chunkEnd = _inputEnd;
                     }
-                }
-                // besides of which just need to ensure there's content
-                if (_inputPtr >= _inputEnd) { // end of buffer, but not necessarily chunk
-                    loadMoreGuaranteed();
-                    int end = _inputPtr + _chunkLeft;
-                    if (end <= _inputEnd) { // all within buffer
-                        _chunkLeft = 0;
-                        _chunkEnd = end;
-                    } else { // stretches beyond
-                        _chunkLeft = (end - _inputEnd);
-                        _chunkEnd = _inputEnd;
+                    // start of a new chunk
+                    // First a tight loop for ASCII.
+                    _textBuffer.setCurrentLength(outPtr);
+                    if (_finishChunkedTextAscii()) {
+                        // chunk fully consumed, let's get the next one
+                        outBuf = _textBuffer.getBufferWithoutReset();
+                        outPtr = _textBuffer.getCurrentSegmentSize();
+                        outEnd = outBuf.length;
+                        continue;
                     }
+                    outBuf = _textBuffer.getBufferWithoutReset();
+                    outPtr = _textBuffer.getCurrentSegmentSize();
+                    outEnd = outBuf.length;
                 }
+                // besides of which just need to ensure there's content
+                _loadMoreForChunkIfNeeded();
             }
             int c = input[_inputPtr++] & 0xFF;
             int code = codes[c];
@@ -2563,9 +2613,9 @@ private final void _finishChunkedText() throws IOException
             }
 
             switch (code) {
-            case 0:
-                break;
-            case 1: // 2-byte UTF
+                case 0:
+                    break;
+                case 1: // 2-byte UTF
                 {
                     int d = _nextChunkedByte();
                     if ((d & 0xC0) != 0x080) {
@@ -2574,24 +2624,24 @@ private final void _finishChunkedText() throws IOException
                     c = ((c & 0x1F) << 6) | (d & 0x3F);
                 }
                 break;
-            case 2: // 3-byte UTF
-                c = _decodeChunkedUTF8_3(c);
-                break;
-            case 3: // 4-byte UTF
-                c = _decodeChunkedUTF8_4(c);
-                // Let's add first part right away:
-                if (outPtr >= outBuf.length) {
-                    outBuf = _textBuffer.finishCurrentSegment();
-                    outPtr = 0;
-                    outEnd = outBuf.length;
-                }
-                outBuf[outPtr++] = (char) (0xD800 | (c >> 10));
-                c = 0xDC00 | (c & 0x3FF);
-                // And let the other char output down below
-                break;
-            default:
-                // Is this good enough error message?
-                _reportInvalidInitial(c);
+                case 2: // 3-byte UTF
+                    c = _decodeChunkedUTF8_3(c);
+                    break;
+                case 3: // 4-byte UTF
+                    c = _decodeChunkedUTF8_4(c);
+                    // Let's add first part right away:
+                    if (outPtr >= outBuf.length) {
+                        outBuf = _textBuffer.finishCurrentSegment();
+                        outPtr = 0;
+                        outEnd = outBuf.length;
+                    }
+                    outBuf[outPtr++] = (char) (0xD800 | (c >> 10));
+                    c = 0xDC00 | (c & 0x3FF);
+                    // And let the other char output down below
+                    break;
+                default:
+                    // Is this good enough error message?
+                    _reportInvalidInitial(c);
             }
             // Need more room?
             if (outPtr >= outEnd) {
@@ -2602,9 +2652,75 @@ private final void _finishChunkedText() throws IOException
             // Ok, let's add char to output:
             outBuf[outPtr++] = (char) c;
         }
+
         _textBuffer.setCurrentLength(outPtr);
     }
 
+    /**
+     * Reads in a tight loop ASCII text until a non-ASCII char is found. If any, then it returns false to signal the
+     * caller that the chunk wasn't finished. The caller will keep adding to the _outBuf at the _outPtr position to
+     * finish the current text buffer segment
+     */
+    private final boolean _finishChunkedTextAscii() throws IOException
+    {
+        final byte[] input = _inputBuffer;
+        int outPtr = _textBuffer.getCurrentSegmentSize();
+        char[] outBuf = _textBuffer.getBufferWithoutReset();
+        int outEnd = outBuf.length;
+        while (true) {
+            // besides of which just need to ensure there's content
+            _loadMoreForChunkIfNeeded();
+
+            // Find the size of the loop
+            int inSize =  _chunkEnd - _inputPtr;
+            int outSize = outEnd - outPtr;
+            int inputPtr = _inputPtr;
+            int inputPtrEnd = _inputPtr + Math.min(inSize, outSize);
+            int i = 0;
+            // loop with copying what we can.
+            while (inputPtr < inputPtrEnd && i >= 0) {
+                i = input[inputPtr++];
+                char val = (char) i;
+                outBuf[outPtr++] = val;
+            }
+            _inputPtr = inputPtr;
+
+            if (i < 0) {
+                // Found a non-ascii char, correct pointers and return to the caller.
+                _inputPtr -= 1;
+                _textBuffer.setCurrentLength(outPtr - 1);
+                // return false to signal this to the calling code to allow the multi-byte code-path to kick.
+                return false;
+            }
+            // Need more room?
+            if (outPtr >= outEnd) {
+                outBuf = _textBuffer.finishCurrentSegment();
+                outPtr = 0;
+                outEnd = outBuf.length;
+            }
+            if (_inputPtr < _chunkEnd || _chunkLeft > 0) {
+                continue;
+            }
+            _textBuffer.setCurrentLength(outPtr);
+            return true;
+        }
+    }
+
+    private final void _loadMoreForChunkIfNeeded() throws IOException
+    {
+        if (_inputPtr >= _inputEnd) { // end of buffer, but not necessarily chunk
+            loadMoreGuaranteed();
+            int end = _inputPtr + _chunkLeft;
+            if (end <= _inputEnd) { // all within buffer
+                _chunkLeft = 0;
+                _chunkEnd = end;
+            } else { // stretches beyond
+                _chunkLeft = (end - _inputEnd);
+                _chunkEnd = _inputEnd;
+            }
+        }
+    }
+
     private final int _nextByte() throws IOException {
         int inPtr = _inputPtr;
         if (inPtr < _inputEnd) {
@@ -3716,6 +3832,10 @@ protected final boolean _tryToLoadToHaveAtLeast(int minAvailable) throws IOExcep
         if (_inputStream == null) {
             return false;
         }
+        // The code below assumes this is true, so we check it here.
+        if (_inputBuffer.length < minAvailable)  {
+            return false;
+        }
         // Need to move remaining data in front?
         int amount = _inputEnd - _inputPtr;
         if (amount > 0 && _inputPtr > 0) {
diff --git a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java
@@ -216,6 +216,10 @@ protected static String generateUnicodeString(int length) {
         return generateUnicodeString(length, new Random(length));
     }
 
+    protected static String generateUnicodeStringWithAsciiPrefix(int asciiPrefixLen, int length) {
+        return generateUnicodeStringWithAsciiPrefix(asciiPrefixLen, length, new Random(length));
+    }
+
     protected static String generateUnicodeString(int length, Random rnd)
     {
         StringBuilder sw = new StringBuilder(length+10);
@@ -241,6 +245,31 @@ protected static String generateUnicodeString(int length, Random rnd)
         return sw.toString();
     }
 
+    protected static String generateUnicodeStringWithAsciiPrefix(int asciiLength, int length, Random rnd)
+    {
+        StringBuilder sw = new StringBuilder(length+10);
+        // add a prefix of ascii chars
+        int num = asciiLength;
+        while (--num >= 0) {
+            sw.append((char) ('A' + (num % 32)));
+        }
+        do {
+            // Then a unicode char of 2, 3 or 4 bytes long
+            switch (rnd.nextInt() % 3) {
+                case 0:
+                    sw.append((char) (256 + rnd.nextInt() & 511));
+                    break;
+                case 1:
+                    sw.append((char) (2048 + rnd.nextInt() & 4095));
+                    break;
+                default:
+                    sw.append((char) (65536 + rnd.nextInt() & 0x3FFF));
+                    break;
+            }
+        } while (sw.length() < length);
+        return sw.toString();
+    }
+
     protected static String generateLongAsciiString(int length) {
         return generateLongAsciiString(length, new Random(length));
     }
diff --git a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/parse/BasicParserTest.java b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/parse/BasicParserTest.java