Skip to content

Commit 107c6fb

Browse files
authored
Improve ASCII performance (#568)
1 parent 5888efa commit 107c6fb

File tree

3 files changed

+236
-60
lines changed

3 files changed

+236
-60
lines changed

cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java

Lines changed: 178 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,19 @@
55
import java.math.BigInteger;
66
import java.nio.charset.Charset;
77
import java.nio.charset.StandardCharsets;
8-
import java.util.*;
8+
import java.util.ArrayList;
9+
import java.util.Arrays;
10+
import java.util.Stack;
911

1012
import com.fasterxml.jackson.core.*;
1113
import com.fasterxml.jackson.core.base.ParserMinimalBase;
1214
import com.fasterxml.jackson.core.io.IOContext;
1315
import com.fasterxml.jackson.core.io.NumberInput;
1416
import com.fasterxml.jackson.core.json.DupDetector;
1517
import com.fasterxml.jackson.core.sym.ByteQuadsCanonicalizer;
16-
import com.fasterxml.jackson.core.util.*;
18+
import com.fasterxml.jackson.core.util.ByteArrayBuilder;
19+
import com.fasterxml.jackson.core.util.JacksonFeatureSet;
20+
import com.fasterxml.jackson.core.util.TextBuffer;
1721

1822
import static com.fasterxml.jackson.dataformat.cbor.CBORConstants.*;
1923

@@ -2289,10 +2293,9 @@ protected void _finishToken() throws IOException
22892293

22902294
if ((available >= len)
22912295
// if not, could we read? NOTE: we do not require it, just attempt to read
2292-
|| ((_inputBuffer.length >= len)
2293-
&& _tryToLoadToHaveAtLeast(len))) {
2294-
_finishShortText(len);
2295-
return;
2296+
|| _tryToLoadToHaveAtLeast(len)) {
2297+
_finishShortText(len);
2298+
return;
22962299
}
22972300
// If not enough space, need handling similar to chunked
22982301
_finishLongText(len);
@@ -2331,11 +2334,9 @@ protected String _finishTextToken(int ch) throws IOException
23312334
// due to inputBuffer never being even close to that big).
23322335

23332336
final int available = _inputEnd - _inputPtr;
2334-
23352337
if ((available >= len)
23362338
// if not, could we read? NOTE: we do not require it, just attempt to read
2337-
|| ((_inputBuffer.length >= len)
2338-
&& _tryToLoadToHaveAtLeast(len))) {
2339+
|| _tryToLoadToHaveAtLeast(len)) {
23392340
return _finishShortText(len);
23402341
}
23412342
// If not enough space, need handling similar to chunked
@@ -2364,19 +2365,22 @@ private final String _finishShortText(int len) throws IOException
23642365

23652366
// Let's actually do a tight loop for ASCII first:
23662367
final int end = _inputPtr;
2367-
2368-
int i;
2369-
while ((i = inputBuf[inPtr]) >= 0) {
2368+
int i = 0;
2369+
while (inPtr < end && i >= 0) {
2370+
i = inputBuf[inPtr++];
23702371
outBuf[outPtr++] = (char) i;
2371-
if (++inPtr == end) {
2372-
String str = _textBuffer.setCurrentAndReturn(outPtr);
2373-
if (stringRefs != null) {
2374-
stringRefs.stringRefs.add(str);
2375-
_sharedString = str;
2376-
}
2377-
return str;
2372+
}
2373+
if (inPtr == end && i >= 0) {
2374+
String str = _textBuffer.setCurrentAndReturn(outPtr);
2375+
if (stringRefs != null) {
2376+
stringRefs.stringRefs.add(str);
2377+
_sharedString = str;
23782378
}
2379+
return str;
23792380
}
2381+
// Correct extra increments
2382+
outPtr -= 1;
2383+
inPtr -= 1;
23802384
final int[] codes = UTF8_UNIT_CODES;
23812385
do {
23822386
i = inputBuf[inPtr++] & 0xFF;
@@ -2443,10 +2447,17 @@ private final String _finishShortText(int len) throws IOException
24432447

24442448
private final String _finishLongText(int len) throws IOException
24452449
{
2446-
char[] outBuf = _textBuffer.emptyAndGetCurrentSegment();
2447-
int outPtr = 0;
2448-
final int[] codes = UTF8_UNIT_CODES;
2450+
StringRefList stringRefs = null;
2451+
if (!_stringRefs.empty() &&
2452+
shouldReferenceString(_stringRefs.peek().stringRefs.size(), len)) {
2453+
stringRefs = _stringRefs.peek();
2454+
}
2455+
// First a tight loop for ASCII.
2456+
len = _finishLongTextAscii(len);
2457+
char[] outBuf = _textBuffer.getBufferWithoutReset();
2458+
int outPtr = _textBuffer.getCurrentSegmentSize();
24492459
int outEnd = outBuf.length;
2460+
final int[] codes = UTF8_UNIT_CODES;
24502461

24512462
while (--len >= 0) {
24522463
int c = _nextByte() & 0xFF;
@@ -2500,14 +2511,51 @@ private final String _finishLongText(int len) throws IOException
25002511
outBuf[outPtr++] = (char) c;
25012512
}
25022513
String str = _textBuffer.setCurrentAndReturn(outPtr);
2503-
if (!_stringRefs.empty() &&
2504-
shouldReferenceString(_stringRefs.peek().stringRefs.size(), len)) {
2505-
_stringRefs.peek().stringRefs.add(str);
2514+
if (stringRefs != null) {
2515+
stringRefs.stringRefs.add(str);
25062516
_sharedString = str;
25072517
}
25082518
return str;
25092519
}
25102520

2521+
/**
2522+
* Consumes as many ascii chars as possible in a tight loop. Returns the amount of bytes remaining.
2523+
*/
2524+
private final int _finishLongTextAscii(int len) throws IOException
2525+
{
2526+
char[] outBuf = _textBuffer.emptyAndGetCurrentSegment();
2527+
final byte[] input = _inputBuffer;
2528+
while (len > 0) {
2529+
// load as much input as possible
2530+
int size = Math.min(len, Math.min(outBuf.length, input.length));
2531+
if (!_tryToLoadToHaveAtLeast(size)) {
2532+
return len;
2533+
}
2534+
int outEnd = size;
2535+
int outPtr = 0;
2536+
int inPtr = _inputPtr;
2537+
int i = 0;
2538+
// Tight loop to copy into the output buffer, bail if a non-ascii char is found
2539+
while (outPtr < outEnd && i >= 0) {
2540+
i = input[inPtr++];
2541+
outBuf[outPtr++] = (char) i;
2542+
}
2543+
// Found a non-ascii char, correct pointers and return to the caller.
2544+
if (i < 0) {
2545+
--outPtr;
2546+
_inputPtr = inPtr - 1;
2547+
_textBuffer.setCurrentLength(outPtr);
2548+
return len - outPtr;
2549+
}
2550+
_inputPtr = inPtr;
2551+
if (outPtr >= outBuf.length) {
2552+
outBuf = _textBuffer.finishCurrentSegment();
2553+
}
2554+
len -= size;
2555+
}
2556+
return len;
2557+
}
2558+
25112559
private final void _finishChunkedText() throws IOException
25122560
{
25132561
char[] outBuf = _textBuffer.emptyAndGetCurrentSegment();
@@ -2532,7 +2580,6 @@ private final void _finishChunkedText() throws IOException
25322580
}
25332581
break;
25342582
}
2535-
_chunkLeft = len;
25362583
int end = _inputPtr + len;
25372584
if (end <= _inputEnd) { // all within buffer
25382585
_chunkLeft = 0;
@@ -2541,19 +2588,22 @@ private final void _finishChunkedText() throws IOException
25412588
_chunkLeft = (end - _inputEnd);
25422589
_chunkEnd = _inputEnd;
25432590
}
2544-
}
2545-
// besides of which just need to ensure there's content
2546-
if (_inputPtr >= _inputEnd) { // end of buffer, but not necessarily chunk
2547-
loadMoreGuaranteed();
2548-
int end = _inputPtr + _chunkLeft;
2549-
if (end <= _inputEnd) { // all within buffer
2550-
_chunkLeft = 0;
2551-
_chunkEnd = end;
2552-
} else { // stretches beyond
2553-
_chunkLeft = (end - _inputEnd);
2554-
_chunkEnd = _inputEnd;
2591+
// start of a new chunk
2592+
// First a tight loop for ASCII.
2593+
_textBuffer.setCurrentLength(outPtr);
2594+
if (_finishChunkedTextAscii()) {
2595+
// chunk fully consumed, let's get the next one
2596+
outBuf = _textBuffer.getBufferWithoutReset();
2597+
outPtr = _textBuffer.getCurrentSegmentSize();
2598+
outEnd = outBuf.length;
2599+
continue;
25552600
}
2601+
outBuf = _textBuffer.getBufferWithoutReset();
2602+
outPtr = _textBuffer.getCurrentSegmentSize();
2603+
outEnd = outBuf.length;
25562604
}
2605+
// besides of which just need to ensure there's content
2606+
_loadMoreForChunkIfNeeded();
25572607
}
25582608
int c = input[_inputPtr++] & 0xFF;
25592609
int code = codes[c];
@@ -2563,9 +2613,9 @@ private final void _finishChunkedText() throws IOException
25632613
}
25642614

25652615
switch (code) {
2566-
case 0:
2567-
break;
2568-
case 1: // 2-byte UTF
2616+
case 0:
2617+
break;
2618+
case 1: // 2-byte UTF
25692619
{
25702620
int d = _nextChunkedByte();
25712621
if ((d & 0xC0) != 0x080) {
@@ -2574,24 +2624,24 @@ private final void _finishChunkedText() throws IOException
25742624
c = ((c & 0x1F) << 6) | (d & 0x3F);
25752625
}
25762626
break;
2577-
case 2: // 3-byte UTF
2578-
c = _decodeChunkedUTF8_3(c);
2579-
break;
2580-
case 3: // 4-byte UTF
2581-
c = _decodeChunkedUTF8_4(c);
2582-
// Let's add first part right away:
2583-
if (outPtr >= outBuf.length) {
2584-
outBuf = _textBuffer.finishCurrentSegment();
2585-
outPtr = 0;
2586-
outEnd = outBuf.length;
2587-
}
2588-
outBuf[outPtr++] = (char) (0xD800 | (c >> 10));
2589-
c = 0xDC00 | (c & 0x3FF);
2590-
// And let the other char output down below
2591-
break;
2592-
default:
2593-
// Is this good enough error message?
2594-
_reportInvalidInitial(c);
2627+
case 2: // 3-byte UTF
2628+
c = _decodeChunkedUTF8_3(c);
2629+
break;
2630+
case 3: // 4-byte UTF
2631+
c = _decodeChunkedUTF8_4(c);
2632+
// Let's add first part right away:
2633+
if (outPtr >= outBuf.length) {
2634+
outBuf = _textBuffer.finishCurrentSegment();
2635+
outPtr = 0;
2636+
outEnd = outBuf.length;
2637+
}
2638+
outBuf[outPtr++] = (char) (0xD800 | (c >> 10));
2639+
c = 0xDC00 | (c & 0x3FF);
2640+
// And let the other char output down below
2641+
break;
2642+
default:
2643+
// Is this good enough error message?
2644+
_reportInvalidInitial(c);
25952645
}
25962646
// Need more room?
25972647
if (outPtr >= outEnd) {
@@ -2602,9 +2652,75 @@ private final void _finishChunkedText() throws IOException
26022652
// Ok, let's add char to output:
26032653
outBuf[outPtr++] = (char) c;
26042654
}
2655+
26052656
_textBuffer.setCurrentLength(outPtr);
26062657
}
26072658

2659+
/**
2660+
* Reads in a tight loop ASCII text until a non-ASCII char is found. If any, then it returns false to signal the
2661+
* caller that the chunk wasn't finished. The caller will keep adding to the _outBuf at the _outPtr position to
2662+
* finish the current text buffer segment
2663+
*/
2664+
private final boolean _finishChunkedTextAscii() throws IOException
2665+
{
2666+
final byte[] input = _inputBuffer;
2667+
int outPtr = _textBuffer.getCurrentSegmentSize();
2668+
char[] outBuf = _textBuffer.getBufferWithoutReset();
2669+
int outEnd = outBuf.length;
2670+
while (true) {
2671+
// besides of which just need to ensure there's content
2672+
_loadMoreForChunkIfNeeded();
2673+
2674+
// Find the size of the loop
2675+
int inSize = _chunkEnd - _inputPtr;
2676+
int outSize = outEnd - outPtr;
2677+
int inputPtr = _inputPtr;
2678+
int inputPtrEnd = _inputPtr + Math.min(inSize, outSize);
2679+
int i = 0;
2680+
// loop with copying what we can.
2681+
while (inputPtr < inputPtrEnd && i >= 0) {
2682+
i = input[inputPtr++];
2683+
char val = (char) i;
2684+
outBuf[outPtr++] = val;
2685+
}
2686+
_inputPtr = inputPtr;
2687+
2688+
if (i < 0) {
2689+
// Found a non-ascii char, correct pointers and return to the caller.
2690+
_inputPtr -= 1;
2691+
_textBuffer.setCurrentLength(outPtr - 1);
2692+
// return false to signal this to the calling code to allow the multi-byte code-path to kick.
2693+
return false;
2694+
}
2695+
// Need more room?
2696+
if (outPtr >= outEnd) {
2697+
outBuf = _textBuffer.finishCurrentSegment();
2698+
outPtr = 0;
2699+
outEnd = outBuf.length;
2700+
}
2701+
if (_inputPtr < _chunkEnd || _chunkLeft > 0) {
2702+
continue;
2703+
}
2704+
_textBuffer.setCurrentLength(outPtr);
2705+
return true;
2706+
}
2707+
}
2708+
2709+
private final void _loadMoreForChunkIfNeeded() throws IOException
2710+
{
2711+
if (_inputPtr >= _inputEnd) { // end of buffer, but not necessarily chunk
2712+
loadMoreGuaranteed();
2713+
int end = _inputPtr + _chunkLeft;
2714+
if (end <= _inputEnd) { // all within buffer
2715+
_chunkLeft = 0;
2716+
_chunkEnd = end;
2717+
} else { // stretches beyond
2718+
_chunkLeft = (end - _inputEnd);
2719+
_chunkEnd = _inputEnd;
2720+
}
2721+
}
2722+
}
2723+
26082724
private final int _nextByte() throws IOException {
26092725
int inPtr = _inputPtr;
26102726
if (inPtr < _inputEnd) {
@@ -3716,6 +3832,10 @@ protected final boolean _tryToLoadToHaveAtLeast(int minAvailable) throws IOExcep
37163832
if (_inputStream == null) {
37173833
return false;
37183834
}
3835+
// The code below assumes this is true, so we check it here.
3836+
if (_inputBuffer.length < minAvailable) {
3837+
return false;
3838+
}
37193839
// Need to move remaining data in front?
37203840
int amount = _inputEnd - _inputPtr;
37213841
if (amount > 0 && _inputPtr > 0) {

cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,10 @@ protected static String generateUnicodeString(int length) {
216216
return generateUnicodeString(length, new Random(length));
217217
}
218218

219+
protected static String generateUnicodeStringWithAsciiPrefix(int asciiPrefixLen, int length) {
220+
return generateUnicodeStringWithAsciiPrefix(asciiPrefixLen, length, new Random(length));
221+
}
222+
219223
protected static String generateUnicodeString(int length, Random rnd)
220224
{
221225
StringBuilder sw = new StringBuilder(length+10);
@@ -241,6 +245,31 @@ protected static String generateUnicodeString(int length, Random rnd)
241245
return sw.toString();
242246
}
243247

248+
protected static String generateUnicodeStringWithAsciiPrefix(int asciiLength, int length, Random rnd)
249+
{
250+
StringBuilder sw = new StringBuilder(length+10);
251+
// add a prefix of ascii chars
252+
int num = asciiLength;
253+
while (--num >= 0) {
254+
sw.append((char) ('A' + (num % 32)));
255+
}
256+
do {
257+
// Then a unicode char of 2, 3 or 4 bytes long
258+
switch (rnd.nextInt() % 3) {
259+
case 0:
260+
sw.append((char) (256 + rnd.nextInt() & 511));
261+
break;
262+
case 1:
263+
sw.append((char) (2048 + rnd.nextInt() & 4095));
264+
break;
265+
default:
266+
sw.append((char) (65536 + rnd.nextInt() & 0x3FFF));
267+
break;
268+
}
269+
} while (sw.length() < length);
270+
return sw.toString();
271+
}
272+
244273
protected static String generateLongAsciiString(int length) {
245274
return generateLongAsciiString(length, new Random(length));
246275
}

0 commit comments

Comments
 (0)