Skip to content

Commit 0548fe2

Browse files
jordan-powers authored and valeriy42 committed
Skip UTF8 to UTF16 conversion during document indexing (elastic#126492)
When parsing documents, we receive the document as UTF-8 encoded data, which we then parse, converting the fields to Java-native UTF-16 encoded Strings. We then convert these Strings back to UTF-8 for storage in Lucene. This patch skips the redundant conversion, instead passing Lucene a direct reference to the received UTF-8 bytes when possible.
1 parent 59207c2 commit 0548fe2

File tree

19 files changed

+675
-14
lines changed

19 files changed

+675
-14
lines changed
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the "Elastic License
4+
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
5+
* Public License v 1"; you may not use this file except in compliance with, at
6+
* your election, the "Elastic License 2.0", the "GNU Affero General Public
7+
* License v3.0 only", or the "Server Side Public License, v 1".
8+
*/
9+
10+
package org.elasticsearch.benchmark.xcontent;
11+
12+
import org.elasticsearch.benchmark.index.mapper.MapperServiceFactory;
13+
import org.elasticsearch.common.UUIDs;
14+
import org.elasticsearch.common.bytes.BytesReference;
15+
import org.elasticsearch.common.logging.LogConfigurator;
16+
import org.elasticsearch.index.mapper.MapperService;
17+
import org.elasticsearch.index.mapper.SourceToParse;
18+
import org.elasticsearch.xcontent.XContentBuilder;
19+
import org.elasticsearch.xcontent.XContentFactory;
20+
import org.elasticsearch.xcontent.XContentType;
21+
import org.openjdk.jmh.annotations.Benchmark;
22+
import org.openjdk.jmh.annotations.BenchmarkMode;
23+
import org.openjdk.jmh.annotations.Fork;
24+
import org.openjdk.jmh.annotations.Level;
25+
import org.openjdk.jmh.annotations.Measurement;
26+
import org.openjdk.jmh.annotations.Mode;
27+
import org.openjdk.jmh.annotations.OutputTimeUnit;
28+
import org.openjdk.jmh.annotations.Param;
29+
import org.openjdk.jmh.annotations.Scope;
30+
import org.openjdk.jmh.annotations.Setup;
31+
import org.openjdk.jmh.annotations.State;
32+
import org.openjdk.jmh.annotations.Threads;
33+
import org.openjdk.jmh.annotations.Warmup;
34+
import org.openjdk.jmh.infra.Blackhole;
35+
36+
import java.io.IOException;
37+
import java.util.Random;
38+
import java.util.concurrent.TimeUnit;
39+
40+
/**
41+
* Benchmark to measure indexing performance of keyword fields. Used to measure performance impact of skipping
42+
* UTF-8 to UTF-16 conversion during document parsing.
43+
*/
44+
@BenchmarkMode(Mode.AverageTime)
45+
@OutputTimeUnit(TimeUnit.MILLISECONDS)
46+
@State(Scope.Benchmark)
47+
@Fork(1)
48+
@Threads(1)
49+
@Warmup(iterations = 1)
50+
@Measurement(iterations = 5)
51+
public class OptimizedTextBenchmark {
52+
static {
53+
// For Elasticsearch900Lucene101Codec:
54+
LogConfigurator.loadLog4jPlugins();
55+
LogConfigurator.configureESLogging();
56+
LogConfigurator.setNodeName("test");
57+
}
58+
59+
/**
60+
* Total number of documents to index.
61+
*/
62+
@Param("1048576")
63+
private int nDocs;
64+
65+
private MapperService mapperService;
66+
private SourceToParse[] sources;
67+
68+
private String randomValue(int length) {
69+
final String CHARS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
70+
Random random = new Random();
71+
StringBuilder builder = new StringBuilder(length);
72+
for (int i = 0; i < length; i++) {
73+
builder.append(CHARS.charAt(random.nextInt(CHARS.length())));
74+
}
75+
return builder.toString();
76+
}
77+
78+
@Setup(Level.Trial)
79+
public void setup() throws IOException {
80+
mapperService = MapperServiceFactory.create("""
81+
{
82+
"_doc": {
83+
"dynamic": false,
84+
"properties": {
85+
"field": {
86+
"type": "keyword"
87+
}
88+
}
89+
}
90+
}
91+
""");
92+
93+
sources = new SourceToParse[nDocs];
94+
for (int i = 0; i < nDocs; i++) {
95+
XContentBuilder b = XContentFactory.jsonBuilder();
96+
b.startObject().field("field", randomValue(8)).endObject();
97+
sources[i] = new SourceToParse(UUIDs.randomBase64UUID(), BytesReference.bytes(b), XContentType.JSON);
98+
}
99+
}
100+
101+
@Benchmark
102+
public void indexDocuments(final Blackhole bh) {
103+
final var mapper = mapperService.documentMapper();
104+
for (int i = 0; i < nDocs; i++) {
105+
bh.consume(mapper.parse(sources[i]));
106+
}
107+
}
108+
}
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the "Elastic License
4+
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
5+
* Public License v 1"; you may not use this file except in compliance with, at
6+
* your election, the "Elastic License 2.0", the "GNU Affero General Public
7+
* License v3.0 only", or the "Server Side Public License, v 1".
8+
*/
9+
10+
package org.elasticsearch.xcontent.provider.json;
11+
12+
import com.fasterxml.jackson.core.JsonEncoding;
13+
import com.fasterxml.jackson.core.JsonFactory;
14+
import com.fasterxml.jackson.core.JsonFactoryBuilder;
15+
import com.fasterxml.jackson.core.JsonParser;
16+
import com.fasterxml.jackson.core.io.IOContext;
17+
import com.fasterxml.jackson.core.json.ByteSourceJsonBootstrapper;
18+
import com.fasterxml.jackson.core.sym.ByteQuadsCanonicalizer;
19+
20+
import java.io.IOException;
21+
22+
public class ESJsonFactory extends JsonFactory {
23+
ESJsonFactory(JsonFactoryBuilder b) {
24+
super(b);
25+
}
26+
27+
@Override
28+
protected JsonParser _createParser(byte[] data, int offset, int len, IOContext ctxt) throws IOException {
29+
if (len > 0
30+
&& Feature.CHARSET_DETECTION.enabledIn(_factoryFeatures)
31+
&& Feature.CANONICALIZE_FIELD_NAMES.enabledIn(_factoryFeatures)) {
32+
var bootstrap = new ByteSourceJsonBootstrapper(ctxt, data, offset, len);
33+
var encoding = bootstrap.detectEncoding();
34+
if (encoding == JsonEncoding.UTF8) {
35+
boolean invalidBom = false;
36+
int ptr = offset;
37+
// Skip over the BOM if present
38+
if ((data[ptr] & 0xFF) == 0xEF) {
39+
if (len < 3) {
40+
invalidBom = true;
41+
} else if ((data[ptr + 1] & 0xFF) != 0xBB) {
42+
invalidBom = true;
43+
} else if ((data[ptr + 2] & 0xFF) != 0xBF) {
44+
invalidBom = true;
45+
} else {
46+
ptr += 3;
47+
}
48+
}
49+
if (invalidBom == false) {
50+
ByteQuadsCanonicalizer can = _byteSymbolCanonicalizer.makeChild(_factoryFeatures);
51+
return new ESUTF8StreamJsonParser(
52+
ctxt,
53+
_parserFeatures,
54+
null,
55+
_objectCodec,
56+
can,
57+
data,
58+
ptr,
59+
offset + len,
60+
ptr - offset,
61+
false
62+
);
63+
}
64+
}
65+
}
66+
return new ByteSourceJsonBootstrapper(ctxt, data, offset, len).constructParser(
67+
_parserFeatures,
68+
_objectCodec,
69+
_byteSymbolCanonicalizer,
70+
_rootCharSymbols,
71+
_factoryFeatures
72+
);
73+
}
74+
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the "Elastic License
4+
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
5+
* Public License v 1"; you may not use this file except in compliance with, at
6+
* your election, the "Elastic License 2.0", the "GNU Affero General Public
7+
* License v3.0 only", or the "Server Side Public License, v 1".
8+
*/
9+
10+
package org.elasticsearch.xcontent.provider.json;
11+
12+
import com.fasterxml.jackson.core.JsonFactory;
13+
import com.fasterxml.jackson.core.JsonFactoryBuilder;
14+
15+
public class ESJsonFactoryBuilder extends JsonFactoryBuilder {
16+
@Override
17+
public JsonFactory build() {
18+
return new ESJsonFactory(this);
19+
}
20+
}
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the "Elastic License
4+
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
5+
* Public License v 1"; you may not use this file except in compliance with, at
6+
* your election, the "Elastic License 2.0", the "GNU Affero General Public
7+
* License v3.0 only", or the "Server Side Public License, v 1".
8+
*/
9+
10+
package org.elasticsearch.xcontent.provider.json;
11+
12+
import com.fasterxml.jackson.core.JsonToken;
13+
import com.fasterxml.jackson.core.ObjectCodec;
14+
import com.fasterxml.jackson.core.SerializableString;
15+
import com.fasterxml.jackson.core.io.IOContext;
16+
import com.fasterxml.jackson.core.json.UTF8StreamJsonParser;
17+
import com.fasterxml.jackson.core.sym.ByteQuadsCanonicalizer;
18+
19+
import org.elasticsearch.xcontent.Text;
20+
import org.elasticsearch.xcontent.XContentString;
21+
22+
import java.io.IOException;
23+
import java.io.InputStream;
24+
25+
public class ESUTF8StreamJsonParser extends UTF8StreamJsonParser {
26+
protected int stringEnd = -1;
27+
28+
public ESUTF8StreamJsonParser(
29+
IOContext ctxt,
30+
int features,
31+
InputStream in,
32+
ObjectCodec codec,
33+
ByteQuadsCanonicalizer sym,
34+
byte[] inputBuffer,
35+
int start,
36+
int end,
37+
int bytesPreProcessed,
38+
boolean bufferRecyclable
39+
) {
40+
super(ctxt, features, in, codec, sym, inputBuffer, start, end, bytesPreProcessed, bufferRecyclable);
41+
}
42+
43+
/**
44+
* Method that will try to get underlying UTF-8 encoded bytes of the current string token.
45+
* This is only a best-effort attempt; if there is some reason the bytes cannot be retrieved, this method will return null.
46+
* Currently, this is only implemented for ascii-only strings that do not contain escaped characters.
47+
*/
48+
public Text getValueAsText() throws IOException {
49+
if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete) {
50+
if (stringEnd > 0) {
51+
final int len = stringEnd - 1 - _inputPtr;
52+
// For now, we can use `len` for `stringLength` because we only support ascii-encoded unescaped strings,
53+
// which means each character uses exactly 1 byte.
54+
return new Text(new XContentString.UTF8Bytes(_inputBuffer, _inputPtr, len), len);
55+
}
56+
return _finishAndReturnText();
57+
}
58+
return null;
59+
}
60+
61+
protected Text _finishAndReturnText() throws IOException {
62+
int ptr = _inputPtr;
63+
if (ptr >= _inputEnd) {
64+
_loadMoreGuaranteed();
65+
ptr = _inputPtr;
66+
}
67+
68+
int startPtr = ptr;
69+
final int[] codes = INPUT_CODES_UTF8;
70+
final int max = _inputEnd;
71+
final byte[] inputBuffer = _inputBuffer;
72+
while (ptr < max) {
73+
int c = inputBuffer[ptr] & 0xFF;
74+
if (codes[c] != 0) {
75+
if (c == INT_QUOTE) {
76+
stringEnd = ptr + 1;
77+
final int len = ptr - startPtr;
78+
// For now, we can use `len` for `stringLength` because we only support ascii-encoded unescaped strings,
79+
// which means each character uses exactly 1 byte.
80+
return new Text(new XContentString.UTF8Bytes(inputBuffer, startPtr, len), len);
81+
}
82+
return null;
83+
}
84+
++ptr;
85+
}
86+
return null;
87+
}
88+
89+
@Override
90+
public JsonToken nextToken() throws IOException {
91+
if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete && stringEnd > 0) {
92+
_inputPtr = stringEnd;
93+
_tokenIncomplete = false;
94+
}
95+
stringEnd = -1;
96+
return super.nextToken();
97+
}
98+
99+
@Override
100+
public boolean nextFieldName(SerializableString str) throws IOException {
101+
if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete && stringEnd > 0) {
102+
_inputPtr = stringEnd;
103+
_tokenIncomplete = false;
104+
}
105+
stringEnd = -1;
106+
return super.nextFieldName(str);
107+
}
108+
109+
@Override
110+
public String nextFieldName() throws IOException {
111+
if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete && stringEnd > 0) {
112+
_inputPtr = stringEnd;
113+
_tokenIncomplete = false;
114+
}
115+
stringEnd = -1;
116+
return super.nextFieldName();
117+
}
118+
}

libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/JsonXContentImpl.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111

1212
import com.fasterxml.jackson.core.JsonEncoding;
1313
import com.fasterxml.jackson.core.JsonFactory;
14-
import com.fasterxml.jackson.core.JsonFactoryBuilder;
1514
import com.fasterxml.jackson.core.JsonGenerator;
1615
import com.fasterxml.jackson.core.JsonParser;
1716

@@ -47,7 +46,7 @@ public static final XContent jsonXContent() {
4746
}
4847

4948
static {
50-
jsonFactory = XContentImplUtils.configure(new JsonFactoryBuilder());
49+
jsonFactory = XContentImplUtils.configure(new ESJsonFactoryBuilder());
5150
jsonFactory.configure(JsonGenerator.Feature.QUOTE_FIELD_NAMES, true);
5251
jsonFactory.configure(JsonParser.Feature.ALLOW_COMMENTS, true);
5352
jsonFactory.configure(JsonFactory.Feature.FAIL_ON_SYMBOL_HASH_OVERFLOW, false); // this trips on many mappings now...

libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/JsonXContentParser.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,12 @@
1818
import com.fasterxml.jackson.core.io.JsonEOFException;
1919

2020
import org.elasticsearch.core.IOUtils;
21+
import org.elasticsearch.xcontent.Text;
2122
import org.elasticsearch.xcontent.XContentEOFException;
2223
import org.elasticsearch.xcontent.XContentLocation;
2324
import org.elasticsearch.xcontent.XContentParseException;
2425
import org.elasticsearch.xcontent.XContentParserConfiguration;
26+
import org.elasticsearch.xcontent.XContentString;
2527
import org.elasticsearch.xcontent.XContentType;
2628
import org.elasticsearch.xcontent.provider.XContentParserConfigurationImpl;
2729
import org.elasticsearch.xcontent.support.AbstractXContentParser;
@@ -115,6 +117,20 @@ public String text() throws IOException {
115117
}
116118
}
117119

120+
@Override
121+
public XContentString optimizedText() throws IOException {
122+
if (currentToken().isValue() == false) {
123+
throwOnNoText();
124+
}
125+
if (parser instanceof ESUTF8StreamJsonParser esParser) {
126+
var bytesRef = esParser.getValueAsText();
127+
if (bytesRef != null) {
128+
return bytesRef;
129+
}
130+
}
131+
return new Text(text());
132+
}
133+
118134
private void throwOnNoText() {
119135
throw new IllegalArgumentException("Expected text at " + getTokenLocation() + " but found " + currentToken());
120136
}

0 commit comments

Comments
 (0)