diff --git a/benchmarks/src/main/java/org/elasticsearch/benchmark/xcontent/OptimizedTextBenchmark.java b/benchmarks/src/main/java/org/elasticsearch/benchmark/xcontent/OptimizedTextBenchmark.java new file mode 100644 index 0000000000000..9984205445fcc --- /dev/null +++ b/benchmarks/src/main/java/org/elasticsearch/benchmark/xcontent/OptimizedTextBenchmark.java @@ -0,0 +1,108 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.benchmark.xcontent; + +import org.elasticsearch.benchmark.index.mapper.MapperServiceFactory; +import org.elasticsearch.common.UUIDs; +import org.elasticsearch.common.bytes.BytesReference; +import org.elasticsearch.common.logging.LogConfigurator; +import org.elasticsearch.index.mapper.MapperService; +import org.elasticsearch.index.mapper.SourceToParse; +import org.elasticsearch.xcontent.XContentBuilder; +import org.elasticsearch.xcontent.XContentFactory; +import org.elasticsearch.xcontent.XContentType; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Threads; +import org.openjdk.jmh.annotations.Warmup; +import 
org.openjdk.jmh.infra.Blackhole; + +import java.io.IOException; +import java.util.Random; +import java.util.concurrent.TimeUnit; + +/** + * Benchmark to measure indexing performance of keyword fields. Used to measure performance impact of skipping + * UTF-8 to UTF-16 conversion during document parsing. + */ +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@State(Scope.Benchmark) +@Fork(1) +@Threads(1) +@Warmup(iterations = 1) +@Measurement(iterations = 5) +public class OptimizedTextBenchmark { + static { + // For Elasticsearch900Lucene101Codec: + LogConfigurator.loadLog4jPlugins(); + LogConfigurator.configureESLogging(); + LogConfigurator.setNodeName("test"); + } + + /** + * Total number of documents to index. + */ + @Param("1048576") + private int nDocs; + + private MapperService mapperService; + private SourceToParse[] sources; + + private String randomValue(int length) { + final String CHARS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + Random random = new Random(); + StringBuilder builder = new StringBuilder(length); + for (int i = 0; i < length; i++) { + builder.append(CHARS.charAt(random.nextInt(CHARS.length()))); + } + return builder.toString(); + } + + @Setup(Level.Trial) + public void setup() throws IOException { + mapperService = MapperServiceFactory.create(""" + { + "_doc": { + "dynamic": false, + "properties": { + "field": { + "type": "keyword" + } + } + } + } + """); + + sources = new SourceToParse[nDocs]; + for (int i = 0; i < nDocs; i++) { + XContentBuilder b = XContentFactory.jsonBuilder(); + b.startObject().field("field", randomValue(8)).endObject(); + sources[i] = new SourceToParse(UUIDs.randomBase64UUID(), BytesReference.bytes(b), XContentType.JSON); + } + } + + @Benchmark + public void indexDocuments(final Blackhole bh) { + final var mapper = mapperService.documentMapper(); + for (int i = 0; i < nDocs; i++) { + bh.consume(mapper.parse(sources[i])); + } + } +} diff --git 
a/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/ESJsonFactory.java b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/ESJsonFactory.java new file mode 100644 index 0000000000000..1ad4941dab51e --- /dev/null +++ b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/ESJsonFactory.java @@ -0,0 +1,74 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.xcontent.provider.json; + +import com.fasterxml.jackson.core.JsonEncoding; +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonFactoryBuilder; +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.io.IOContext; +import com.fasterxml.jackson.core.json.ByteSourceJsonBootstrapper; +import com.fasterxml.jackson.core.sym.ByteQuadsCanonicalizer; + +import java.io.IOException; + +public class ESJsonFactory extends JsonFactory { + ESJsonFactory(JsonFactoryBuilder b) { + super(b); + } + + @Override + protected JsonParser _createParser(byte[] data, int offset, int len, IOContext ctxt) throws IOException { + if (len > 0 + && Feature.CHARSET_DETECTION.enabledIn(_factoryFeatures) + && Feature.CANONICALIZE_FIELD_NAMES.enabledIn(_factoryFeatures)) { + var bootstrap = new ByteSourceJsonBootstrapper(ctxt, data, offset, len); + var encoding = bootstrap.detectEncoding(); + if (encoding == JsonEncoding.UTF8) { + boolean invalidBom = false; + int ptr = offset; + // Skip over the BOM if present + if ((data[ptr] & 0xFF) == 0xEF) { + if (len < 3) { + 
invalidBom = true; + } else if ((data[ptr + 1] & 0xFF) != 0xBB) { + invalidBom = true; + } else if ((data[ptr + 2] & 0xFF) != 0xBF) { + invalidBom = true; + } else { + ptr += 3; + } + } + if (invalidBom == false) { + ByteQuadsCanonicalizer can = _byteSymbolCanonicalizer.makeChild(_factoryFeatures); + return new ESUTF8StreamJsonParser( + ctxt, + _parserFeatures, + null, + _objectCodec, + can, + data, + ptr, + offset + len, + ptr - offset, + false + ); + } + } + } + return new ByteSourceJsonBootstrapper(ctxt, data, offset, len).constructParser( + _parserFeatures, + _objectCodec, + _byteSymbolCanonicalizer, + _rootCharSymbols, + _factoryFeatures + ); + } +} diff --git a/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/ESJsonFactoryBuilder.java b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/ESJsonFactoryBuilder.java new file mode 100644 index 0000000000000..2bb6905fe19d9 --- /dev/null +++ b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/ESJsonFactoryBuilder.java @@ -0,0 +1,20 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.xcontent.provider.json; + +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonFactoryBuilder; + +public class ESJsonFactoryBuilder extends JsonFactoryBuilder { + @Override + public JsonFactory build() { + return new ESJsonFactory(this); + } +} diff --git a/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParser.java b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParser.java new file mode 100644 index 0000000000000..5f91036dee96e --- /dev/null +++ b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParser.java @@ -0,0 +1,118 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.xcontent.provider.json; + +import com.fasterxml.jackson.core.JsonToken; +import com.fasterxml.jackson.core.ObjectCodec; +import com.fasterxml.jackson.core.SerializableString; +import com.fasterxml.jackson.core.io.IOContext; +import com.fasterxml.jackson.core.json.UTF8StreamJsonParser; +import com.fasterxml.jackson.core.sym.ByteQuadsCanonicalizer; + +import org.elasticsearch.xcontent.Text; +import org.elasticsearch.xcontent.XContentString; + +import java.io.IOException; +import java.io.InputStream; + +public class ESUTF8StreamJsonParser extends UTF8StreamJsonParser { + protected int stringEnd = -1; + + public ESUTF8StreamJsonParser( + IOContext ctxt, + int features, + InputStream in, + ObjectCodec codec, + ByteQuadsCanonicalizer sym, + byte[] inputBuffer, + int start, + int end, + int bytesPreProcessed, + boolean bufferRecyclable + ) { + super(ctxt, features, in, codec, sym, inputBuffer, start, end, bytesPreProcessed, bufferRecyclable); + } + + /** + * Method that will try to get underlying UTF-8 encoded bytes of the current string token. + * This is only a best-effort attempt; if there is some reason the bytes cannot be retrieved, this method will return null. + * Currently, this is only implemented for ascii-only strings that do not contain escaped characters. + */ + public Text getValueAsText() throws IOException { + if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete) { + if (stringEnd > 0) { + final int len = stringEnd - 1 - _inputPtr; + // For now, we can use `len` for `stringLength` because we only support ascii-encoded unescaped strings, + // which means each character uses exactly 1 byte. 
+ return new Text(new XContentString.UTF8Bytes(_inputBuffer, _inputPtr, len), len); + } + return _finishAndReturnText(); + } + return null; + } + + protected Text _finishAndReturnText() throws IOException { + int ptr = _inputPtr; + if (ptr >= _inputEnd) { + _loadMoreGuaranteed(); + ptr = _inputPtr; + } + + int startPtr = ptr; + final int[] codes = INPUT_CODES_UTF8; + final int max = _inputEnd; + final byte[] inputBuffer = _inputBuffer; + while (ptr < max) { + int c = inputBuffer[ptr] & 0xFF; + if (codes[c] != 0) { + if (c == INT_QUOTE) { + stringEnd = ptr + 1; + final int len = ptr - startPtr; + // For now, we can use `len` for `stringLength` because we only support ascii-encoded unescaped strings, + // which means each character uses exactly 1 byte. + return new Text(new XContentString.UTF8Bytes(inputBuffer, startPtr, len), len); + } + return null; + } + ++ptr; + } + return null; + } + + @Override + public JsonToken nextToken() throws IOException { + if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete && stringEnd > 0) { + _inputPtr = stringEnd; + _tokenIncomplete = false; + } + stringEnd = -1; + return super.nextToken(); + } + + @Override + public boolean nextFieldName(SerializableString str) throws IOException { + if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete && stringEnd > 0) { + _inputPtr = stringEnd; + _tokenIncomplete = false; + } + stringEnd = -1; + return super.nextFieldName(str); + } + + @Override + public String nextFieldName() throws IOException { + if (_currToken == JsonToken.VALUE_STRING && _tokenIncomplete && stringEnd > 0) { + _inputPtr = stringEnd; + _tokenIncomplete = false; + } + stringEnd = -1; + return super.nextFieldName(); + } +} diff --git a/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/JsonXContentImpl.java b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/JsonXContentImpl.java index 7f52467caf49b..067985b404f3e 100644 --- 
a/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/JsonXContentImpl.java +++ b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/JsonXContentImpl.java @@ -11,7 +11,6 @@ import com.fasterxml.jackson.core.JsonEncoding; import com.fasterxml.jackson.core.JsonFactory; -import com.fasterxml.jackson.core.JsonFactoryBuilder; import com.fasterxml.jackson.core.JsonGenerator; import com.fasterxml.jackson.core.JsonParser; @@ -47,7 +46,7 @@ public static final XContent jsonXContent() { } static { - jsonFactory = XContentImplUtils.configure(new JsonFactoryBuilder()); + jsonFactory = XContentImplUtils.configure(new ESJsonFactoryBuilder()); jsonFactory.configure(JsonGenerator.Feature.QUOTE_FIELD_NAMES, true); jsonFactory.configure(JsonParser.Feature.ALLOW_COMMENTS, true); jsonFactory.configure(JsonFactory.Feature.FAIL_ON_SYMBOL_HASH_OVERFLOW, false); // this trips on many mappings now... diff --git a/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/JsonXContentParser.java b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/JsonXContentParser.java index 38ef8bc2e4ef0..3471fd362f88e 100644 --- a/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/JsonXContentParser.java +++ b/libs/x-content/impl/src/main/java/org/elasticsearch/xcontent/provider/json/JsonXContentParser.java @@ -18,10 +18,12 @@ import com.fasterxml.jackson.core.io.JsonEOFException; import org.elasticsearch.core.IOUtils; +import org.elasticsearch.xcontent.Text; import org.elasticsearch.xcontent.XContentEOFException; import org.elasticsearch.xcontent.XContentLocation; import org.elasticsearch.xcontent.XContentParseException; import org.elasticsearch.xcontent.XContentParserConfiguration; +import org.elasticsearch.xcontent.XContentString; import org.elasticsearch.xcontent.XContentType; import org.elasticsearch.xcontent.provider.XContentParserConfigurationImpl; import 
org.elasticsearch.xcontent.support.AbstractXContentParser; @@ -115,6 +117,20 @@ public String text() throws IOException { } } + @Override + public XContentString optimizedText() throws IOException { + if (currentToken().isValue() == false) { + throwOnNoText(); + } + if (parser instanceof ESUTF8StreamJsonParser esParser) { + var bytesRef = esParser.getValueAsText(); + if (bytesRef != null) { + return bytesRef; + } + } + return new Text(text()); + } + private void throwOnNoText() { throw new IllegalArgumentException("Expected text at " + getTokenLocation() + " but found " + currentToken()); } diff --git a/libs/x-content/impl/src/test/java/org/elasticsearch/xcontent/provider/json/ESJsonFactoryTests.java b/libs/x-content/impl/src/test/java/org/elasticsearch/xcontent/provider/json/ESJsonFactoryTests.java new file mode 100644 index 0000000000000..78f9d83cf94aa --- /dev/null +++ b/libs/x-content/impl/src/test/java/org/elasticsearch/xcontent/provider/json/ESJsonFactoryTests.java @@ -0,0 +1,70 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.xcontent.provider.json; + +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonParseException; +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.JsonToken; +import com.fasterxml.jackson.core.json.ReaderBasedJsonParser; +import com.fasterxml.jackson.core.json.UTF8StreamJsonParser; + +import org.elasticsearch.test.ESTestCase; +import org.hamcrest.Matchers; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; + +public class ESJsonFactoryTests extends ESTestCase { + + public void testCreateParser() throws IOException { + JsonFactory factory = new ESJsonFactoryBuilder().build(); + assertThat(factory, Matchers.instanceOf(ESJsonFactory.class)); + + // \ufeff is the BOM + String[] inputs = { "{\"foo\": \"bar\"}", "\ufeff{\"foo\": \"bar\"}" }; + Charset[] charsets = { StandardCharsets.UTF_8, StandardCharsets.UTF_16LE, StandardCharsets.UTF_16BE }; + Class[] expectedParsers = { ESUTF8StreamJsonParser.class, ReaderBasedJsonParser.class, ReaderBasedJsonParser.class }; + + for (String input : inputs) { + for (int i = 0; i < charsets.length; i++) { + ByteBuffer encoded = charsets[i].encode(input); + JsonParser parser = factory.createParser(encoded.array()); + assertThat(parser, Matchers.instanceOf(expectedParsers[i])); + assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT)); + assertThat(parser.nextFieldName(), Matchers.equalTo("foo")); + assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING)); + assertThat(parser.getValueAsString(), Matchers.equalTo("bar")); + } + } + + // Valid BOM + { + JsonParser parser = factory.createParser(new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF, '{', '}' }); + assertThat(parser, Matchers.instanceOf(ESUTF8StreamJsonParser.class)); + assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT)); + } + + // Invalid BOMs 
+ { + JsonParser parser = factory.createParser(new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBB, '{', '}' }); + assertThat(parser, Matchers.instanceOf(UTF8StreamJsonParser.class)); + assertThrows("Invalid UTF-8 start byte 0xbb", JsonParseException.class, parser::nextToken); + } + + { + JsonParser parser = factory.createParser(new byte[] { (byte) 0xEF, '{', '}' }); + assertThat(parser, Matchers.instanceOf(UTF8StreamJsonParser.class)); + assertThrows("Invalid UTF-8 start byte 0x7b", JsonParseException.class, parser::nextToken); + } + } +} diff --git a/libs/x-content/impl/src/test/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParserTests.java b/libs/x-content/impl/src/test/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParserTests.java new file mode 100644 index 0000000000000..411f77be8b1e6 --- /dev/null +++ b/libs/x-content/impl/src/test/java/org/elasticsearch/xcontent/provider/json/ESUTF8StreamJsonParserTests.java @@ -0,0 +1,161 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.xcontent.provider.json; + +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.JsonToken; + +import org.elasticsearch.common.Strings; +import org.elasticsearch.core.CheckedConsumer; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.xcontent.XContentBuilder; +import org.elasticsearch.xcontent.XContentString; +import org.elasticsearch.xcontent.json.JsonXContent; +import org.hamcrest.Matchers; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; + +public class ESUTF8StreamJsonParserTests extends ESTestCase { + + private void testParseJson(String input, CheckedConsumer test) throws IOException { + JsonFactory factory = new ESJsonFactoryBuilder().build(); + assertThat(factory, Matchers.instanceOf(ESJsonFactory.class)); + + JsonParser parser = factory.createParser(StandardCharsets.UTF_8.encode(input).array()); + assertThat(parser, Matchers.instanceOf(ESUTF8StreamJsonParser.class)); + test.accept((ESUTF8StreamJsonParser) parser); + } + + private void assertTextRef(XContentString.UTF8Bytes textRef, String expectedValue) { + assertThat(textRef, Matchers.equalTo(new XContentString.UTF8Bytes(expectedValue.getBytes(StandardCharsets.UTF_8)))); + } + + public void testGetValueAsText() throws IOException { + testParseJson("{\"foo\": \"bar\"}", parser -> { + assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT)); + assertThat(parser.nextFieldName(), Matchers.equalTo("foo")); + assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING)); + + var textRef = parser.getValueAsText().bytes(); + assertThat(textRef, Matchers.notNullValue()); + assertThat(textRef.offset(), Matchers.equalTo(9)); + assertThat(textRef.offset() + textRef.length(), Matchers.equalTo(12)); + assertTextRef(textRef, "bar"); + + assertThat(parser.getValueAsString(), Matchers.equalTo("bar")); + assertThat(parser.getValueAsText(), 
Matchers.nullValue()); + + assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.END_OBJECT)); + }); + + testParseJson("{\"foo\": \"bar\\\"baz\\\"\"}", parser -> { + assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT)); + assertThat(parser.nextFieldName(), Matchers.equalTo("foo")); + assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING)); + + assertThat(parser.getValueAsText(), Matchers.nullValue()); + assertThat(parser.getValueAsString(), Matchers.equalTo("bar\"baz\"")); + }); + + testParseJson("{\"foo\": \"bår\"}", parser -> { + assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT)); + assertThat(parser.nextFieldName(), Matchers.equalTo("foo")); + assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING)); + + assertThat(parser.getValueAsText(), Matchers.nullValue()); + assertThat(parser.getValueAsString(), Matchers.equalTo("bår")); + }); + + testParseJson("{\"foo\": [\"lorem\", \"ipsum\", \"dolor\"]}", parser -> { + assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT)); + assertThat(parser.nextFieldName(), Matchers.equalTo("foo")); + assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.START_ARRAY)); + + assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING)); + { + var textRef = parser.getValueAsText().bytes(); + assertThat(textRef, Matchers.notNullValue()); + assertThat(textRef.offset(), Matchers.equalTo(10)); + assertThat(textRef.offset() + textRef.length(), Matchers.equalTo(15)); + assertTextRef(textRef, "lorem"); + } + + assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING)); + { + var textRef = parser.getValueAsText().bytes(); + assertThat(textRef, Matchers.notNullValue()); + assertThat(textRef.offset(), Matchers.equalTo(19)); + assertThat(textRef.offset() + textRef.length(), Matchers.equalTo(24)); + assertTextRef(textRef, "ipsum"); + } + + assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING)); + { + var 
textRef = parser.getValueAsText().bytes(); + assertThat(textRef, Matchers.notNullValue()); + assertThat(textRef.offset(), Matchers.equalTo(28)); + assertThat(textRef.offset() + textRef.length(), Matchers.equalTo(33)); + assertTextRef(textRef, "dolor"); + } + + assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.END_ARRAY)); + assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.END_OBJECT)); + }); + } + + private boolean validForTextRef(String value) { + for (char c : value.toCharArray()) { + if (c == '"') { + return false; + } + if (c == '\\') { + return false; + } + if ((int) c < 32 || (int) c >= 128) { + return false; + } + } + return true; + } + + public void testGetValueRandomized() throws IOException { + XContentBuilder jsonBuilder = JsonXContent.contentBuilder().startObject(); + final int numKeys = 128; + String[] keys = new String[numKeys]; + String[] values = new String[numKeys]; + for (int i = 0; i < numKeys; i++) { + String currKey = randomAlphanumericOfLength(6); + String currVal = randomUnicodeOfLengthBetween(0, 512); + jsonBuilder.field(currKey, currVal); + keys[i] = currKey; + values[i] = currVal; + } + + jsonBuilder.endObject(); + testParseJson(Strings.toString(jsonBuilder), parser -> { + assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT)); + for (int i = 0; i < numKeys; i++) { + assertThat(parser.nextFieldName(), Matchers.equalTo(keys[i])); + assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING)); + + String currVal = values[i]; + if (validForTextRef(currVal)) { + assertTextRef(parser.getValueAsText().bytes(), currVal); + } else { + assertThat(parser.getValueAsText(), Matchers.nullValue()); + assertThat(parser.getValueAsString(), Matchers.equalTo(currVal)); + } + } + }); + } + +} diff --git a/libs/x-content/src/main/java/org/elasticsearch/xcontent/FilterXContentParser.java b/libs/x-content/src/main/java/org/elasticsearch/xcontent/FilterXContentParser.java index a9616a6702aa0..4ff1185dc560c 100644 
--- a/libs/x-content/src/main/java/org/elasticsearch/xcontent/FilterXContentParser.java +++ b/libs/x-content/src/main/java/org/elasticsearch/xcontent/FilterXContentParser.java @@ -100,6 +100,14 @@ public String textOrNull() throws IOException { return delegate().textOrNull(); } + public XContentString optimizedText() throws IOException { + return delegate().optimizedText(); + } + + public XContentString optimizedTextOrNull() throws IOException { + return delegate().optimizedTextOrNull(); + } + @Override public CharBuffer charBufferOrNull() throws IOException { return delegate().charBufferOrNull(); diff --git a/libs/x-content/src/main/java/org/elasticsearch/xcontent/XContentParser.java b/libs/x-content/src/main/java/org/elasticsearch/xcontent/XContentParser.java index 554043c2547cb..baee8922fd35b 100644 --- a/libs/x-content/src/main/java/org/elasticsearch/xcontent/XContentParser.java +++ b/libs/x-content/src/main/java/org/elasticsearch/xcontent/XContentParser.java @@ -109,6 +109,10 @@ Map map(Supplier> mapFactory, CheckedFunction { @@ -23,7 +24,7 @@ public int compareTo(UTF8Bytes o) { return 0; } - return ByteBuffer.wrap(bytes, offset, length).compareTo(ByteBuffer.wrap(o.bytes, o.offset, o.length)); + return Arrays.compareUnsigned(bytes, offset, offset + length, o.bytes, o.offset, o.offset + o.length); } @Override diff --git a/libs/x-content/src/main/java/org/elasticsearch/xcontent/support/AbstractXContentParser.java b/libs/x-content/src/main/java/org/elasticsearch/xcontent/support/AbstractXContentParser.java index 909cf808d1f34..351bfdc4b7d49 100644 --- a/libs/x-content/src/main/java/org/elasticsearch/xcontent/support/AbstractXContentParser.java +++ b/libs/x-content/src/main/java/org/elasticsearch/xcontent/support/AbstractXContentParser.java @@ -15,8 +15,10 @@ import org.elasticsearch.core.RestApiVersion; import org.elasticsearch.xcontent.DeprecationHandler; import org.elasticsearch.xcontent.NamedXContentRegistry; +import org.elasticsearch.xcontent.Text; import 
org.elasticsearch.xcontent.XContentParseException; import org.elasticsearch.xcontent.XContentParser; +import org.elasticsearch.xcontent.XContentString; import java.io.IOException; import java.math.BigDecimal; @@ -258,6 +260,19 @@ public final String textOrNull() throws IOException { return text(); } + @Override + public XContentString optimizedText() throws IOException { + return new Text(text()); + } + + @Override + public final XContentString optimizedTextOrNull() throws IOException { + if (currentToken() == Token.VALUE_NULL) { + return null; + } + return optimizedText(); + } + @Override public CharBuffer charBufferOrNull() throws IOException { if (currentToken() == Token.VALUE_NULL) { diff --git a/libs/x-content/src/test/java/org/elasticsearch/xcontent/TextTests.java b/libs/x-content/src/test/java/org/elasticsearch/xcontent/TextTests.java index 150fe1627c42c..4e637ff0248f5 100644 --- a/libs/x-content/src/test/java/org/elasticsearch/xcontent/TextTests.java +++ b/libs/x-content/src/test/java/org/elasticsearch/xcontent/TextTests.java @@ -12,6 +12,7 @@ import org.elasticsearch.test.ESTestCase; import java.nio.charset.StandardCharsets; +import java.util.Arrays; public class TextTests extends ESTestCase { public void testConvertToBytes() { @@ -136,7 +137,8 @@ public void testCompareTo() { byte[] encodedArr2 = value2.getBytes(StandardCharsets.UTF_8); var encoded2 = new XContentString.UTF8Bytes(encodedArr2); - int compSign = (int) Math.signum(encoded1.compareTo(encoded2)); + // String.compareTo() wasn't handling surrogate pairs very well, so here we compare the full 32-bit codepoints + int compSign = (int) Math.signum(Arrays.compare(value1.codePoints().toArray(), value2.codePoints().toArray())); { var text1 = new Text(value1); diff --git a/server/src/main/java/org/elasticsearch/index/mapper/CompletionFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/CompletionFieldMapper.java index af691c61abe2e..adcb946d7afc4 100644 --- 
a/server/src/main/java/org/elasticsearch/index/mapper/CompletionFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/CompletionFieldMapper.java @@ -32,11 +32,13 @@ import org.elasticsearch.xcontent.DeprecationHandler; import org.elasticsearch.xcontent.FilterXContentParser; import org.elasticsearch.xcontent.NamedXContentRegistry; +import org.elasticsearch.xcontent.Text; import org.elasticsearch.xcontent.ToXContent; import org.elasticsearch.xcontent.XContentLocation; import org.elasticsearch.xcontent.XContentParser; import org.elasticsearch.xcontent.XContentParser.NumberType; import org.elasticsearch.xcontent.XContentParser.Token; +import org.elasticsearch.xcontent.XContentString; import org.elasticsearch.xcontent.XContentType; import org.elasticsearch.xcontent.support.MapXContentParser; @@ -707,6 +709,14 @@ public Token currentToken() { return super.currentToken(); } + @Override + public XContentString optimizedTextOrNull() throws IOException { + if (parsingObject == false) { + return new Text(textValue); + } + return super.optimizedTextOrNull(); + } + @Override public String textOrNull() throws IOException { if (parsingObject == false) { diff --git a/server/src/main/java/org/elasticsearch/index/mapper/DotExpandingXContentParser.java b/server/src/main/java/org/elasticsearch/index/mapper/DotExpandingXContentParser.java index 42784e0974417..46098baa0676a 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/DotExpandingXContentParser.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/DotExpandingXContentParser.java @@ -14,6 +14,7 @@ import org.elasticsearch.xcontent.FilterXContentParserWrapper; import org.elasticsearch.xcontent.XContentLocation; import org.elasticsearch.xcontent.XContentParser; +import org.elasticsearch.xcontent.XContentString; import org.elasticsearch.xcontent.XContentSubParser; import java.io.IOException; @@ -379,6 +380,14 @@ public void skipChildren() throws IOException { } } + @Override + public 
XContentString optimizedTextOrNull() throws IOException { + if (state == State.EXPANDING_START_OBJECT) { + throw new IllegalStateException("Can't get text on a " + currentToken() + " at " + getTokenLocation()); + } + return super.optimizedTextOrNull(); + } + @Override public String textOrNull() throws IOException { if (state == State.EXPANDING_START_OBJECT) { diff --git a/server/src/main/java/org/elasticsearch/index/mapper/GeoPointFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/GeoPointFieldMapper.java index 764f0c4cc5031..be5c4b82bf1f2 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/GeoPointFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/GeoPointFieldMapper.java @@ -54,9 +54,11 @@ import org.elasticsearch.search.runtime.GeoPointScriptFieldDistanceFeatureQuery; import org.elasticsearch.xcontent.CopyingXContentParser; import org.elasticsearch.xcontent.FilterXContentParserWrapper; +import org.elasticsearch.xcontent.Text; import org.elasticsearch.xcontent.ToXContent; import org.elasticsearch.xcontent.XContentBuilder; import org.elasticsearch.xcontent.XContentParser; +import org.elasticsearch.xcontent.XContentString; import java.io.IOException; import java.io.UncheckedIOException; @@ -327,6 +329,11 @@ static class GeoHashMultiFieldParser extends FilterXContentParserWrapper { this.value = value; } + @Override + public XContentString optimizedTextOrNull() throws IOException { + return new Text(value); + } + @Override public String textOrNull() throws IOException { return value; diff --git a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java index 840f4a3726109..f8af7043b13dd 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java @@ -66,8 +66,10 @@ import 
org.elasticsearch.search.runtime.StringScriptFieldRegexpQuery;
 import org.elasticsearch.search.runtime.StringScriptFieldTermQuery;
 import org.elasticsearch.search.runtime.StringScriptFieldWildcardQuery;
+import org.elasticsearch.xcontent.Text;
 import org.elasticsearch.xcontent.XContentBuilder;
 import org.elasticsearch.xcontent.XContentParser;
+import org.elasticsearch.xcontent.XContentString;
 
 import java.io.IOException;
 import java.io.UncheckedIOException;
@@ -1106,15 +1108,16 @@ public String getOffsetFieldName() {
     }
 
     protected void parseCreateField(DocumentParserContext context) throws IOException {
-        String value = context.parser().textOrNull();
-        if (value == null) {
-            value = fieldType().nullValue;
+        var value = context.parser().optimizedTextOrNull();
+
+        if (value == null && fieldType().nullValue != null) {
+            value = new Text(fieldType().nullValue);
         }
 
         boolean indexed = indexValue(context, value);
         if (offsetsFieldName != null && context.isImmediateParentAnArray() && context.canAddIgnoredField()) {
             if (indexed) {
-                context.getOffSetContext().recordOffset(offsetsFieldName, value);
+                context.getOffSetContext().recordOffset(offsetsFieldName, value.bytes());
             } else if (value == null) {
                 context.getOffSetContext().recordNull(offsetsFieldName);
             }
@@ -1132,28 +1135,60 @@ protected void indexScriptValues(
     }
 
+    /**
+     * Indexes a value that is only available as a {@link String}: it is wrapped in a {@link Text} and handed to
+     * {@link #indexValue(DocumentParserContext, XContentString)}.
+     *
+     * @param context the document parser context passed through to the delegate
+     * @param value   the value to index, may be {@code null}
+     * @return {@code false} when {@code value} is {@code null}, otherwise whatever the delegate returns
+     */
     private boolean indexValue(DocumentParserContext context, String value) {
+        // Keep null as null: a Text wrapped around a null string is a non-null reference and would get past the
+        // null guard of the XContentString overload.
+        XContentString text = value == null ? null : new Text(value);
+        return indexValue(context, text);
+    }
+
+    /**
+     * Indexes a keyword value supplied as an {@link XContentString}.
+     * <p>
+     * The value is skipped and {@code false} is returned when it is {@code null}, when the field's index options are
+     * {@code NONE} and it is neither stored nor has doc values, or when its string length exceeds {@code ignoreAbove}.
+     * In the last case the field is added to the ignored fields and, if {@code isSyntheticSource} is set, the value's
+     * bytes are kept in a stored field named {@code originalName()}.
+     *
+     * @param context the document parser context that receives the fields
+     * @param value   the value to index, may be {@code null}
+     * @return {@code false} if the value was skipped for one of the reasons above
+     */
+    private boolean indexValue(DocumentParserContext context, XContentString value) {
         if (value == null) {
             return false;
         }
+
         // if field is disabled, skip indexing
         if ((fieldType.indexOptions() == IndexOptions.NONE) && (fieldType.stored() == false) && (fieldType().hasDocValues() == false)) {
             return false;
         }
 
-        if (value.length() > fieldType().ignoreAbove()) {
+        if (value.stringLength() > fieldType().ignoreAbove()) {
             context.addIgnoredField(fullPath());
             if (isSyntheticSource) {
                 // Save a copy of the field so synthetic source can load it
-                context.doc().add(new StoredField(originalName(), new BytesRef(value)));
+                var utfBytes = value.bytes();
+                var bytesRef = new BytesRef(utfBytes.bytes(), utfBytes.offset(), utfBytes.length());
+                context.doc().add(new StoredField(originalName(), bytesRef));
             }
             return false;
         }
 
-        value = normalizeValue(fieldType().normalizer(), fullPath(), value);
-
-        // convert to utf8 only once before feeding postings/dv/stored fields
-        final BytesRef binaryValue = new BytesRef(value);
+        if (fieldType().normalizer() != Lucene.KEYWORD_ANALYZER) {
+            String normalizedString = normalizeValue(fieldType().normalizer(), fullPath(), value.string());
+            value = new Text(normalizedString);
+        }
 
+        var utfBytes = value.bytes();
+        var binaryValue = new BytesRef(utfBytes.bytes(), utfBytes.offset(), utfBytes.length());
         if (fieldType().isDimension()) {
             context.getRoutingFields().addString(fieldType().name(), binaryValue);
         }
diff --git a/server/src/test/java/org/elasticsearch/common/xcontent/BaseXContentTestCase.java b/server/src/test/java/org/elasticsearch/common/xcontent/BaseXContentTestCase.java
index 979224ba8f9b7..0392f2a730cd2 100644
--- a/server/src/test/java/org/elasticsearch/common/xcontent/BaseXContentTestCase.java
+++ b/server/src/test/java/org/elasticsearch/common/xcontent/BaseXContentTestCase.java
@@ -395,6 +395,22 @@ public void testText() throws Exception {
         }
     }
 
+    public void testOptimizedText() throws Exception {
+        final var random = new XContentString.UTF8Bytes(randomBytes());
+        XContentBuilder builder = builder().startObject().field("text", new Text(random)).endObject();
+
+        try (XContentParser parser = createParser(xcontentType().xContent(), BytesReference.bytes(builder))) {
+            assertSame(Token.START_OBJECT, parser.nextToken());
+            assertSame(Token.FIELD_NAME, parser.nextToken());
+            assertEquals("text", parser.currentName());
+            assertTrue(parser.nextToken().isValue());
+            var valueRef = parser.optimizedText().bytes();
+            assertThat(valueRef, equalTo(random));
+            assertSame(Token.END_OBJECT,
parser.nextToken());
+            assertNull(parser.nextToken()); // nothing may follow the closing END_OBJECT
+        }
+    }
+
     public void testDate() throws Exception {
         assertResult("{'date':null}", () -> builder().startObject().timestampField("date", (Date) null).endObject());
         assertResult("{'date':null}", () -> builder().startObject().field("date").timestampValue((Date) null).endObject());
diff --git a/test/yaml-rest-runner/src/main/java/org/elasticsearch/test/rest/yaml/ParameterizableYamlXContentParser.java b/test/yaml-rest-runner/src/main/java/org/elasticsearch/test/rest/yaml/ParameterizableYamlXContentParser.java
index c50115d53e629..786953b505f9e 100644
--- a/test/yaml-rest-runner/src/main/java/org/elasticsearch/test/rest/yaml/ParameterizableYamlXContentParser.java
+++ b/test/yaml-rest-runner/src/main/java/org/elasticsearch/test/rest/yaml/ParameterizableYamlXContentParser.java
@@ -16,6 +16,7 @@
 import org.elasticsearch.xcontent.NamedXContentRegistry;
 import org.elasticsearch.xcontent.XContentLocation;
 import org.elasticsearch.xcontent.XContentParser;
+import org.elasticsearch.xcontent.XContentString;
 import org.elasticsearch.xcontent.XContentType;
 
 import java.io.IOException;
@@ -143,6 +144,16 @@ public String textOrNull() throws IOException {
         return delegate.textOrNull();
     }
 
+    @Override
+    public XContentString optimizedText() throws IOException {
+        return delegate.optimizedText(); // plain pass-through to the delegate, same shape as textOrNull() above
+    }
+
+    @Override
+    public XContentString optimizedTextOrNull() throws IOException {
+        return delegate.optimizedTextOrNull(); // plain pass-through; whatever the delegate returns is handed back unchanged
+    }
+
     @Override
     public CharBuffer charBufferOrNull() throws IOException {
         return delegate.charBufferOrNull();