diff --git a/docs/changelog/126739.yaml b/docs/changelog/126739.yaml new file mode 100644 index 0000000000000..ff61327f9878e --- /dev/null +++ b/docs/changelog/126739.yaml @@ -0,0 +1,6 @@ +pr: 126739 +summary: Update `sparse_vector` field mapping to include default setting for token + pruning +area: Mapping +type: enhancement +issues: [] diff --git a/docs/reference/elasticsearch/mapping-reference/sparse-vector.md b/docs/reference/elasticsearch/mapping-reference/sparse-vector.md index f8bfe69310600..dc9af7e642b13 100644 --- a/docs/reference/elasticsearch/mapping-reference/sparse-vector.md +++ b/docs/reference/elasticsearch/mapping-reference/sparse-vector.md @@ -24,6 +24,28 @@ PUT my-index } ``` +Also, with optional `index_options` for pruning: + +```console +PUT my-index +{ + "mappings": { + "properties": { + "text.tokens": { + "type": "sparse_vector", + "index_options": { + "prune": true, + "pruning_config": { + "tokens_freq_ratio_threshold": 5, + "tokens_weight_threshold": 0.4 + } + } + } + } + } +} +``` + See [semantic search with ELSER](docs-content://solutions/search/semantic-search/semantic-search-elser-ingest-pipelines.md) for a complete example on adding documents to a `sparse_vector` mapped field using ELSER. ## Parameters for `sparse_vector` fields [sparse-vectors-params] @@ -36,6 +58,28 @@ The following parameters are accepted by `sparse_vector` fields: * Exclude the field from [_source](/reference/elasticsearch/rest-apis/retrieve-selected-fields.md#source-filtering). * Use [synthetic `_source`](/reference/elasticsearch/mapping-reference/mapping-source-field.md#synthetic-source). +index_options +: (Optional, object) You can set index options for your `sparse_vector` field to determine whether tokens are pruned and to configure the parameters used for token pruning. If pruning options are not set in your `sparse_vector` query, Elasticsearch will use the default options configured for the field, if any. 
The available options for the index options are: + +Parameters for `index_options` are: + +`prune` {applies_to}`stack: preview 9.1` +: (Optional, boolean) Whether to perform pruning, omitting the non-significant tokens from the query to improve query performance. If `prune` is true but the `pruning_config` is not specified, pruning will occur but default values will be used. Default: true. + +`pruning_config` {applies_to}`stack: preview 9.1` +: (Optional, object) Optional pruning configuration. If enabled, this will omit non-significant tokens from the query in order to improve query performance. This is only used if `prune` is set to `true`. If `prune` is set to `true` but `pruning_config` is not specified, default values will be used. If `pruning_config` is specified but `prune` is set to `false` or is not set, an exception will occur. + + Parameters for `pruning_config` include: + + `tokens_freq_ratio_threshold` {applies_to}`stack: preview 9.1` + : (Optional, integer) Tokens whose frequency is more than `tokens_freq_ratio_threshold` times the average frequency of all tokens in the specified field are considered outliers and pruned. This value must be between 1 and 100. Default: `5`. + + `tokens_weight_threshold` {applies_to}`stack: preview 9.1` + : (Optional, float) Tokens whose weight is less than `tokens_weight_threshold` are considered insignificant and pruned. This value must be between 0 and 1. Default: `0.4`. + + ::::{note} + The default values for `tokens_freq_ratio_threshold` and `tokens_weight_threshold` were chosen based on tests using ELSERv2 that provided the optimal results. 
+ :::: ## Multi-value sparse vectors [index-multi-value-sparse-vectors] diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.stats/10_basic.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.stats/10_basic.yml index 7a952190205c5..c14026257bdd9 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.stats/10_basic.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.stats/10_basic.yml @@ -1,3 +1,15 @@ + +--- +teardown: + # ensure indices are cleaned up after each test + # mainly for the sparse vector tests + - do: + indices.delete: + index: ["test1", "test2"] + ignore: 404 + - do: + indices.refresh: { } + --- "cluster stats test": - do: @@ -358,6 +370,7 @@ - requires: cluster_features: [ "gte_v8.15.0" ] reason: "sparse vector stats added in 8.15" + - do: indices.create: index: test1 diff --git a/server/src/main/java/org/elasticsearch/TransportVersions.java b/server/src/main/java/org/elasticsearch/TransportVersions.java index d6f405959f2b7..d9c15a9fbc95a 100644 --- a/server/src/main/java/org/elasticsearch/TransportVersions.java +++ b/server/src/main/java/org/elasticsearch/TransportVersions.java @@ -190,6 +190,7 @@ static TransportVersion def(int id) { public static final TransportVersion JOIN_ON_ALIASES_8_19 = def(8_841_0_42); public static final TransportVersion ILM_ADD_SKIP_SETTING_8_19 = def(8_841_0_43); public static final TransportVersion ML_INFERENCE_MISTRAL_CHAT_COMPLETION_ADDED_8_19 = def(8_841_0_44); + public static final TransportVersion SPARSE_VECTOR_FIELD_PRUNING_OPTIONS_8_19 = def(8_841_0_45); public static final TransportVersion V_9_0_0 = def(9_000_0_09); public static final TransportVersion INITIAL_ELASTICSEARCH_9_0_1 = def(9_000_0_10); public static final TransportVersion INITIAL_ELASTICSEARCH_9_0_2 = def(9_000_0_11); @@ -284,6 +285,7 @@ static TransportVersion def(int id) { public static final TransportVersion JOIN_ON_ALIASES = def(9_088_0_00); 
public static final TransportVersion ILM_ADD_SKIP_SETTING = def(9_089_0_00); public static final TransportVersion ML_INFERENCE_MISTRAL_CHAT_COMPLETION_ADDED = def(9_090_0_00); + public static final TransportVersion SPARSE_VECTOR_FIELD_PRUNING_OPTIONS = def(9_091_0_00); /* * STOP! READ THIS FIRST! No, really, diff --git a/server/src/main/java/org/elasticsearch/index/IndexVersions.java b/server/src/main/java/org/elasticsearch/index/IndexVersions.java index f32d4d7a2a302..b3f92e67c25ea 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexVersions.java +++ b/server/src/main/java/org/elasticsearch/index/IndexVersions.java @@ -142,6 +142,7 @@ private static Version parseUnchecked(String version) { public static final IndexVersion DEFAULT_OVERSAMPLE_VALUE_FOR_BBQ_BACKPORT_8_X = def(8_530_0_00, Version.LUCENE_9_12_1); public static final IndexVersion SEMANTIC_TEXT_DEFAULTS_TO_BBQ_BACKPORT_8_X = def(8_531_0_00, Version.LUCENE_9_12_1); public static final IndexVersion INDEX_INT_SORT_INT_TYPE_8_19 = def(8_532_0_00, Version.LUCENE_9_12_1); + public static final IndexVersion SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT_BACKPORT_8_X = def(8_533_0_00, Version.LUCENE_9_12_1); public static final IndexVersion UPGRADE_TO_LUCENE_10_0_0 = def(9_000_0_00, Version.LUCENE_10_0_0); public static final IndexVersion LOGSDB_DEFAULT_IGNORE_DYNAMIC_BEYOND_LIMIT = def(9_001_0_00, Version.LUCENE_10_0_0); public static final IndexVersion TIME_BASED_K_ORDERED_DOC_ID = def(9_002_0_00, Version.LUCENE_10_0_0); @@ -171,6 +172,7 @@ private static Version parseUnchecked(String version) { public static final IndexVersion DEFAULT_TO_ACORN_HNSW_FILTER_HEURISTIC = def(9_026_0_00, Version.LUCENE_10_2_1); public static final IndexVersion SEQ_NO_WITHOUT_POINTS = def(9_027_0_00, Version.LUCENE_10_2_1); public static final IndexVersion INDEX_INT_SORT_INT_TYPE = def(9_028_0_00, Version.LUCENE_10_2_1); + public static final IndexVersion SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT = def(9_029_0_00, 
Version.LUCENE_10_2_1); /* * STOP! READ THIS FIRST! No, really, diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index fdbb03f51743e..40ae0170c9ca0 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -22,6 +22,9 @@ import org.apache.lucene.util.BytesRef; import org.elasticsearch.common.logging.DeprecationCategory; import org.elasticsearch.common.lucene.Lucene; +import org.elasticsearch.common.xcontent.support.XContentMapValues; +import org.elasticsearch.core.Nullable; +import org.elasticsearch.features.NodeFeature; import org.elasticsearch.index.IndexVersion; import org.elasticsearch.index.IndexVersions; import org.elasticsearch.index.analysis.NamedAnalyzer; @@ -31,6 +34,7 @@ import org.elasticsearch.index.mapper.FieldMapper; import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.MapperBuilderContext; +import org.elasticsearch.index.mapper.MappingParserContext; import org.elasticsearch.index.mapper.SourceLoader; import org.elasticsearch.index.mapper.SourceValueFetcher; import org.elasticsearch.index.mapper.TextSearchInfo; @@ -38,17 +42,27 @@ import org.elasticsearch.index.query.SearchExecutionContext; import org.elasticsearch.search.fetch.StoredFieldsSpec; import org.elasticsearch.search.lookup.Source; +import org.elasticsearch.xcontent.ConstructingObjectParser; +import org.elasticsearch.xcontent.DeprecationHandler; +import org.elasticsearch.xcontent.NamedXContentRegistry; +import org.elasticsearch.xcontent.ParseField; +import org.elasticsearch.xcontent.ToXContent; import org.elasticsearch.xcontent.XContentBuilder; +import org.elasticsearch.xcontent.XContentParser; import org.elasticsearch.xcontent.XContentParser.Token; +import 
org.elasticsearch.xcontent.XContentType; +import org.elasticsearch.xcontent.support.MapXContentParser; import java.io.IOException; import java.io.UncheckedIOException; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.stream.Stream; import static org.elasticsearch.index.query.AbstractQueryBuilder.DEFAULT_BOOST; +import static org.elasticsearch.xcontent.ConstructingObjectParser.optionalConstructorArg; /** * A {@link FieldMapper} that exposes Lucene's {@link FeatureField} as a sparse @@ -57,6 +71,7 @@ public class SparseVectorFieldMapper extends FieldMapper { public static final String CONTENT_TYPE = "sparse_vector"; + public static final String SPARSE_VECTOR_INDEX_OPTIONS = "index_options"; static final String ERROR_MESSAGE_7X = "[sparse_vector] field type in old 7.x indices is allowed to " + "contain [sparse_vector] fields, but they cannot be indexed or searched."; @@ -66,6 +81,10 @@ public class SparseVectorFieldMapper extends FieldMapper { static final IndexVersion NEW_SPARSE_VECTOR_INDEX_VERSION = IndexVersions.NEW_SPARSE_VECTOR; static final IndexVersion SPARSE_VECTOR_IN_FIELD_NAMES_INDEX_VERSION = IndexVersions.SPARSE_VECTOR_IN_FIELD_NAMES_SUPPORT; + static final IndexVersion SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION = IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT; + + public static final NodeFeature SPARSE_VECTOR_INDEX_OPTIONS_FEATURE = new NodeFeature("sparse_vector.index_options_supported"); + private static SparseVectorFieldMapper toType(FieldMapper in) { return (SparseVectorFieldMapper) in; } @@ -73,6 +92,15 @@ private static SparseVectorFieldMapper toType(FieldMapper in) { public static class Builder extends FieldMapper.Builder { private final Parameter stored = Parameter.storeParam(m -> toType(m).fieldType().isStored(), false); private final Parameter> meta = Parameter.metaParam(); + private final Parameter indexOptions = new Parameter<>( + SPARSE_VECTOR_INDEX_OPTIONS, 
+ true, + () -> null, + (n, c, o) -> parseIndexOptions(c, o), + m -> toType(m).fieldType().indexOptions, + XContentBuilder::field, + Objects::toString + ).acceptsNull(); public Builder(String name) { super(name); @@ -85,19 +113,54 @@ public Builder setStored(boolean value) { @Override protected Parameter[] getParameters() { - return new Parameter[] { stored, meta }; + return new Parameter[] { stored, meta, indexOptions }; } @Override public SparseVectorFieldMapper build(MapperBuilderContext context) { return new SparseVectorFieldMapper( leafName(), - new SparseVectorFieldType(context.buildFullName(leafName()), stored.getValue(), meta.getValue()), + new SparseVectorFieldType(context.buildFullName(leafName()), stored.getValue(), meta.getValue(), indexOptions.getValue()), builderParams(this, context) ); } } + public IndexOptions getIndexOptions() { + return fieldType().getIndexOptions(); + } + + private static final ConstructingObjectParser INDEX_OPTIONS_PARSER = new ConstructingObjectParser<>( + SPARSE_VECTOR_INDEX_OPTIONS, + args -> new IndexOptions((Boolean) args[0], (TokenPruningConfig) args[1]) + ); + + static { + INDEX_OPTIONS_PARSER.declareBoolean(optionalConstructorArg(), IndexOptions.PRUNE_FIELD_NAME); + INDEX_OPTIONS_PARSER.declareObject(optionalConstructorArg(), TokenPruningConfig.PARSER, IndexOptions.PRUNING_CONFIG_FIELD_NAME); + } + + private static SparseVectorFieldMapper.IndexOptions parseIndexOptions(MappingParserContext context, Object propNode) { + if (propNode == null) { + return null; + } + + Map indexOptionsMap = XContentMapValues.nodeMapValue(propNode, SPARSE_VECTOR_INDEX_OPTIONS); + + XContentParser parser = new MapXContentParser( + NamedXContentRegistry.EMPTY, + DeprecationHandler.IGNORE_DEPRECATIONS, + indexOptionsMap, + XContentType.JSON + ); + + try { + return INDEX_OPTIONS_PARSER.parse(parser, null); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + public static final TypeParser PARSER = new TypeParser((n, c) -> { 
if (c.indexVersionCreated().before(PREVIOUS_SPARSE_VECTOR_INDEX_VERSION)) { deprecationLogger.warn(DeprecationCategory.MAPPINGS, "sparse_vector", ERROR_MESSAGE_7X); @@ -109,9 +172,24 @@ public SparseVectorFieldMapper build(MapperBuilderContext context) { }, notInMultiFields(CONTENT_TYPE)); public static final class SparseVectorFieldType extends MappedFieldType { + private final IndexOptions indexOptions; public SparseVectorFieldType(String name, boolean isStored, Map meta) { + this(name, isStored, meta, null); + } + + public SparseVectorFieldType( + String name, + boolean isStored, + Map meta, + @Nullable SparseVectorFieldMapper.IndexOptions indexOptions + ) { super(name, true, isStored, false, TextSearchInfo.SIMPLE_MATCH_ONLY, meta); + this.indexOptions = indexOptions; + } + + public IndexOptions getIndexOptions() { + return indexOptions; } @Override @@ -364,4 +442,70 @@ public void reset() { } } + public static class IndexOptions implements ToXContent { + public static final ParseField PRUNE_FIELD_NAME = new ParseField("prune"); + public static final ParseField PRUNING_CONFIG_FIELD_NAME = new ParseField("pruning_config"); + + final Boolean prune; + final TokenPruningConfig pruningConfig; + + IndexOptions(@Nullable Boolean prune, @Nullable TokenPruningConfig pruningConfig) { + if (pruningConfig != null && (prune == null || prune == false)) { + throw new IllegalArgumentException( + "[" + + SPARSE_VECTOR_INDEX_OPTIONS + + "] field [" + + PRUNING_CONFIG_FIELD_NAME.getPreferredName() + + "] should only be set if [" + + PRUNE_FIELD_NAME.getPreferredName() + + "] is set to true" + ); + } + + this.prune = prune; + this.pruningConfig = pruningConfig; + } + + public Boolean getPrune() { + return prune; + } + + public TokenPruningConfig getPruningConfig() { + return pruningConfig; + } + + @Override + public final boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other == null || getClass() != other.getClass()) { + return false; + } + + 
IndexOptions otherAsIndexOptions = (IndexOptions) other; + return Objects.equals(prune, otherAsIndexOptions.prune) && Objects.equals(pruningConfig, otherAsIndexOptions.pruningConfig); + } + + @Override + public final int hashCode() { + return Objects.hash(prune, pruningConfig); + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(); + + if (prune != null) { + builder.field(PRUNE_FIELD_NAME.getPreferredName(), prune); + } + if (pruningConfig != null) { + builder.field(PRUNING_CONFIG_FIELD_NAME.getPreferredName(), pruningConfig); + } + + builder.endObject(); + return builder; + } + } } diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/TokenPruningConfig.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java similarity index 74% rename from x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/TokenPruningConfig.java rename to server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java index 13358839830ed..58b32d9c08d7c 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/TokenPruningConfig.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java @@ -1,39 +1,54 @@ /* * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License - * 2.0; you may not use this file except in compliance with the Elastic License - * 2.0. + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
*/ -package org.elasticsearch.xpack.core.ml.search; +package org.elasticsearch.index.mapper.vectors; import org.elasticsearch.common.ParsingException; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.io.stream.Writeable; import org.elasticsearch.index.query.QueryBuilder; +import org.elasticsearch.xcontent.ConstructingObjectParser; +import org.elasticsearch.xcontent.DeprecationHandler; +import org.elasticsearch.xcontent.NamedXContentRegistry; import org.elasticsearch.xcontent.ParseField; import org.elasticsearch.xcontent.ToXContentObject; import org.elasticsearch.xcontent.XContentBuilder; import org.elasticsearch.xcontent.XContentParser; +import org.elasticsearch.xcontent.XContentType; +import org.elasticsearch.xcontent.support.MapXContentParser; import java.io.IOException; +import java.io.UncheckedIOException; import java.util.Locale; +import java.util.Map; import java.util.Objects; import java.util.Set; -import static org.elasticsearch.xpack.core.ml.search.WeightedTokensQueryBuilder.PRUNING_CONFIG; +import static org.elasticsearch.xcontent.ConstructingObjectParser.optionalConstructorArg; public class TokenPruningConfig implements Writeable, ToXContentObject { + public static final String PRUNING_CONFIG_FIELD = "pruning_config"; + public static final ParseField TOKENS_FREQ_RATIO_THRESHOLD = new ParseField("tokens_freq_ratio_threshold"); public static final ParseField TOKENS_WEIGHT_THRESHOLD = new ParseField("tokens_weight_threshold"); public static final ParseField ONLY_SCORE_PRUNED_TOKENS_FIELD = new ParseField("only_score_pruned_tokens"); // Tokens whose frequency is more than 5 times the average frequency of all tokens in the specified field are considered outliers. 
public static final float DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD = 5; + public static final float MIN_TOKENS_FREQ_RATIO_THRESHOLD = 1; public static final float MAX_TOKENS_FREQ_RATIO_THRESHOLD = 100; + // A token's weight should be > 40% of the best weight in the query to be considered significant. public static final float DEFAULT_TOKENS_WEIGHT_THRESHOLD = 0.4f; + public static final float MIN_TOKENS_WEIGHT_THRESHOLD = 0.0f; + public static final float MAX_TOKENS_WEIGHT_THRESHOLD = 1.0f; private final float tokensFreqRatioThreshold; private final float tokensWeightThreshold; @@ -150,7 +165,7 @@ public static TokenPruningConfig fromXContent(XContentParser parser) throws IOEx ).contains(currentFieldName) == false) { throw new ParsingException( parser.getTokenLocation(), - "[" + PRUNING_CONFIG.getPreferredName() + "] unknown token [" + currentFieldName + "]" + "[" + PRUNING_CONFIG_FIELD + "] unknown token [" + currentFieldName + "]" ); } } else if (token.isValue()) { @@ -163,16 +178,50 @@ public static TokenPruningConfig fromXContent(XContentParser parser) throws IOEx } else { throw new ParsingException( parser.getTokenLocation(), - "[" + PRUNING_CONFIG.getPreferredName() + "] does not support [" + currentFieldName + "]" + "[" + PRUNING_CONFIG_FIELD + "] does not support [" + currentFieldName + "]" ); } } else { throw new ParsingException( parser.getTokenLocation(), - "[" + PRUNING_CONFIG.getPreferredName() + "] unknown token [" + token + "] after [" + currentFieldName + "]" + "[" + PRUNING_CONFIG_FIELD + "] unknown token [" + token + "] after [" + currentFieldName + "]" ); } } return new TokenPruningConfig(ratioThreshold, weightThreshold, onlyScorePrunedTokens); } + + public static final ConstructingObjectParser PARSER = new ConstructingObjectParser<>( + PRUNING_CONFIG_FIELD, + args -> new TokenPruningConfig( + args[0] == null ? DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD : (Float) args[0], + args[1] == null ? 
DEFAULT_TOKENS_WEIGHT_THRESHOLD : (Float) args[1], + args[2] != null && (Boolean) args[2] + ) + ); + + static { + PARSER.declareFloat(optionalConstructorArg(), TOKENS_FREQ_RATIO_THRESHOLD); + PARSER.declareFloat(optionalConstructorArg(), TOKENS_WEIGHT_THRESHOLD); + PARSER.declareBoolean(optionalConstructorArg(), ONLY_SCORE_PRUNED_TOKENS_FIELD); + } + + public static TokenPruningConfig parseFromMap(Map pruningConfigMap) { + if (pruningConfigMap == null) { + return null; + } + + try { + XContentParser parser = new MapXContentParser( + NamedXContentRegistry.EMPTY, + DeprecationHandler.IGNORE_DEPRECATIONS, + pruningConfigMap, + XContentType.JSON + ); + + return PARSER.parse(parser, null); + } catch (IOException ioEx) { + throw new UncheckedIOException(ioEx); + } + } } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index b2379ba579204..a8986d5f89054 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -17,6 +17,7 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.RandomIndexWriter; +import org.elasticsearch.common.Randomness; import org.elasticsearch.common.Strings; import org.elasticsearch.common.compress.CompressedXContent; import org.elasticsearch.index.IndexVersion; @@ -31,6 +32,7 @@ import org.elasticsearch.search.lookup.Source; import org.elasticsearch.test.index.IndexVersionUtils; import org.elasticsearch.xcontent.XContentBuilder; +import org.elasticsearch.xcontent.XContentParseException; import org.hamcrest.Matchers; import org.junit.AssumptionViolatedException; @@ -40,8 +42,10 @@ import java.util.List; import java.util.Map; +import static java.lang.Math.random; import static 
org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper.NEW_SPARSE_VECTOR_INDEX_VERSION; import static org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper.PREVIOUS_SPARSE_VECTOR_INDEX_VERSION; +import static org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION; import static org.elasticsearch.xcontent.XContentFactory.jsonBuilder; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; @@ -67,6 +71,35 @@ protected void minimalMapping(XContentBuilder b) throws IOException { b.field("type", "sparse_vector"); } + protected void mappingWithDefaultIndexOptions(XContentBuilder b) throws IOException { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + b.field("prune", true); + b.startObject("pruning_config"); + b.field("tokens_freq_ratio_threshold", TokenPruningConfig.DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD); + b.field("tokens_weight_threshold", TokenPruningConfig.DEFAULT_TOKENS_WEIGHT_THRESHOLD); + b.endObject(); + b.endObject(); + } + + protected void mappingWithIndexOptionsPrune(XContentBuilder b) throws IOException { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + b.field("prune", true); + b.endObject(); + } + + protected void mappingWithIndexOptionsPruningConfig(XContentBuilder b) throws IOException { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + b.field("prune", true); + b.startObject("pruning_config"); + b.field("tokens_freq_ratio_threshold", 5.0); + b.field("tokens_weight_threshold", 0.4); + b.endObject(); + b.endObject(); + } + @Override protected boolean supportsStoredFields() { return false; @@ -98,26 +131,33 @@ public void testDefaults() throws Exception { DocumentMapper mapper = createDocumentMapper(fieldMapping(this::minimalMapping)); assertEquals(Strings.toString(fieldMapping(this::minimalMapping)), mapper.mappingSource().toString()); - ParsedDocument doc1 = 
mapper.parse(source(this::writeField)); + checkParsedDocument(mapper); + } - List fields = doc1.rootDoc().getFields("field"); - assertEquals(2, fields.size()); - assertThat(fields.get(0), Matchers.instanceOf(XFeatureField.class)); - XFeatureField featureField1 = null; - XFeatureField featureField2 = null; - for (IndexableField field : fields) { - if (field.stringValue().equals("ten")) { - featureField1 = (XFeatureField) field; - } else if (field.stringValue().equals("twenty")) { - featureField2 = (XFeatureField) field; - } else { - throw new UnsupportedOperationException(); - } - } + public void testDefaultsPreIndexOptions() throws Exception { + IndexVersion indexVersion = IndexVersionUtils.randomVersionBetween( + Randomness.get(), + NEW_SPARSE_VECTOR_INDEX_VERSION, + IndexVersionUtils.getPreviousVersion(SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION) + ); + DocumentMapper mapper = createDocumentMapper(indexVersion, fieldMapping(this::minimalMapping)); + assertEquals(Strings.toString(fieldMapping(this::minimalMapping)), mapper.mappingSource().toString()); - int freq1 = getFrequency(featureField1.tokenStream(null, null)); - int freq2 = getFrequency(featureField2.tokenStream(null, null)); - assertTrue(freq1 < freq2); + checkParsedDocument(mapper); + } + + public void testWithIndexOptionsPrune() throws Exception { + DocumentMapper mapper = createDocumentMapper(fieldMapping(this::mappingWithIndexOptionsPrune)); + assertEquals(Strings.toString(fieldMapping(this::mappingWithIndexOptionsPrune)), mapper.mappingSource().toString()); + + checkParsedDocument(mapper); + } + + public void testWithIndexOptionsPruningConfigOnly() throws Exception { + DocumentMapper mapper = createDocumentMapper(fieldMapping(this::mappingWithIndexOptionsPruningConfig)); + assertEquals(Strings.toString(fieldMapping(this::mappingWithIndexOptionsPruningConfig)), mapper.mappingSource().toString()); + + checkParsedDocument(mapper); } public void testDotInFieldName() throws Exception { @@ -163,7 +203,9 @@ 
public void testHandlesMultiValuedFields() throws MapperParsingException, IOExce // then fail appropriately assertEquals( - "[sparse_vector] fields take hashes that map a feature to a strictly positive float, but got unexpected token " + "START_ARRAY", + "[sparse_vector] fields take hashes that map a feature to a strictly positive float, " + + "but got unexpected token " + + "START_ARRAY", e.getCause().getMessage() ); @@ -202,6 +244,175 @@ public void testCannotBeUsedInMultiFields() { assertThat(e.getMessage(), containsString("Field [feature] of type [sparse_vector] can't be used in multifields")); } + /** A non-boolean string for index_options.prune must fail mapping creation with a MapperParsingException. */ public void testPruneMustBeBoolean() { + Exception e = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + b.field("prune", "othervalue"); + b.endObject(); + }))); + assertThat(e.getMessage(), containsString("[index_options] failed to parse field [prune]")); + assertThat(e.getCause().getCause(), instanceOf(UnsupportedOperationException.class)); + assertThat(e.getCause().getCause().getMessage(), containsString("use text() instead")); + } + + /** A string value for index_options.pruning_config must be rejected; the direct cause is an XContentParseException that names VALUE_STRING. */ public void testPruningConfigurationIsMap() { + Exception e = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + b.field("prune", true); + b.field("pruning_config", "this_is_not_a_map"); + b.endObject(); + }))); + assertThat(e.getMessage(), containsString("[index_options] pruning_config doesn't support values of type:")); + assertThat(e.getCause(), instanceOf(XContentParseException.class)); + assertThat( + e.getCause().getMessage(), + containsString("[index_options] pruning_config doesn't support values of type: VALUE_STRING") + ); + } + + /** pruning_config is only accepted together with prune=true; covers prune=false and prune absent. */ public void testWithIndexOptionsPruningConfigPruneRequired() throws Exception { + + Exception eTestPruneIsFalse = expectThrows(MapperParsingException.class, () ->
createMapperService(fieldMapping(b -> { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + /* case 1: prune is explicitly false while pruning_config is present */ b.field("prune", false); + b.startObject("pruning_config"); + b.field("tokens_freq_ratio_threshold", 5.0); + b.field("tokens_weight_threshold", 0.4); + b.endObject(); + b.endObject(); + }))); + assertThat(eTestPruneIsFalse.getMessage(), containsString("[index_options] failed to parse field [pruning_config]")); + assertThat(eTestPruneIsFalse.getCause().getCause().getCause(), instanceOf(IllegalArgumentException.class)); + assertThat( + eTestPruneIsFalse.getCause().getCause().getCause().getMessage(), + containsString("[index_options] field [pruning_config] should only be set if [prune] is set to true") + ); + + /* case 2: pruning_config is present but prune is omitted entirely */ Exception eTestPruneIsMissing = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + b.startObject("pruning_config"); + b.field("tokens_freq_ratio_threshold", 5.0); + b.field("tokens_weight_threshold", 0.4); + b.endObject(); + b.endObject(); + }))); + assertThat( + eTestPruneIsMissing.getMessage(), + containsString("Failed to parse mapping: Failed to build [index_options] after last required field arrived") + ); + assertThat(eTestPruneIsMissing.getCause().getCause(), instanceOf(IllegalArgumentException.class)); + assertThat( + eTestPruneIsMissing.getCause().getCause().getMessage(), + containsString("[index_options] field [pruning_config] should only be set if [prune] is set to true") + ); + } + + /** tokens_freq_ratio_threshold must be numeric and within [1, 100]; checks a non-numeric string, -2 and 101. */ public void testTokensFreqRatioCorrect() { + Exception eTestInteger = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + b.field("prune", true); + b.startObject("pruning_config"); + b.field("tokens_freq_ratio_threshold", "notaninteger"); + b.endObject(); + b.endObject(); + }))); + assertThat( + eTestInteger.getMessage(), + containsString("Failed
to parse mapping: [0:0] [index_options] failed to parse field [pruning_config]") + ); + assertThat(eTestInteger.getCause().getCause(), instanceOf(XContentParseException.class)); + assertThat( + eTestInteger.getCause().getCause().getMessage(), + containsString("[pruning_config] failed to parse field [tokens_freq_ratio_threshold]") + ); + assertThat(eTestInteger.getCause().getCause().getCause(), instanceOf(NumberFormatException.class)); + assertThat(eTestInteger.getCause().getCause().getCause().getMessage(), containsString("For input string: \"notaninteger\"")); + + /* value below the lower bound of 1 */ Exception eTestRangeLower = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + b.field("prune", true); + b.startObject("pruning_config"); + b.field("tokens_freq_ratio_threshold", -2); + b.endObject(); + b.endObject(); + }))); + assertThat(eTestRangeLower.getMessage(), containsString("[index_options] failed to parse field [pruning_config]")); + assertThat(eTestRangeLower.getCause().getCause().getCause(), instanceOf(IllegalArgumentException.class)); + assertThat( + eTestRangeLower.getCause().getCause().getCause().getMessage(), + containsString("[tokens_freq_ratio_threshold] must be between [1] and [100], got -2.0") + ); + + /* value above the upper bound of 100 */ Exception eTestRangeHigher = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + b.field("prune", true); + b.startObject("pruning_config"); + b.field("tokens_freq_ratio_threshold", 101); + b.endObject(); + b.endObject(); + }))); + assertThat(eTestRangeHigher.getMessage(), containsString("[index_options] failed to parse field [pruning_config]")); + assertThat(eTestRangeHigher.getCause().getCause().getCause(), instanceOf(IllegalArgumentException.class)); + assertThat( + eTestRangeHigher.getCause().getCause().getCause().getMessage(), +
containsString("[tokens_freq_ratio_threshold] must be between [1] and [100], got 101.0") + ); + } + + /** tokens_weight_threshold must be numeric and between 0 and 1; checks a non-numeric string, -0.1 and 1.1. */ public void testTokensWeightThresholdCorrect() { + Exception eTestDouble = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + b.field("prune", true); + b.startObject("pruning_config"); + b.field("tokens_weight_threshold", "notadouble"); + b.endObject(); + b.endObject(); + }))); + assertThat(eTestDouble.getMessage(), containsString("[index_options] failed to parse field [pruning_config]")); + assertThat(eTestDouble.getCause().getCause().getCause(), instanceOf(NumberFormatException.class)); + assertThat(eTestDouble.getCause().getCause().getCause().getMessage(), containsString("For input string: \"notadouble\"")); + + /* value below 0 */ Exception eTestRangeLower = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + b.field("prune", true); + b.startObject("pruning_config"); + b.field("tokens_weight_threshold", -0.1); + b.endObject(); + b.endObject(); + }))); + assertThat(eTestRangeLower.getMessage(), containsString("[index_options] failed to parse field [pruning_config]")); + assertThat(eTestRangeLower.getCause().getCause().getCause(), instanceOf(IllegalArgumentException.class)); + assertThat( + eTestRangeLower.getCause().getCause().getCause().getMessage(), + containsString("[tokens_weight_threshold] must be between 0 and 1") + ); + + /* value above 1 */ Exception eTestRangeHigher = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + b.field("prune", true); + b.startObject("pruning_config"); + b.field("tokens_weight_threshold", 1.1); + b.endObject(); + b.endObject(); + }))); + assertThat(eTestRangeHigher.getMessage(), containsString("[index_options] failed to parse field
[pruning_config]")); + assertThat(eTestRangeHigher.getCause().getCause().getCause(), instanceOf(IllegalArgumentException.class)); + assertThat( + eTestRangeHigher.getCause().getCause().getCause().getMessage(), + containsString("[tokens_weight_threshold] must be between 0 and 1") + ); + } + public void testStoreIsNotUpdateable() throws IOException { var mapperService = createMapperService(fieldMapping(this::minimalMapping)); XContentBuilder mapping = jsonBuilder().startObject() @@ -333,4 +544,27 @@ private Map toFloats(Map value) { } return result; } + + /** Parses the document written by writeField and asserts that "field" holds exactly two XFeatureField entries, "ten" and "twenty", with the frequency of "ten" lower than that of "twenty". Any other feature name throws UnsupportedOperationException. */ private void checkParsedDocument(DocumentMapper mapper) throws IOException { + ParsedDocument doc1 = mapper.parse(source(this::writeField)); + + List fields = doc1.rootDoc().getFields("field"); + assertEquals(2, fields.size()); + assertThat(fields.get(0), Matchers.instanceOf(XFeatureField.class)); + XFeatureField featureField1 = null; + XFeatureField featureField2 = null; + for (IndexableField field : fields) { + if (field.stringValue().equals("ten")) { + featureField1 = (XFeatureField) field; + } else if (field.stringValue().equals("twenty")) { + featureField2 = (XFeatureField) field; + } else { + throw new UnsupportedOperationException(); + } + } + + int freq1 = getFrequency(featureField1.tokenStream(null, null)); + int freq2 = getFrequency(featureField2.tokenStream(null, null)); + assertTrue(freq1 < freq2); + } } diff --git a/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java b/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java new file mode 100644 index 0000000000000..4160b1d5432a7 --- /dev/null +++ b/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java @@ -0,0 +1,257 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements.
Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.core.ml.search; + +import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; + +import org.elasticsearch.action.search.SearchRequest; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.common.Strings; +import org.elasticsearch.index.mapper.vectors.TokenPruningConfig; +import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.rest.RestStatus; +import org.elasticsearch.search.SearchHit; +import org.elasticsearch.search.builder.SearchSourceBuilder; +import org.elasticsearch.test.ESIntegTestCase; +import org.elasticsearch.xcontent.XContentBuilder; +import org.elasticsearch.xcontent.XContentFactory; +import org.elasticsearch.xpack.core.XPackClientPlugin; +import org.hamcrest.Matchers; +import org.junit.Before; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Map; + +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; + +/** Parameterised integration test: for each combination of (mapping has index_options, mapping prunes, query disables pruning) it runs a sparse_vector query and checks which document ids come back. */ public class SparseVectorIndexOptionsIT extends ESIntegTestCase { + private static final String TEST_INDEX_NAME = "index_with_sparse_vector"; + private static final String SPARSE_VECTOR_FIELD = "sparse_vector_field"; + private static final int TEST_PRUNING_TOKENS_FREQ_THRESHOLD = 1; + private static final float TEST_PRUNING_TOKENS_WEIGHT_THRESHOLD = 1.0f; + + private final boolean testHasIndexOptions; + private final boolean testIndexShouldPrune; + private final boolean testQueryShouldNotPrune; + /* only ever true when the mapping has index_options with pruning enabled; then chosen at random */ private final boolean overrideQueryPruningConfig; + + public SparseVectorIndexOptionsIT(boolean setIndexOptions, boolean setIndexShouldPrune, boolean setQueryShouldNotPrune) { + this.testHasIndexOptions = setIndexOptions; + this.testIndexShouldPrune = setIndexShouldPrune; + this.testQueryShouldNotPrune = setQueryShouldNotPrune; +
this.overrideQueryPruningConfig = (testHasIndexOptions && testIndexShouldPrune) && randomBoolean(); + } + + /** Supplies all 8 combinations of the three boolean constructor arguments. */ @ParametersFactory + public static Iterable parameters() throws Exception { + List params = new ArrayList<>(); + // create a matrix of all combinations + // of our three parameters + for (int i = 0; i < 8; i++) { + params.add(new Object[] { (i & 1) == 0, (i & 2) == 0, (i & 4) == 0 }); + } + return params; + } + + @Override + protected Collection> nodePlugins() { + return List.of(XPackClientPlugin.class); + } + + /** Creates the test index with the mapping for this parameter combination, indexes every entry of TEST_DOCUMENTS, then flushes and refreshes. */ @Before + public void setup() { + assertAcked(prepareCreate(TEST_INDEX_NAME).setMapping(getTestIndexMapping())); + ensureGreen(TEST_INDEX_NAME); + + for (Map.Entry doc : TEST_DOCUMENTS.entrySet()) { + index(TEST_INDEX_NAME, doc.getKey(), doc.getValue()); + } + flushAndRefresh(TEST_INDEX_NAME); + } + + /** Runs the search inside assertBusy and always releases the response reference, even when an assertion fails. */ public void testSparseVectorTokenPruning() throws Exception { + assertBusy(() -> { + SearchResponse response = performSearch(getBuilderForSearch()); + try { + assertThat(response.status(), Matchers.equalTo(RestStatus.OK)); + assertCorrectResponse(response); + } finally { + response.decRef(); + } + }); + } + + /** Asserts that the total hit count and the ordered list of hit ids equal the ids expected for this parameter combination. */ private void assertCorrectResponse(SearchResponse response) { + List expectedIds = getTestExpectedDocIds(); + + assertEquals( + getAssertMessage("Search result total hits count mismatch"), + expectedIds.size(), + response.getHits().getTotalHits().value() + ); + + List actualDocIds = new ArrayList<>(); + for (SearchHit doc : response.getHits().getHits()) { + actualDocIds.add(doc.getId()); + } + + assertEquals(getAssertMessage("Result document ids mismatch"), expectedIds, actualDocIds); + } + + /** Builds the JSON mapping for the test index: one sparse_vector field, plus index_options when the parameters ask for them. */ private String getTestIndexMapping() { + try { + XContentBuilder docBuilder = XContentFactory.jsonBuilder(); + docBuilder.startObject(); + { + docBuilder.startObject("properties"); + { + docBuilder.startObject(SPARSE_VECTOR_FIELD); + { + docBuilder.field("type", "sparse_vector"); + addIndexFieldIndexOptions(docBuilder); + } + docBuilder.endObject(); + } +
docBuilder.endObject(); + } + docBuilder.endObject(); + return Strings.toString(docBuilder); + } catch (IOException e) { + throw new AssertionError(e); + } + } + + /** Writes the index_options object only when the test has index options: prune=true together with a pruning_config using the test thresholds, or just prune=false. */ private void addIndexFieldIndexOptions(XContentBuilder docBuilder) throws IOException { + if (testHasIndexOptions == false) { + return; + } + + docBuilder.startObject("index_options"); + if (testIndexShouldPrune) { + docBuilder.field("prune", true); + docBuilder.startObject("pruning_config"); + { + docBuilder.field("tokens_freq_ratio_threshold", TEST_PRUNING_TOKENS_FREQ_THRESHOLD); + docBuilder.field("tokens_weight_threshold", TEST_PRUNING_TOKENS_WEIGHT_THRESHOLD); + } + docBuilder.endObject(); + } else { + docBuilder.field("prune", false); + } + docBuilder.endObject(); + } + + /** Picks the expected hit ids; precedence is query-level config override, then query-level prune=false, then the mapping's index_options, then the default. */ private List getTestExpectedDocIds() { + if (overrideQueryPruningConfig) { + return EXPECTED_DOC_IDS_WITH_QUERY_OVERRIDE; + } + + if (testQueryShouldNotPrune) { + // query overrides prune = false in all cases + return EXPECTED_DOC_IDS_WITHOUT_PRUNING; + } + + if (testHasIndexOptions) { + // index has set index options in the mapping + return testIndexShouldPrune ? EXPECTED_DOC_IDS_WITH_PRUNING : EXPECTED_DOC_IDS_WITHOUT_PRUNING; + } + + // default pruning should be true with default configuration + return EXPECTED_DOC_IDS_WITH_DEFAULT_PRUNING; + } + + private SearchResponse performSearch(SearchSourceBuilder source) { + SearchRequest searchRequest = new SearchRequest(TEST_INDEX_NAME); + searchRequest.source(source); + return client().search(searchRequest).actionGet(); + } + + /** Builds the sparse_vector search for this parameter combination; SEARCH_WEIGHTED_TOKENS_WITH_DEFAULTS is used only when neither the query disables pruning nor the mapping has index options. */ private SearchSourceBuilder getBuilderForSearch() throws IOException { + boolean shouldUseDefaultTokens = (testQueryShouldNotPrune == false && testHasIndexOptions == false); + + // if we're overriding the index pruning config in the query, always prune + // if not, and the query should _not_ prune, set prune=false, + // else, set to `null` to let the index options propagate + Boolean shouldPrune = overrideQueryPruningConfig ?
Boolean.TRUE : (testQueryShouldNotPrune ? Boolean.FALSE : null); + TokenPruningConfig queryPruningConfig = overrideQueryPruningConfig ? new TokenPruningConfig(3f, 0.5f, true) : null; + + SparseVectorQueryBuilder queryBuilder = new SparseVectorQueryBuilder( + SPARSE_VECTOR_FIELD, + shouldUseDefaultTokens ? SEARCH_WEIGHTED_TOKENS_WITH_DEFAULTS : SEARCH_WEIGHTED_TOKENS, + null, + null, + shouldPrune, + queryPruningConfig + ); + + return new SearchSourceBuilder().query(queryBuilder); + } + + /** Prefixes an assertion message with the three test parameters and the description from getDescriptiveTestType(). */ private String getAssertMessage(String message) { + return message + + " (Params: hasIndexOptions=" + + testHasIndexOptions + + ", indexShouldPrune=" + + testIndexShouldPrune + + ", queryShouldNotPrune=" + + testQueryShouldNotPrune + + "): " + + getDescriptiveTestType(); + } + + /** Describes the scenario under test for assertion messages; every fragment is appended so query-level overrides stay visible when the mapping has no index options. */ private String getDescriptiveTestType() { + String testDescription = ""; + if (testQueryShouldNotPrune) { + testDescription += " query override prune=false:"; + } + + if (overrideQueryPruningConfig) { + testDescription += " query override pruningConfig=true:"; + } + + if (testHasIndexOptions) { + testDescription += " pruning index_options explicitly set:"; + } else { + /* append, do not overwrite: the query-override prefix built above must survive */ testDescription += " no index options set, tokens should be pruned by default:"; + } + + /* only meaningful when the mapping actually carries index options */ if (testHasIndexOptions && testIndexShouldPrune == false) { + testDescription += " index options has pruning set to false"; + } + + return testDescription; + } + + private static final Map TEST_DOCUMENTS = Map.of( + "1", + "{\"sparse_vector_field\":{\"cheese\": 2.671405,\"is\": 0.11809908,\"comet\": 0.26088917}}", + "2", + "{\"sparse_vector_field\":{\"planet\": 2.3438394,\"is\": 0.54600334,\"astronomy\": 0.36015007,\"moon\": 0.20022368}}", + "3", + "{\"sparse_vector_field\":{\"is\": 0.6891394,\"globe\": 0.484035,\"ocean\": 0.080102935,\"underground\": 0.053516876}}" + ); + + private static final List SEARCH_WEIGHTED_TOKENS = List.of( + new WeightedToken("pugs", 0.5f), + new WeightedToken("cats", 0.5f), + new WeightedToken("is", 0.14600334f) + ); + + private static final List
SEARCH_WEIGHTED_TOKENS_WITH_DEFAULTS = List.of(new WeightedToken("planet", 0.2f)); + + private static final List EXPECTED_DOC_IDS_WITHOUT_PRUNING = List.of("3", "2", "1"); + + /* no hits expected — presumably because the test thresholds (freq ratio 1, weight 1.0) prune every query token; TODO confirm */ private static final List EXPECTED_DOC_IDS_WITH_PRUNING = List.of(); + + private static final List EXPECTED_DOC_IDS_WITH_DEFAULT_PRUNING = List.of("2"); + + private static final List EXPECTED_DOC_IDS_WITH_QUERY_OVERRIDE = List.of(); +} diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackFeatures.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackFeatures.java index 42824a553d2bd..ad0f38d975b86 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackFeatures.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackFeatures.java @@ -12,6 +12,8 @@ import java.util.Set; +import static org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper.SPARSE_VECTOR_INDEX_OPTIONS_FEATURE; + /** * Provides the XPack features that this version of the code supports */ @@ -20,4 +22,9 @@ public class XPackFeatures implements FeatureSpecification { public Set getFeatures() { return Set.of(); } + + /* returns a set holding only SPARSE_VECTOR_INDEX_OPTIONS_FEATURE; getFeatures() above stays empty */ @Override + public Set getTestFeatures() { + return Set.of(SPARSE_VECTOR_INDEX_OPTIONS_FEATURE); + } } diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java index 35cba890e5e0c..e51928096fb73 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java @@ -17,7 +17,10 @@ import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.core.Nullable; +import org.elasticsearch.index.IndexVersions; import
org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper; +import org.elasticsearch.index.mapper.vectors.TokenPruningConfig; import org.elasticsearch.index.query.AbstractQueryBuilder; import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.index.query.QueryRewriteContext; @@ -40,6 +43,7 @@ import java.util.Map; import java.util.Objects; +import static org.elasticsearch.TransportVersions.SPARSE_VECTOR_FIELD_PRUNING_OPTIONS_8_19; import static org.elasticsearch.xcontent.ConstructingObjectParser.constructorArg; import static org.elasticsearch.xcontent.ConstructingObjectParser.optionalConstructorArg; import static org.elasticsearch.xpack.core.ClientHelper.ML_ORIGIN; @@ -61,7 +65,7 @@ public class SparseVectorQueryBuilder extends AbstractQueryBuilder queryVectors; private final String inferenceId; private final String query; - private final boolean shouldPruneTokens; + /* nullable: stays null when the caller did not pass a prune flag */ private final Boolean shouldPruneTokens; private final SetOnce weightedTokensSupplier; @@ -81,13 +85,11 @@ public SparseVectorQueryBuilder( @Nullable TokenPruningConfig tokenPruningConfig ) { this.fieldName = Objects.requireNonNull(fieldName, "[" + NAME + "] requires a [" + FIELD_FIELD.getPreferredName() + "]"); - this.shouldPruneTokens = (shouldPruneTokens != null ? shouldPruneTokens : DEFAULT_PRUNE); + /* stored as given; DEFAULT_PRUNE is no longer applied here */ this.shouldPruneTokens = shouldPruneTokens; this.queryVectors = queryVectors; this.inferenceId = inferenceId; this.query = query; - this.tokenPruningConfig = (tokenPruningConfig != null - ? tokenPruningConfig - : (this.shouldPruneTokens ?
new TokenPruningConfig() : null)); + /* stored as given; may be null even when pruning is requested */ this.tokenPruningConfig = tokenPruningConfig; this.weightedTokensSupplier = null; // Preserve BWC error messaging @@ -124,7 +126,12 @@ public SparseVectorQueryBuilder( public SparseVectorQueryBuilder(StreamInput in) throws IOException { super(in); this.fieldName = in.readString(); - this.shouldPruneTokens = in.readBoolean(); + /* streams at or after SPARSE_VECTOR_FIELD_PRUNING_OPTIONS (or on the 8.19 patch version) carry an optional boolean; older streams carry a plain boolean */ if (in.getTransportVersion().isPatchFrom(SPARSE_VECTOR_FIELD_PRUNING_OPTIONS_8_19) + || in.getTransportVersion().onOrAfter(TransportVersions.SPARSE_VECTOR_FIELD_PRUNING_OPTIONS)) { + this.shouldPruneTokens = in.readOptionalBoolean(); + } else { + this.shouldPruneTokens = in.readBoolean(); + } this.queryVectors = in.readOptionalCollectionAsList(WeightedToken::new); this.inferenceId = in.readOptionalString(); this.query = in.readOptionalString(); @@ -158,7 +165,7 @@ public String getQuery() { return query; } - public boolean shouldPruneTokens() { + /* now returns the boxed value and may return null; callers that unbox must null-check */ public Boolean shouldPruneTokens() { return shouldPruneTokens; } @@ -173,7 +180,12 @@ protected void doWriteTo(StreamOutput out) throws IOException { } out.writeString(fieldName); - out.writeBoolean(shouldPruneTokens); + /* mirrors the version check used when reading */ if (out.getTransportVersion().isPatchFrom(SPARSE_VECTOR_FIELD_PRUNING_OPTIONS_8_19) + || out.getTransportVersion().onOrAfter(TransportVersions.SPARSE_VECTOR_FIELD_PRUNING_OPTIONS)) { + out.writeOptionalBoolean(shouldPruneTokens); + } else { + out.writeBoolean(shouldPruneTokens != null ?
shouldPruneTokens : DEFAULT_PRUNE); + } out.writeOptionalCollection(queryVectors); out.writeOptionalString(inferenceId); out.writeOptionalString(query); @@ -196,7 +208,9 @@ protected void doXContent(XContentBuilder builder, Params params) throws IOExcep } builder.field(QUERY_FIELD.getPreferredName(), query); } - builder.field(PRUNE_FIELD.getPreferredName(), shouldPruneTokens); + /* the prune field is emitted only when it was set */ if (shouldPruneTokens != null) { + builder.field(PRUNE_FIELD.getPreferredName(), shouldPruneTokens); + } if (tokenPruningConfig != null) { builder.field(PRUNING_CONFIG_FIELD.getPreferredName(), tokenPruningConfig); } @@ -222,8 +236,10 @@ protected Query doToQuery(SearchExecutionContext context) throws IOException { ); } - return (shouldPruneTokens) - ? WeightedTokensUtils.queryBuilderWithPrunedTokens(fieldName, tokenPruningConfig, queryVectors, ft, context) + /* a null result means: build the query with all tokens */ TokenPruningConfig pruningConfig = getTokenPruningConfigForQuery(ft, context); + + return pruningConfig != null + ? WeightedTokensUtils.queryBuilderWithPrunedTokens(fieldName, pruningConfig, queryVectors, ft, context) : WeightedTokensUtils.queryBuilderWithAllTokens(fieldName, queryVectors, ft, context); } @@ -231,7 +247,9 @@ protected Query doToQuery(SearchExecutionContext context) throws IOException { protected QueryBuilder doRewrite(QueryRewriteContext queryRewriteContext) { if (queryVectors != null) { return this; - } else if (weightedTokensSupplier != null) { + } + + if (weightedTokensSupplier != null) { TextExpansionResults textExpansionResults = weightedTokensSupplier.get(); if (textExpansionResults == null) { return this; // No results yet @@ -332,6 +350,49 @@ public TransportVersion getMinimalSupportedVersion() { return TransportVersions.V_8_15_0; } + /** Resolves the pruning config to apply to this query from the query-level settings, the field's index options and the index version; returns null when tokens must not be pruned. */ private TokenPruningConfig getTokenPruningConfigForQuery(MappedFieldType ft, SearchExecutionContext context) { + // if we do not have searcher, there can be no token pruning because there will be no IndexReader + if (context.searcher() == null) { + return null; + } + + // if we
are not on a supported index version, do not prune by default + // nor do we check the index options: prune only when the query asks for it, using the default config if it supplies none. + if (context.indexVersionCreated().onOrAfter(IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT) == false + && context.indexVersionCreated() + .between( + IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT_BACKPORT_8_X, + IndexVersions.UPGRADE_TO_LUCENE_10_0_0 + ) == false) { + return Boolean.TRUE.equals(shouldPruneTokens) ? Objects.requireNonNullElseGet(tokenPruningConfig, TokenPruningConfig::new) : null; + } + + Boolean shouldQueryPruneTokens = shouldPruneTokens; + TokenPruningConfig pruningConfigToUse = tokenPruningConfig; + + /* query-level values win; the field's index options only fill in what the query left null */ if (ft instanceof SparseVectorFieldMapper.SparseVectorFieldType asSVFieldType && asSVFieldType.getIndexOptions() != null) { + shouldQueryPruneTokens = shouldQueryPruneTokens == null ? asSVFieldType.getIndexOptions().getPrune() : shouldQueryPruneTokens; + pruningConfigToUse = pruningConfigToUse == null ? asSVFieldType.getIndexOptions().getPruningConfig() : pruningConfigToUse; + } + + // do not prune if shouldQueryPruneTokens is explicitly set to false + if (shouldQueryPruneTokens != null && shouldQueryPruneTokens == false) { + return null; + } + + // if we're here, we should prune if set or by default + // if we don't have a pruning config, use the default + pruningConfigToUse = pruningConfigToUse == null + ?
new TokenPruningConfig( + TokenPruningConfig.DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD, + TokenPruningConfig.DEFAULT_TOKENS_WEIGHT_THRESHOLD, + false + ) + : pruningConfigToUse; + + return pruningConfigToUse; + } + private static final ConstructingObjectParser PARSER = new ConstructingObjectParser<>(NAME, a -> { String fieldName = (String) a[0]; @SuppressWarnings("unchecked") diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/TextExpansionQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/TextExpansionQueryBuilder.java index 81758ec5f9342..472cb5e1c7012 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/TextExpansionQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/TextExpansionQueryBuilder.java @@ -18,6 +18,7 @@ import org.elasticsearch.common.logging.DeprecationCategory; import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.core.Nullable; +import org.elasticsearch.index.mapper.vectors.TokenPruningConfig; import org.elasticsearch.index.query.AbstractQueryBuilder; import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.index.query.QueryBuilders; diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/WeightedTokensQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/WeightedTokensQueryBuilder.java index f41fcd77ce627..133b84cbb85a8 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/WeightedTokensQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/WeightedTokensQueryBuilder.java @@ -19,6 +19,7 @@ import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.core.Nullable; import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.vectors.TokenPruningConfig; import 
org.elasticsearch.index.query.AbstractQueryBuilder; import org.elasticsearch.index.query.SearchExecutionContext; import org.elasticsearch.xcontent.ParseField; diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/WeightedTokensUtils.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/WeightedTokensUtils.java index 1c2ac23151e6e..87a548a553365 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/WeightedTokensUtils.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/WeightedTokensUtils.java @@ -15,6 +15,7 @@ import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.Query; import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.vectors.TokenPruningConfig; import org.elasticsearch.index.query.SearchExecutionContext; import java.io.IOException; diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java index af557ed6b7f82..ba6af5c96c91d 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java @@ -26,10 +26,12 @@ import org.elasticsearch.common.Strings; import org.elasticsearch.common.compress.CompressedXContent; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.core.CheckedConsumer; import org.elasticsearch.index.IndexVersion; import org.elasticsearch.index.IndexVersions; import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.extras.MapperExtrasPlugin; +import org.elasticsearch.index.mapper.vectors.TokenPruningConfig; import org.elasticsearch.index.query.QueryBuilder; import 
org.elasticsearch.index.query.SearchExecutionContext; import org.elasticsearch.plugins.Plugin; @@ -57,6 +59,23 @@ public class SparseVectorQueryBuilderTests extends AbstractQueryTestCase WEIGHTED_TOKENS = List.of(new WeightedToken("foo", .42f)); private static final int NUM_TOKENS = WEIGHTED_TOKENS.size(); + /* index version used for the test index settings; chosen once per test instance in the constructor */ private final IndexVersion indexVersionToTest; + + public SparseVectorQueryBuilderTests() { + // The sparse_vector field is not supported on versions 8.0 to 8.10. Because of this we'll only allow + // index versions after its reintroduction. + /* randomly either [SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT, current] or [NEW_SPARSE_VECTOR, SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT]; NOTE(review): if the upper bound is inclusive both ranges contain the support version — confirm that overlap is intended */ indexVersionToTest = randomBoolean() + ? IndexVersionUtils.randomVersionBetween( + random(), + IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT, + IndexVersion.current() + ) + : IndexVersionUtils.randomVersionBetween( + random(), + IndexVersions.NEW_SPARSE_VECTOR, + IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT + ); + } @Override protected SparseVectorQueryBuilder doCreateTestQueryBuilder() { @@ -104,12 +123,7 @@ protected Collection> getPlugins() { @Override protected Settings createTestIndexSettings() { - // The sparse_vector field is not supported on versions 8.0 to 8.10. Because of this we'll only allow - // index versions after its reintroduction. - final IndexVersion indexVersionCreated = randomBoolean() - ?
IndexVersion.current() - : IndexVersionUtils.randomVersionBetween(random(), IndexVersions.NEW_SPARSE_VECTOR, IndexVersion.current()); - return Settings.builder().put(IndexMetadata.SETTING_VERSION_CREATED, indexVersionCreated).build(); + return Settings.builder().put(IndexMetadata.SETTING_VERSION_CREATED, indexVersionToTest).build(); } @Override @@ -144,11 +158,8 @@ protected Object simulateMethod(Method method, Object[] args) { @Override protected void initializeAdditionalMappings(MapperService mapperService) throws IOException { - mapperService.merge( - "_doc", - new CompressedXContent(Strings.toString(PutMappingRequest.simpleMapping(SPARSE_VECTOR_FIELD, "type=sparse_vector"))), - MapperService.MergeReason.MAPPING_UPDATE - ); + String mapping = Strings.toString(PutMappingRequest.simpleMapping(SPARSE_VECTOR_FIELD, "type=sparse_vector")); + mapperService.merge("_doc", new CompressedXContent(mapping), MapperService.MergeReason.MAPPING_UPDATE); } @Override @@ -170,41 +181,41 @@ protected void doAssertLuceneQuery(SparseVectorQueryBuilder queryBuilder, Query } } - /** - * Overridden to ensure that {@link SearchExecutionContext} has a non-null {@link IndexReader} - */ - @Override - public void testCacheability() throws IOException { + private void withSearchIndex(CheckedConsumer consumer) throws IOException { try (Directory directory = newDirectory(); RandomIndexWriter iw = new RandomIndexWriter(random(), directory)) { Document document = new Document(); document.add(new FloatDocValuesField(SPARSE_VECTOR_FIELD, 1.0f)); iw.addDocument(document); try (IndexReader reader = iw.getReader()) { SearchExecutionContext context = createSearchExecutionContext(newSearcher(reader)); - SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(); - QueryBuilder rewriteQuery = rewriteQuery(queryBuilder, new SearchExecutionContext(context)); - - assertNotNull(rewriteQuery.toQuery(context)); - assertTrue("query should be cacheable: " + queryBuilder.toString(), context.isCacheable()); 
+ consumer.accept(context); } } } + /** + * Overridden to ensure that {@link SearchExecutionContext} has a non-null {@link IndexReader} + */ + @Override + public void testCacheability() throws IOException { + withSearchIndex((context) -> { + SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(); + QueryBuilder rewriteQuery = null; + rewriteQuery = rewriteQuery(queryBuilder, new SearchExecutionContext(context)); + assertNotNull(rewriteQuery.toQuery(context)); + assertTrue("query should be cacheable: " + queryBuilder.toString(), context.isCacheable()); + }); + } + /** * Overridden to ensure that {@link SearchExecutionContext} has a non-null {@link IndexReader}; this query should always be rewritten */ @Override public void testMustRewrite() throws IOException { - try (Directory directory = newDirectory(); RandomIndexWriter iw = new RandomIndexWriter(random(), directory)) { - Document document = new Document(); - document.add(new FloatDocValuesField(SPARSE_VECTOR_FIELD, 1.0f)); - iw.addDocument(document); - try (IndexReader reader = iw.getReader()) { - SearchExecutionContext context = createSearchExecutionContext(newSearcher(reader)); - SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(); - queryBuilder.toQuery(context); - } - } + withSearchIndex((context) -> { + SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(); + queryBuilder.toQuery(context); + }); } /** @@ -212,22 +223,16 @@ public void testMustRewrite() throws IOException { */ @Override public void testToQuery() throws IOException { - try (Directory directory = newDirectory(); RandomIndexWriter iw = new RandomIndexWriter(random(), directory)) { - Document document = new Document(); - document.add(new FloatDocValuesField(SPARSE_VECTOR_FIELD, 1.0f)); - iw.addDocument(document); - try (IndexReader reader = iw.getReader()) { - SearchExecutionContext context = createSearchExecutionContext(newSearcher(reader)); - SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(); - 
if (queryBuilder.getQueryVectors() == null) { - QueryBuilder rewrittenQueryBuilder = rewriteAndFetch(queryBuilder, context); - assertTrue(rewrittenQueryBuilder instanceof SparseVectorQueryBuilder); - testDoToQuery((SparseVectorQueryBuilder) rewrittenQueryBuilder, context); - } else { - testDoToQuery(queryBuilder, context); - } + withSearchIndex((context) -> { + SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(); + if (queryBuilder.getQueryVectors() == null) { + QueryBuilder rewrittenQueryBuilder = rewriteAndFetch(queryBuilder, context); + assertTrue(rewrittenQueryBuilder instanceof SparseVectorQueryBuilder); + testDoToQuery((SparseVectorQueryBuilder) rewrittenQueryBuilder, context); + } else { + testDoToQuery(queryBuilder, context); } - } + }); } private void testDoToQuery(SparseVectorQueryBuilder queryBuilder, SearchExecutionContext context) throws IOException { @@ -240,7 +245,9 @@ private void testDoToQuery(SparseVectorQueryBuilder queryBuilder, SearchExecutio assertTrue(query instanceof SparseVectorQueryWrapper); var sparseQuery = (SparseVectorQueryWrapper) query; - if (queryBuilder.shouldPruneTokens()) { + + // check if we have explicit pruning or implicit pruning=true + if (queryBuilder.shouldPruneTokens() || indexVersionSupportsIndexOptions()) { // It's possible that all documents were pruned for aggressive pruning configurations assertTrue(sparseQuery.getTermsQuery() instanceof BooleanQuery || sparseQuery.getTermsQuery() instanceof MatchNoDocsQuery); } else { @@ -338,4 +345,19 @@ public void testThatWeCorrectlyRewriteQueryIntoVectors() { assertEquals(queryBuilder.shouldPruneTokens(), ((SparseVectorQueryBuilder) rewrittenQueryBuilder).shouldPruneTokens()); assertNotNull(((SparseVectorQueryBuilder) rewrittenQueryBuilder).getQueryVectors()); } + + private boolean indexVersionSupportsIndexOptions() { + if (indexVersionToTest.onOrAfter(IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT)) { + return true; + } + + if 
(indexVersionToTest.between( + IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT_BACKPORT_8_X, + IndexVersions.UPGRADE_TO_LUCENE_10_0_0 + )) { + return true; + } + + return false; + } } diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/TextExpansionQueryBuilderTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/TextExpansionQueryBuilderTests.java index a0263003b72db..5e1a3fd5ed0ce 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/TextExpansionQueryBuilderTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/TextExpansionQueryBuilderTests.java @@ -25,6 +25,7 @@ import org.elasticsearch.common.compress.CompressedXContent; import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.extras.MapperExtrasPlugin; +import org.elasticsearch.index.mapper.vectors.TokenPruningConfig; import org.elasticsearch.index.query.BoolQueryBuilder; import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.index.query.SearchExecutionContext; diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/TokenPruningConfigTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/TokenPruningConfigTests.java index 8cdf44ae51dd4..095c88923bb5a 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/TokenPruningConfigTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/TokenPruningConfigTests.java @@ -8,6 +8,7 @@ package org.elasticsearch.xpack.core.ml.search; import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.index.mapper.vectors.TokenPruningConfig; import org.elasticsearch.test.AbstractXContentSerializingTestCase; import org.elasticsearch.xcontent.XContentParser; diff --git 
a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/WeightedTokensQueryBuilderTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/WeightedTokensQueryBuilderTests.java index cded9b8dce5e2..76a782c94f9e0 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/WeightedTokensQueryBuilderTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/WeightedTokensQueryBuilderTests.java @@ -27,6 +27,7 @@ import org.elasticsearch.common.compress.CompressedXContent; import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.extras.MapperExtrasPlugin; +import org.elasticsearch.index.mapper.vectors.TokenPruningConfig; import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.index.query.SearchExecutionContext; import org.elasticsearch.plugins.Plugin; diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java index b808dce4f9750..ff57282921952 100644 --- a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java +++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java @@ -133,12 +133,13 @@ public void testSparseVector() throws Exception { Map queryMap = (Map) queries.get("sparse_vector_1"); List tokens = readSparseVector(queryMap.get("embeddings")); var fieldType = (SemanticTextFieldMapper.SemanticTextFieldType) mapperService.mappingLookup().getFieldType(SEMANTIC_FIELD_ELSER); + SparseVectorQueryBuilder sparseQuery = new SparseVectorQueryBuilder( fieldType.getEmbeddingsField().fullPath(), tokens, null, null, - null, + false, null ); NestedQueryBuilder nestedQueryBuilder = new 
NestedQueryBuilder(fieldType.getChunksField().fullPath(), sparseQuery, ScoreMode.Max); @@ -232,7 +233,7 @@ private void assertHighlightOneDoc( assertThat(topDocs.totalHits.value(), equalTo(1L)); int docID = topDocs.scoreDocs[0].doc; SemanticTextHighlighter highlighter = new SemanticTextHighlighter(); - var execContext = createSearchExecutionContext(mapperService); + var execContext = createSearchExecutionContext(mapperService, searcher); var luceneQuery = execContext.toQuery(request.source().query()).query(); FetchContext fetchContext = mock(FetchContext.class); Mockito.when(fetchContext.highlight()).thenReturn(new SearchHighlightContext(Collections.emptyList())); diff --git a/x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/queries.json b/x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/queries.json index 6227f3f498854..a41d5fafaec41 100644 --- a/x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/queries.json +++ b/x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/queries.json @@ -464,4 +464,4 @@ "Diderot and D'Alembert published their Encyclopédie in 1751, before the Montgolfier Brothers launched the first manned flight in a hot air balloon on 21 November 1783. Paris was the financial capital of continental Europe, as well the primary European centre for book publishing, fashion and the manufacture of fine furniture and luxury goods. On 22 October 1797, Paris was also the site of the first parachute jump in history, by Garnerin.\n\n\nIn the summer of 1789, Paris became the centre stage of the French Revolution. On 14 July, a mob seized the arsenal at the Invalides, acquiring thousands of guns, with which it stormed the Bastille, a principal symbol of royal authority. 
The first independent Paris Commune, or city council, met in the Hôtel de Ville and elected a Mayor, the astronomer Jean Sylvain Bailly, on 15 July.\n\n\nLouis XVI and the royal family were brought to Paris and incarcerated in the Tuileries Palace. In 1793, as the revolution turned increasingly radical, the king, queen and mayor were beheaded by guillotine in the Reign of Terror, along with more than 16,000 others throughout France. The property of the aristocracy and the church was nationalised, and the city's churches were closed, sold or demolished. A succession of revolutionary factions ruled Paris until 9 November 1799 (coup d'état du 18 brumaire), when Napoleon Bonaparte seized power as First Consul.\n\n\n" ] } -} \ No newline at end of file +} diff --git a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml index 20ab78a48ae1b..52c98ce12dfbe 100644 --- a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml +++ b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml @@ -113,6 +113,20 @@ teardown: model_id: "text_expansion_model" ignore: 404 + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.delete: + index: ["sparse_vector_pruning_test", "test-sparse-vector-without-pruning", "test-sparse-vector-with-pruning", "test-sparse-vector-pruning-default"] #indices created by the token pruning tests + ignore: 404 + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.refresh: { } + --- "Test sparse_vector search": - do: @@ -184,3 +198,428 @@ teardown: - match: { hits.total.value: 5 } - match: { 
hits.hits.0._source.source_text: "the octopus comforter smells" } + +--- +"Check sparse_vector token pruning index_options mappings": + + - requires: + cluster_features: 'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + prune: true + pruning_config: + tokens_freq_ratio_threshold: 1.0 + tokens_weight_threshold: 0.4 + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.get_mapping: + index: sparse_vector_pruning_test + + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.prune: true } + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_freq_ratio_threshold: 1.0 } + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_weight_threshold: 0.4 } + +--- +"Check sparse_vector token pruning index_options mappings defaults": + + - requires: + cluster_features: 'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic 
dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.get_mapping: + index: sparse_vector_pruning_test + + - not_exists: sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options + +--- +"Check sparse_vector token pruning index_options prune missing do not allow config": + + - requires: + cluster_features: 'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + catch: /\[index_options\] field \[pruning_config\] should only be set if \[prune\] is set to true/ + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + pruning_config: + tokens_freq_ratio_threshold: 1.0 + tokens_weight_threshold: 0.4 + + - match: { status: 400 } + +--- +"Check sparse_vector token pruning index_options prune false do not allow config": + + - requires: + cluster_features: 'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + catch: /\[index_options\] field \[pruning_config\] should only be set if \[prune\] is set to true/ + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + prune: false + pruning_config: + tokens_freq_ratio_threshold: 1.0 + tokens_weight_threshold: 0.4 + + - match: { status: 400 } + +--- +"Check sparse_vector token pruning index_options tokens freq out of bounds": + + - requires: + cluster_features: 
'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + catch: /\[tokens_freq_ratio_threshold\] must be between \[1\] and \[100\]/ + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + prune: true + pruning_config: + tokens_freq_ratio_threshold: 101.0 + tokens_weight_threshold: 0.4 + + - match: { status: 400 } + +--- +"Check sparse_vector token pruning index_options tokens weight out of bounds": + + - requires: + cluster_features: 'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + catch: /\[tokens_weight_threshold\] must be between 0 and 1/ + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + prune: true + pruning_config: + tokens_freq_ratio_threshold: 5.0 + tokens_weight_threshold: 3.5 + + - match: { status: 400 } + +--- +"Check sparse_vector token pruning index_options in query": + + - requires: + cluster_features: 'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: test-sparse-vector-with-pruning + body: + mappings: + properties: + content_embedding: + type: sparse_vector + index_options: + prune: true + pruning_config: + 
tokens_freq_ratio_threshold: 1 + tokens_weight_threshold: 1.0 + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: test-sparse-vector-without-pruning + body: + mappings: + properties: + content_embedding: + type: sparse_vector + index_options: + prune: false + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + bulk: + index: test-sparse-vector-with-pruning + refresh: true + body: | + {"index": { "_id": "1" }} + {"content_embedding":{"cheese": 2.671405,"is": 0.11809908,"comet": 0.26088917}} + {"index": { "_id": "2" }} + {"content_embedding":{"planet": 2.3438394,"is": 0.54600334,"astronomy": 0.36015007,"moon": 0.20022368}} + {"index": { "_id": "3" }} + {"content_embedding":{"is": 0.6891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + bulk: + index: test-sparse-vector-without-pruning + refresh: true + body: | + {"index": { "_id": "1" }} + {"content_embedding":{"cheese": 2.671405,"is": 0.11809908,"comet": 0.26088917}} + {"index": { "_id": "2" }} + {"content_embedding":{"planet": 2.3438394,"is": 0.54600334,"astronomy": 0.36015007,"moon": 0.20022368}} + {"index": { "_id": "3" }} + {"content_embedding":{"is": 0.6891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + + - do: + search: + index: test-sparse-vector-without-pruning + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + + - match: { hits.total.value: 3 } + - match: { hits.hits.0._id: "1" } + - match: { 
hits.hits.1._id: "3" } + - match: { hits.hits.2._id: "2" } + + - do: + search: + index: test-sparse-vector-with-pruning + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } + + - do: + search: + index: test-sparse-vector-without-pruning + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + prune: true + pruning_config: + tokens_freq_ratio_threshold: 1 + tokens_weight_threshold: 1.0 + + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } + + - do: + search: + index: test-sparse-vector-with-pruning + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + prune: false + + - match: { hits.total.value: 3 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } + - match: { hits.hits.2._id: "2" } + +--- +"Check sparse_vector should prune by default": + + - requires: + cluster_features: 'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: test-sparse-vector-pruning-default + body: + mappings: + properties: + content_embedding: + type: sparse_vector + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + bulk: + index: 
test-sparse-vector-pruning-default + refresh: true + body: | + {"index": { "_id": "1" }} + {"content_embedding":{"cheese": 2.671405,"is": 0.11809908,"comet": 0.26088917}} + {"index": { "_id": "2" }} + {"content_embedding":{"planet": 2.3438394,"is": 0.14600334,"astronomy": 0.36015007,"moon": 0.20022368}} + {"index": { "_id": "3" }} + {"content_embedding":{"is": 0.1891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + {"index": { "_id": "4" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "5" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "6" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "7" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "8" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "9" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "10" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "11" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "12" }} + {"content_embedding":{"is": 0.1891394}} + + - do: + search: + index: test-sparse-vector-pruning-default + body: + query: + sparse_vector: + field: content_embedding + query_vector: + pugs: 0.5 + cats: 0.5 + is: 0.04600334 + + - match: { hits.total.value: 0 } diff --git a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml index e3b59bdaf50bd..248e9d2af0d70 100644 --- a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml +++ b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml @@ -112,6 +112,20 @@ teardown: model_id: "text_expansion_model" ignore: 404 + - do: + headers: + Authorization: "Basic 
dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Content-Type: application/json + indices.delete: + index: ["sparse_vector_pruning_test", "test-sparse-vector-without-pruning", "test-sparse-vector-with-pruning", "test-sparse-vector-pruning-default"] #indices created by the token pruning tests + ignore: 404 + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Content-Type: application/json + indices.refresh: { } + --- "Test sparse_vector search": - do: @@ -183,3 +197,428 @@ teardown: - match: { hits.total.value: 5 } - match: { hits.hits.0._source.source_text: "the octopus comforter smells" } + +--- +"Check sparse_vector token pruning index_options mappings": + + - requires: + cluster_features: 'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + prune: true + pruning_config: + tokens_freq_ratio_threshold: 1.0 + tokens_weight_threshold: 0.4 + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.get_mapping: + index: sparse_vector_pruning_test + + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.prune: true } + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_freq_ratio_threshold: 1.0 } + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_weight_threshold: 0.4 } + +--- +"Check sparse_vector token pruning index_options mappings defaults": + + - requires: + cluster_features: 'sparse_vector.index_options_supported' + reason: 
"sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.get_mapping: + index: sparse_vector_pruning_test + + - not_exists: sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options + +--- +"Check sparse_vector token pruning index_options prune missing do not allow config": + + - requires: + cluster_features: 'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + catch: /\[index_options\] field \[pruning_config\] should only be set if \[prune\] is set to true/ + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + pruning_config: + tokens_freq_ratio_threshold: 1.0 + tokens_weight_threshold: 0.4 + + - match: { status: 400 } + +--- +"Check sparse_vector token pruning index_options prune false do not allow config": + + - requires: + cluster_features: 'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + catch: /\[index_options\] field \[pruning_config\] should only be set if \[prune\] is set to true/ + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + 
Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + prune: false + pruning_config: + tokens_freq_ratio_threshold: 1.0 + tokens_weight_threshold: 0.4 + + - match: { status: 400 } + +--- +"Check sparse_vector token pruning index_options tokens freq out of bounds": + + - requires: + cluster_features: 'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + catch: /\[tokens_freq_ratio_threshold\] must be between \[1\] and \[100\]/ + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + prune: true + pruning_config: + tokens_freq_ratio_threshold: 101.0 + tokens_weight_threshold: 0.4 + + - match: { status: 400 } + +--- +"Check sparse_vector token pruning index_options tokens weight out of bounds": + + - requires: + cluster_features: 'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + catch: /\[tokens_weight_threshold\] must be between 0 and 1/ + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + prune: true + pruning_config: + tokens_freq_ratio_threshold: 5.0 + tokens_weight_threshold: 3.5 + + - match: { status: 400 } + +--- +"Check sparse_vector token pruning index_options in query": + + - requires: + cluster_features: 
'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: test-sparse-vector-with-pruning + body: + mappings: + properties: + content_embedding: + type: sparse_vector + index_options: + prune: true + pruning_config: + tokens_freq_ratio_threshold: 1 + tokens_weight_threshold: 1.0 + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: test-sparse-vector-without-pruning + body: + mappings: + properties: + content_embedding: + type: sparse_vector + index_options: + prune: false + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + bulk: + index: test-sparse-vector-with-pruning + refresh: true + body: | + {"index": { "_id": "1" }} + {"content_embedding":{"cheese": 2.671405,"is": 0.11809908,"comet": 0.26088917}} + {"index": { "_id": "2" }} + {"content_embedding":{"planet": 2.3438394,"is": 0.54600334,"astronomy": 0.36015007,"moon": 0.20022368}} + {"index": { "_id": "3" }} + {"content_embedding":{"is": 0.6891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + bulk: + index: test-sparse-vector-without-pruning + refresh: true + body: | + {"index": { "_id": "1" }} + {"content_embedding":{"cheese": 2.671405,"is": 0.11809908,"comet": 0.26088917}} + {"index": { "_id": "2" }} + {"content_embedding":{"planet": 2.3438394,"is": 0.54600334,"astronomy": 0.36015007,"moon": 0.20022368}} + {"index": 
{ "_id": "3" }} + {"content_embedding":{"is": 0.6891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + + - do: + search: + index: test-sparse-vector-without-pruning + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + + - match: { hits.total.value: 3 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } + - match: { hits.hits.2._id: "2" } + + - do: + search: + index: test-sparse-vector-with-pruning + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } + + - do: + search: + index: test-sparse-vector-without-pruning + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + prune: true + pruning_config: + tokens_freq_ratio_threshold: 1 + tokens_weight_threshold: 1.0 + + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } + + - do: + search: + index: test-sparse-vector-with-pruning + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + prune: false + + - match: { hits.total.value: 3 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } + - match: { hits.hits.2._id: "2" } + +--- +"Check sparse_vector should prune by default": + + - requires: + cluster_features: 'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + headers: 
+ Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: test-sparse-vector-pruning-default + body: + mappings: + properties: + content_embedding: + type: sparse_vector + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + bulk: + index: test-sparse-vector-pruning-default + refresh: true + body: | + {"index": { "_id": "1" }} + {"content_embedding":{"cheese": 2.671405,"is": 0.11809908,"comet": 0.26088917}} + {"index": { "_id": "2" }} + {"content_embedding":{"planet": 2.3438394,"is": 0.14600334,"astronomy": 0.36015007,"moon": 0.20022368}} + {"index": { "_id": "3" }} + {"content_embedding":{"is": 0.1891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + {"index": { "_id": "4" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "5" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "6" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "7" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "8" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "9" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "10" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "11" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "12" }} + {"content_embedding":{"is": 0.1891394}} + + - do: + search: + index: test-sparse-vector-pruning-default + body: + query: + sparse_vector: + field: content_embedding + query_vector: + pugs: 0.5 + cats: 0.5 + is: 0.04600334 + + - match: { hits.total.value: 0 } diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml index 3481773b0bab3..0b7cd44d500a8 100644 --- 
a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml +++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml @@ -89,6 +89,25 @@ setup: model_id: text_expansion_model wait_for: started +--- +teardown: + - skip: + features: headers + + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser + Content-Type: application/json + indices.delete: + index: ["sparse_vector_pruning_test", "test-sparse-vector-without-pruning", "test-sparse-vector-with-pruning", "test-sparse-vector-pruning-default"] + ignore_unavailable: true # each test creates only a subset of these indices; skip the missing ones so the existing ones are still deleted + ignore: 404 + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser + Content-Type: application/json + indices.refresh: { } + --- "Test sparse_vector search": - do: @@ -510,3 +529,432 @@ setup: - match: { hits.hits.0._score: 4.0 } - match: { hits.hits.1._id: "parent-foo-bar" } - match: { hits.hits.1._score: 2.0 } + +--- +"Check sparse_vector token pruning index_options mappings": + + - requires: + cluster_features: 'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + prune: true + pruning_config: + tokens_freq_ratio_threshold: 1.0 + tokens_weight_threshold: 0.4 + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. 
the test setup superuser + Content-Type: application/json + indices.get_mapping: + index: sparse_vector_pruning_test + + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.prune: true } + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_freq_ratio_threshold: 1.0 } + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_weight_threshold: 0.4 } + +--- +"Check sparse_vector token pruning index_options mappings defaults": + + - requires: + cluster_features: 'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser + Content-Type: application/json + indices.get_mapping: + index: sparse_vector_pruning_test + + - not_exists: sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options + +--- +"Check sparse_vector token pruning index_options prune missing do not allow config": + + - requires: + cluster_features: 'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + catch: /\[index_options\] field \[pruning_config\] should only be set if \[prune\] is set to true/ + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. 
the test setup superuser + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + pruning_config: + tokens_freq_ratio_threshold: 1.0 + tokens_weight_threshold: 0.4 + + - match: { status: 400 } + +--- +"Check sparse_vector token pruning index_options prune false do not allow config": + + - requires: + cluster_features: 'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + catch: /\[index_options\] field \[pruning_config\] should only be set if \[prune\] is set to true/ + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + prune: false + pruning_config: + tokens_freq_ratio_threshold: 1.0 + tokens_weight_threshold: 0.4 + + - match: { status: 400 } + +--- +"Check sparse_vector token pruning index_options tokens freq out of bounds": + + - requires: + cluster_features: 'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + catch: /\[tokens_freq_ratio_threshold\] must be between \[1\] and \[100\]/ + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. 
the test setup superuser + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + prune: true + pruning_config: + tokens_freq_ratio_threshold: 101.0 + tokens_weight_threshold: 0.4 + + - match: { status: 400 } + +--- +"Check sparse_vector token pruning index_options tokens weight out of bounds": + + - requires: + cluster_features: 'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + catch: /\[tokens_weight_threshold\] must be between 0 and 1/ + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + prune: true + pruning_config: + tokens_freq_ratio_threshold: 5.0 + tokens_weight_threshold: 3.5 + + - match: { status: 400 } + +--- +"Check sparse_vector token pruning index_options in query": + + - requires: + cluster_features: 'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. 
the test setup superuser + Content-Type: application/json + indices.create: + index: test-sparse-vector-with-pruning + body: + mappings: + properties: + content_embedding: + type: sparse_vector + index_options: + prune: true + pruning_config: + tokens_freq_ratio_threshold: 1 + tokens_weight_threshold: 1.0 + settings: + number_of_shards: 1 + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser + Content-Type: application/json + indices.create: + index: test-sparse-vector-without-pruning + body: + mappings: + properties: + content_embedding: + type: sparse_vector + index_options: + prune: false + settings: + number_of_shards: 1 + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser + Content-Type: application/json + bulk: + index: test-sparse-vector-with-pruning + refresh: true + body: | + {"index": { "_id": "1" }} + {"content_embedding":{"cheese": 2.671405,"is": 0.11809908,"comet": 0.26088917}} + {"index": { "_id": "2" }} + {"content_embedding":{"planet": 2.3438394,"is": 0.54600334,"astronomy": 0.36015007,"moon": 0.20022368}} + {"index": { "_id": "3" }} + {"content_embedding":{"is": 0.6891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. 
the test setup superuser + Content-Type: application/json + bulk: + index: test-sparse-vector-without-pruning + refresh: true + body: | + {"index": { "_id": "1" }} + {"content_embedding":{"cheese": 2.671405,"is": 0.11809908,"comet": 0.26088917}} + {"index": { "_id": "2" }} + {"content_embedding":{"planet": 2.3438394,"is": 0.54600334,"astronomy": 0.36015007,"moon": 0.20022368}} + {"index": { "_id": "3" }} + {"content_embedding":{"is": 0.6891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + + - do: + search: + index: test-sparse-vector-without-pruning + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + + - match: { hits.total.value: 3 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } + - match: { hits.hits.2._id: "2" } + + - do: + search: + index: test-sparse-vector-with-pruning + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } + + - do: + search: + index: test-sparse-vector-without-pruning + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + prune: true + pruning_config: + tokens_freq_ratio_threshold: 1 + tokens_weight_threshold: 1.0 + + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } + + - do: + search: + index: test-sparse-vector-with-pruning + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + prune: false + + - 
match: { hits.total.value: 3 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } + - match: { hits.hits.2._id: "2" } + +--- +"Check sparse_vector should prune by default": + + - requires: + cluster_features: 'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser + Content-Type: application/json + indices.create: + index: test-sparse-vector-pruning-default + body: + mappings: + properties: + content_embedding: + type: sparse_vector + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser + Content-Type: application/json + bulk: + index: test-sparse-vector-pruning-default + refresh: true + body: | + {"index": { "_id": "1" }} + {"content_embedding":{"cheese": 2.671405,"is": 0.11809908,"comet": 0.26088917}} + {"index": { "_id": "2" }} + {"content_embedding":{"planet": 2.3438394,"is": 0.14600334,"astronomy": 0.36015007,"moon": 0.20022368}} + {"index": { "_id": "3" }} + {"content_embedding":{"is": 0.1891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + {"index": { "_id": "4" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "5" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "6" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "7" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "8" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "9" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "10" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "11" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "12" }} + 
{"content_embedding":{"is": 0.1891394}} + + - do: + search: + index: test-sparse-vector-pruning-default + body: + query: + sparse_vector: + field: content_embedding + query_vector: + pugs: 0.5 + cats: 0.5 + is: 0.04600334 + + - match: { hits.total.value: 0 }