From e02cd3aa51cde440ec11f053011215a11569d4ca Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Fri, 11 Apr 2025 21:04:56 -0400 Subject: [PATCH 01/86] Initial checkin - needs tests --- .../elasticsearch/index/IndexVersions.java | 1 + .../vectors/SparseVectorFieldMapper.java | 258 +++++++++++++++++- .../vectors/SparseVectorFieldMapperTests.java | 31 +++ .../ml/search/SparseVectorQueryBuilder.java | 116 +++++++- .../search/SparseVectorQueryBuilderTests.java | 38 +++ 5 files changed, 426 insertions(+), 18 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/IndexVersions.java b/server/src/main/java/org/elasticsearch/index/IndexVersions.java index 531e9455d1a2e..809156ee57327 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexVersions.java +++ b/server/src/main/java/org/elasticsearch/index/IndexVersions.java @@ -159,6 +159,7 @@ private static Version parseUnchecked(String version) { public static final IndexVersion SYNTHETIC_SOURCE_STORE_ARRAYS_NATIVELY_UNSIGNED_LONG = def(9_019_0_00, Version.LUCENE_10_1_0); public static final IndexVersion SYNTHETIC_SOURCE_STORE_ARRAYS_NATIVELY_SCALED_FLOAT = def(9_020_0_00, Version.LUCENE_10_1_0); public static final IndexVersion USE_LUCENE101_POSTINGS_FORMAT = def(9_021_0_00, Version.LUCENE_10_1_0); + public static final IndexVersion SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT = def(9_022_0_00, Version.LUCENE_10_1_0); /* * STOP! READ THIS FIRST! 
No, really, * ____ _____ ___ ____ _ ____ _____ _ ____ _____ _ _ ___ ____ _____ ___ ____ ____ _____ _ diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index fdbb03f51743e..c5be519ee3740 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -22,6 +22,7 @@ import org.apache.lucene.util.BytesRef; import org.elasticsearch.common.logging.DeprecationCategory; import org.elasticsearch.common.lucene.Lucene; +import org.elasticsearch.core.Nullable; import org.elasticsearch.index.IndexVersion; import org.elasticsearch.index.IndexVersions; import org.elasticsearch.index.analysis.NamedAnalyzer; @@ -31,6 +32,8 @@ import org.elasticsearch.index.mapper.FieldMapper; import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.MapperBuilderContext; +import org.elasticsearch.index.mapper.MapperParsingException; +import org.elasticsearch.index.mapper.MappingParserContext; import org.elasticsearch.index.mapper.SourceLoader; import org.elasticsearch.index.mapper.SourceValueFetcher; import org.elasticsearch.index.mapper.TextSearchInfo; @@ -38,6 +41,7 @@ import org.elasticsearch.index.query.SearchExecutionContext; import org.elasticsearch.search.fetch.StoredFieldsSpec; import org.elasticsearch.search.lookup.Source; +import org.elasticsearch.xcontent.ToXContent; import org.elasticsearch.xcontent.XContentBuilder; import org.elasticsearch.xcontent.XContentParser.Token; @@ -46,6 +50,7 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.stream.Stream; import static org.elasticsearch.index.query.AbstractQueryBuilder.DEFAULT_BOOST; @@ -65,6 +70,9 @@ public class SparseVectorFieldMapper extends FieldMapper { 
static final IndexVersion NEW_SPARSE_VECTOR_INDEX_VERSION = IndexVersions.NEW_SPARSE_VECTOR; static final IndexVersion SPARSE_VECTOR_IN_FIELD_NAMES_INDEX_VERSION = IndexVersions.SPARSE_VECTOR_IN_FIELD_NAMES_SUPPORT; + static final IndexVersion SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION = IndexVersion.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT; + + private final SparseVectorFieldMapper.IndexOptions indexOptions; private static SparseVectorFieldMapper toType(FieldMapper in) { return (SparseVectorFieldMapper) in; @@ -73,9 +81,23 @@ private static SparseVectorFieldMapper toType(FieldMapper in) { public static class Builder extends FieldMapper.Builder { private final Parameter stored = Parameter.storeParam(m -> toType(m).fieldType().isStored(), false); private final Parameter> meta = Parameter.metaParam(); + private final Parameter indexOptions; public Builder(String name) { super(name); + this.indexOptions = new Parameter<>( + "index_options", + true, + () -> null, + (n, c, o) -> o == null ? 
null : parseIndexOptions(n, c, o), + m -> toType(m).fieldType().indexOptions, + (b, n, v) -> { + if (v != null) { + b.field(n, v); + } + }, + Objects::toString + ); } public Builder setStored(boolean value) { @@ -85,17 +107,113 @@ public Builder setStored(boolean value) { @Override protected Parameter[] getParameters() { - return new Parameter[] { stored, meta }; + return new Parameter[] { stored, meta, indexOptions }; } @Override public SparseVectorFieldMapper build(MapperBuilderContext context) { return new SparseVectorFieldMapper( leafName(), - new SparseVectorFieldType(context.buildFullName(leafName()), stored.getValue(), meta.getValue()), - builderParams(this, context) + new SparseVectorFieldType(context.buildFullName(leafName()), stored.getValue(), meta.getValue(), indexOptions.getValue()), + builderParams(this, context), + indexOptions.getValue() + ); + } + } + + public IndexOptions getIndexOptions() { + return this.indexOptions; + } + + private static SparseVectorFieldMapper.IndexOptions parseIndexOptions(String fieldName, MappingParserContext context, Object propNode) { + @SuppressWarnings("unchecked") + Map indexOptionsMap = (Map) propNode; + + boolean hasOneOption = false; + Boolean prune = null; + PruningConfig pruningConfig = null; + + Object shouldPrune = indexOptionsMap.remove(IndexOptions.PRUNE_FIELD_NAME); + if (shouldPrune != null) { + if ((shouldPrune instanceof Boolean) == false) { + throw new MapperParsingException("[index_options] field [prune] should be true or false"); + } + hasOneOption = true; + prune = ((Boolean) shouldPrune); + } + + Object hasPruningConfiguration = indexOptionsMap.remove(IndexOptions.PRUNING_CONFIG_FIELD_NAME); + if (hasPruningConfiguration != null) { + if ((hasPruningConfiguration instanceof Map) == false) { + throw new MapperParsingException("[index_options] field [pruning_config] should be a map"); + } + + Integer tokensFreqRatioThreshold = null; + Double tokensWeightThreshold = null; + + 
@SuppressWarnings("unchecked") + Map pruningConfigMap = (Map) hasPruningConfiguration; + Object hasTokensFreqRatioThreshold = pruningConfigMap.remove(PruningConfig.TOKENS_FREQ_RATIO_THRESHOLD_FIELD_NAME); + Object hasTokensWeightThreshold = pruningConfigMap.remove(PruningConfig.TOKENS_WEIGHT_THRESHOLD_FIELD_NAME); + + if (pruningConfigMap.isEmpty() == false) { + throw new MapperParsingException("[index_options] field [pruning_config] has unknown fields"); + } + + if (hasTokensFreqRatioThreshold != null) { + if ((hasTokensFreqRatioThreshold instanceof Integer) == false) { + throw new MapperParsingException( + "[pruning_config] field [tokens_freq_ratio_threshold] field should be an integer between 1 and 100" + ); + } + tokensFreqRatioThreshold = (Integer) hasTokensFreqRatioThreshold; + if (tokensFreqRatioThreshold < PruningConfig.MIN_TOKENS_FREQ_RATIO_THRESHOLD + || tokensFreqRatioThreshold > PruningConfig.MAX_TOKENS_FREQ_RATIO_THRESHOLD) { + throw new MapperParsingException( + "[pruning_config] field [tokens_freq_ratio_threshold] field should be an integer between 1 and 100" + ); + } + } + + if (hasTokensWeightThreshold != null) { + if ((hasTokensWeightThreshold instanceof Double) == false) { + throw new MapperParsingException( + "[pruning_config] field [tokens_weight_threshold] field should be an number between 0.0 and 1.0" + ); + } + tokensWeightThreshold = (Double) hasTokensWeightThreshold; + if (tokensWeightThreshold < PruningConfig.MIN_TOKENS_WEIGHT_THRESHOLD + || tokensWeightThreshold > PruningConfig.MAX_TOKENS_WEIGHT_THRESHOLD) { + throw new MapperParsingException( + "[pruning_config] field [tokens_weight_threshold] field should be an number between 0.0 and 1.0" + ); + } + } + + if (tokensFreqRatioThreshold != null || tokensWeightThreshold != null) { + pruningConfig = new PruningConfig(tokensFreqRatioThreshold, tokensWeightThreshold); + hasOneOption = true; + } + } + + if (hasOneOption == false) { + if 
(context.indexVersionCreated().before(SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION)) { + // don't set defaults if this index was created before + // we added this functionality in, so it will + // not change current index behaviour + return null; + } + + // index options are not set - for new indices, we + // need to set pruning to true by default + // with a default pruning configuration + return new IndexOptions( + true, + new PruningConfig(PruningConfig.DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD, PruningConfig.DEFAULT_TOKENS_WEIGHT_THRESHOLD) ); } + + return new SparseVectorFieldMapper.IndexOptions(prune, pruningConfig); } public static final TypeParser PARSER = new TypeParser((n, c) -> { @@ -109,9 +227,21 @@ public SparseVectorFieldMapper build(MapperBuilderContext context) { }, notInMultiFields(CONTENT_TYPE)); public static final class SparseVectorFieldType extends MappedFieldType { + private final IndexOptions indexOptions; public SparseVectorFieldType(String name, boolean isStored, Map meta) { super(name, true, isStored, false, TextSearchInfo.SIMPLE_MATCH_ONLY, meta); + this.indexOptions = null; + } + + public SparseVectorFieldType( + String name, + boolean isStored, + Map meta, + @Nullable SparseVectorFieldMapper.IndexOptions indexOptions + ) { + super(name, true, isStored, false, TextSearchInfo.SIMPLE_MATCH_ONLY, meta); + this.indexOptions = indexOptions; } @Override @@ -157,8 +287,14 @@ private static String indexedValueForSearch(Object value) { } } - private SparseVectorFieldMapper(String simpleName, MappedFieldType mappedFieldType, BuilderParams builderParams) { + private SparseVectorFieldMapper( + String simpleName, + MappedFieldType mappedFieldType, + BuilderParams builderParams, + @Nullable IndexOptions indexOptions + ) { super(simpleName, mappedFieldType, builderParams); + this.indexOptions = indexOptions; } @Override @@ -364,4 +500,118 @@ public void reset() { } } + public static class IndexOptions implements ToXContent { + public static final String 
PRUNE_FIELD_NAME = "prune"; + public static final String PRUNING_CONFIG_FIELD_NAME = "pruning_config"; + + final Boolean prune; + final PruningConfig pruningConfig; + + IndexOptions(@Nullable Boolean prune, @Nullable PruningConfig pruningConfig) { + this.prune = prune; + this.pruningConfig = pruningConfig; + } + + public Boolean getPrune() { + return prune; + } + + public PruningConfig getPruningConfig() { + return pruningConfig; + } + + @Override + public final boolean equals(Object other) { + if (other == this) { + return true; + } + if (other instanceof IndexOptions otherOptions) { + return Objects.equals(prune, otherOptions.prune) && Objects.equals(pruningConfig, otherOptions.pruningConfig); + } + return false; + } + + @Override + public final int hashCode() { + return Objects.hash(prune, pruningConfig); + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(); + if (prune != null) { + builder.field(PRUNE_FIELD_NAME, prune); + } + if (pruningConfig != null) { + builder.field(PRUNING_CONFIG_FIELD_NAME, pruningConfig); + } + builder.endObject(); + return builder; + } + } + + public static class PruningConfig implements ToXContent { + public static final String TOKENS_FREQ_RATIO_THRESHOLD_FIELD_NAME = "tokens_freq_ratio_threshold"; + public static final String TOKENS_WEIGHT_THRESHOLD_FIELD_NAME = "tokens_weight_threshold"; + + public static Integer DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD = 5; + public static Integer MIN_TOKENS_FREQ_RATIO_THRESHOLD = 1; + public static Integer MAX_TOKENS_FREQ_RATIO_THRESHOLD = 100; + + public static Double DEFAULT_TOKENS_WEIGHT_THRESHOLD = 0.4; + public static Double MIN_TOKENS_WEIGHT_THRESHOLD = 0.0; + public static Double MAX_TOKENS_WEIGHT_THRESHOLD = 1.0; + + final Integer tokens_freq_ratio_threshold; + final Double tokens_weight_threshold; + + PruningConfig(@Nullable Integer tokens_freq_ratio_threshold, @Nullable Double tokens_weight_threshold) { + 
this.tokens_freq_ratio_threshold = tokens_freq_ratio_threshold; + this.tokens_weight_threshold = tokens_weight_threshold; + } + + public int getTokensFreqRatioThresholdOrDefault() { + if (tokens_freq_ratio_threshold == null) { + return DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD; + } + return tokens_freq_ratio_threshold; + } + + public double getTokensWeightThresholdOrDefault() { + if (tokens_weight_threshold == null) { + return DEFAULT_TOKENS_WEIGHT_THRESHOLD; + } + return tokens_weight_threshold; + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(); + if (tokens_freq_ratio_threshold != null) { + builder.field(TOKENS_FREQ_RATIO_THRESHOLD_FIELD_NAME, tokens_freq_ratio_threshold); + } + if (tokens_weight_threshold != null) { + builder.field(TOKENS_WEIGHT_THRESHOLD_FIELD_NAME, tokens_weight_threshold); + } + builder.endObject(); + return builder; + } + + @Override + public final boolean equals(Object other) { + if (other == this) { + return true; + } + if (other instanceof PruningConfig otherConfig) { + return Objects.equals(tokens_freq_ratio_threshold, otherConfig.tokens_freq_ratio_threshold) + && Objects.equals(tokens_weight_threshold, otherConfig.tokens_weight_threshold); + } + return false; + } + + @Override + public final int hashCode() { + return Objects.hash(tokens_freq_ratio_threshold, tokens_weight_threshold); + } + } } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index b2379ba579204..b2a195eed1830 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -67,6 +67,23 @@ protected void minimalMapping(XContentBuilder b) throws IOException { b.field("type", "sparse_vector"); } + 
protected void mappingWithIndexOptionsPrune(XContentBuilder b) throws IOException { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + b.field("prune", true); + b.endObject(); + } + + protected void mappingWithIndexOptionsPruningConfig(XContentBuilder b) throws IOException { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + b.startObject("pruning_config"); + b.field("tokens_freq_ratio_threshold", 5); + b.field("tokens_weight_threshold", 0.4); + b.endObject(); + b.endObject(); + } + @Override protected boolean supportsStoredFields() { return false; @@ -120,6 +137,20 @@ public void testDefaults() throws Exception { assertTrue(freq1 < freq2); } + public void testWithIndexOptionsPrune() throws Exception { + DocumentMapper mapper = createDocumentMapper(fieldMapping(this::mappingWithIndexOptionsPrune)); + assertEquals(Strings.toString(fieldMapping(this::mappingWithIndexOptionsPrune)), mapper.mappingSource().toString()); + + // TODO -- finish + } + + public void testWithIndexOptionsPruningConfig() throws Exception { + DocumentMapper mapper = createDocumentMapper(fieldMapping(this::mappingWithIndexOptionsPruningConfig)); + assertEquals(Strings.toString(fieldMapping(this::mappingWithIndexOptionsPruningConfig)), mapper.mappingSource().toString()); + + // TODO -- finish + } + public void testDotInFieldName() throws Exception { DocumentMapper mapper = createDocumentMapper(fieldMapping(this::minimalMapping)); ParsedDocument parsedDocument = mapper.parse(source(b -> b.field("field", Map.of("foo.bar", 10, "foobar", 20)))); diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java index 35cba890e5e0c..5e79149eea6b1 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java +++ 
b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java @@ -18,6 +18,10 @@ import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.core.Nullable; import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.Mapper; +import org.elasticsearch.index.mapper.Mapping; +import org.elasticsearch.index.mapper.RootObjectMapper; +import org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper; import org.elasticsearch.index.query.AbstractQueryBuilder; import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.index.query.QueryRewriteContext; @@ -61,7 +65,7 @@ public class SparseVectorQueryBuilder extends AbstractQueryBuilder queryVectors; private final String inferenceId; private final String query; - private final boolean shouldPruneTokens; + private final Boolean shouldPruneTokens; private final SetOnce weightedTokensSupplier; @@ -81,13 +85,11 @@ public SparseVectorQueryBuilder( @Nullable TokenPruningConfig tokenPruningConfig ) { this.fieldName = Objects.requireNonNull(fieldName, "[" + NAME + "] requires a [" + FIELD_FIELD.getPreferredName() + "]"); - this.shouldPruneTokens = (shouldPruneTokens != null ? shouldPruneTokens : DEFAULT_PRUNE); + this.shouldPruneTokens = shouldPruneTokens; this.queryVectors = queryVectors; this.inferenceId = inferenceId; this.query = query; - this.tokenPruningConfig = (tokenPruningConfig != null - ? tokenPruningConfig - : (this.shouldPruneTokens ? 
new TokenPruningConfig() : null)); + this.tokenPruningConfig = tokenPruningConfig; this.weightedTokensSupplier = null; // Preserve BWC error messaging @@ -124,7 +126,7 @@ public SparseVectorQueryBuilder( public SparseVectorQueryBuilder(StreamInput in) throws IOException { super(in); this.fieldName = in.readString(); - this.shouldPruneTokens = in.readBoolean(); + this.shouldPruneTokens = in.readOptionalBoolean(); this.queryVectors = in.readOptionalCollectionAsList(WeightedToken::new); this.inferenceId = in.readOptionalString(); this.query = in.readOptionalString(); @@ -159,7 +161,7 @@ public String getQuery() { } public boolean shouldPruneTokens() { - return shouldPruneTokens; + return shouldPruneTokens != null ? shouldPruneTokens : DEFAULT_PRUNE; } public TokenPruningConfig getTokenPruningConfig() { @@ -173,7 +175,7 @@ protected void doWriteTo(StreamOutput out) throws IOException { } out.writeString(fieldName); - out.writeBoolean(shouldPruneTokens); + out.writeOptionalBoolean(shouldPruneTokens); out.writeOptionalCollection(queryVectors); out.writeOptionalString(inferenceId); out.writeOptionalString(query); @@ -196,7 +198,9 @@ protected void doXContent(XContentBuilder builder, Params params) throws IOExcep } builder.field(QUERY_FIELD.getPreferredName(), query); } - builder.field(PRUNE_FIELD.getPreferredName(), shouldPruneTokens); + if (shouldPruneTokens != null) { + builder.field(PRUNE_FIELD.getPreferredName(), shouldPruneTokens); + } if (tokenPruningConfig != null) { builder.field(PRUNING_CONFIG_FIELD.getPreferredName(), tokenPruningConfig); } @@ -222,8 +226,27 @@ protected Query doToQuery(SearchExecutionContext context) throws IOException { ); } - return (shouldPruneTokens) - ? WeightedTokensUtils.queryBuilderWithPrunedTokens(fieldName, tokenPruningConfig, queryVectors, ft, context) + // if the query options for pruning are not set, + // we need to check the index options for this field + // and use those if set. 
+ boolean doPruneTokens = false; + TokenPruningConfig setTokenPruningConfig = tokenPruningConfig; + + if (shouldPruneTokens == null || setTokenPruningConfig == null) { + SparseVectorFieldMapper sparseVectorFieldMapper = getSparseVectorFieldMapperForSearchExecution(fieldName, context); + IndexFieldPruningSettings indexPruningSettings = getIndexFieldPruningSettings(sparseVectorFieldMapper); + if (shouldPruneTokens == null && indexPruningSettings.prune != null && indexPruningSettings.prune) { + doPruneTokens = true; + } + if (setTokenPruningConfig == null && indexPruningSettings.pruningConfig != null) { + setTokenPruningConfig = indexPruningSettings.pruningConfig; + } + } else { + doPruneTokens = shouldPruneTokens; + } + + return (doPruneTokens) + ? WeightedTokensUtils.queryBuilderWithPrunedTokens(fieldName, setTokenPruningConfig, queryVectors, ft, context) : WeightedTokensUtils.queryBuilderWithAllTokens(fieldName, queryVectors, ft, context); } @@ -231,19 +254,40 @@ protected Query doToQuery(SearchExecutionContext context) throws IOException { protected QueryBuilder doRewrite(QueryRewriteContext queryRewriteContext) { if (queryVectors != null) { return this; - } else if (weightedTokensSupplier != null) { + } + + if (weightedTokensSupplier != null) { TextExpansionResults textExpansionResults = weightedTokensSupplier.get(); if (textExpansionResults == null) { return this; // No results yet } + // if the query options for pruning are not set, + // we need to check the index options for this field + // and use those if set. 
+ boolean doPruneTokens = false; + TokenPruningConfig setTokenPruningConfig = tokenPruningConfig; + + if (shouldPruneTokens == null || setTokenPruningConfig == null) { + SparseVectorFieldMapper sparseVectorFieldMapper = getSparseVectorFieldMapperForQueryRewrite(fieldName, queryRewriteContext); + IndexFieldPruningSettings indexPruningSettings = getIndexFieldPruningSettings(sparseVectorFieldMapper); + if (shouldPruneTokens == null && indexPruningSettings.prune != null && indexPruningSettings.prune) { + doPruneTokens = true; + } + if (setTokenPruningConfig == null && indexPruningSettings.pruningConfig != null) { + setTokenPruningConfig = indexPruningSettings.pruningConfig; + } + } else { + doPruneTokens = shouldPruneTokens; + } + return new SparseVectorQueryBuilder( fieldName, textExpansionResults.getWeightedTokens(), null, null, - shouldPruneTokens, - tokenPruningConfig + doPruneTokens, + setTokenPruningConfig ); } else if (inferenceId == null) { // Edge case, where inference_id was not specified in the request, @@ -378,4 +422,48 @@ public static SparseVectorQueryBuilder fromXContent(XContentParser parser) { throw new ParsingException(parser.getTokenLocation(), e.getMessage(), e); } } + + private record IndexFieldPruningSettings(@Nullable Boolean prune, @Nullable TokenPruningConfig pruningConfig) {} + + private IndexFieldPruningSettings getIndexFieldPruningSettings(SparseVectorFieldMapper sparseVectorFieldMapper) { + if (sparseVectorFieldMapper == null) { + return new IndexFieldPruningSettings(null, null); + } + + SparseVectorFieldMapper.IndexOptions indexOptions = sparseVectorFieldMapper.getIndexOptions(); + if (indexOptions == null) { + return new IndexFieldPruningSettings(null, null); + } + + Boolean indexOptionsPrune = indexOptions.getPrune(); + + SparseVectorFieldMapper.PruningConfig indexPruningConfig = indexOptions.getPruningConfig(); + TokenPruningConfig indexTokenPruningConfig = indexPruningConfig != null + ? 
new TokenPruningConfig( + indexPruningConfig.getTokensFreqRatioThresholdOrDefault(), + (float) indexPruningConfig.getTokensWeightThresholdOrDefault(), + false + ) + : null; + + return new IndexFieldPruningSettings(indexOptionsPrune, indexTokenPruningConfig); + } + + private SparseVectorFieldMapper getSparseVectorFieldMapper(String fieldName, Mapping fieldMapping) { + RootObjectMapper rootMapping = fieldMapping.getRoot(); + Mapper thisMapper = rootMapping.getMapper(fieldName); + if (thisMapper instanceof SparseVectorFieldMapper) { + return (SparseVectorFieldMapper) thisMapper; + } + + return null; + } + + private SparseVectorFieldMapper getSparseVectorFieldMapperForSearchExecution(String fieldName, SearchExecutionContext context) { + return getSparseVectorFieldMapper(fieldName, context.getMappingLookup().getMapping()); + } + + private SparseVectorFieldMapper getSparseVectorFieldMapperForQueryRewrite(String fieldName, QueryRewriteContext context) { + return getSparseVectorFieldMapper(fieldName, context.getMappingLookup().getMapping()); + } } diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java index af557ed6b7f82..4f1e397f0fb10 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java @@ -46,6 +46,7 @@ import java.util.ArrayList; import java.util.Collection; import java.util.List; +import java.util.function.Consumer; import static org.elasticsearch.xpack.core.ml.search.SparseVectorQueryBuilder.QUERY_VECTOR_FIELD; import static org.hamcrest.CoreMatchers.instanceOf; @@ -58,6 +59,8 @@ public class SparseVectorQueryBuilderTests extends AbstractQueryTestCase WEIGHTED_TOKENS = List.of(new WeightedToken("foo", .42f)); private 
static final int NUM_TOKENS = WEIGHTED_TOKENS.size(); + private boolean testWithSparseVectorFieldIndexOptions = false; + @Override protected SparseVectorQueryBuilder doCreateTestQueryBuilder() { TokenPruningConfig tokenPruningConfig = randomBoolean() @@ -144,6 +147,11 @@ protected Object simulateMethod(Method method, Object[] args) { @Override protected void initializeAdditionalMappings(MapperService mapperService) throws IOException { + if (testWithSparseVectorFieldIndexOptions) { + addSparseVectorIndexOptionsMapping(mapperService); + return; + } + mapperService.merge( "_doc", new CompressedXContent(Strings.toString(PutMappingRequest.simpleMapping(SPARSE_VECTOR_FIELD, "type=sparse_vector"))), @@ -151,6 +159,13 @@ protected void initializeAdditionalMappings(MapperService mapperService) throws ); } + private void addSparseVectorIndexOptionsMapping(MapperService mapperService) throws IOException { + String addIndexOptionsTemplate = "{\"properties\":{\"" + + SPARSE_VECTOR_FIELD + + "\":{\"type\":\"sparse_vector\",\"index_options\":{\"prune\":true,\"pruning_config\":{\"tokens_freq_ratio_threshold\":12,\"tokens_weight_threshold\":0.6}}}}}"; + mapperService.merge("_doc", new CompressedXContent(addIndexOptionsTemplate), MapperService.MergeReason.MAPPING_UPDATE); + } + @Override protected void doAssertLuceneQuery(SparseVectorQueryBuilder queryBuilder, Query query, SearchExecutionContext context) { assertThat(query, instanceOf(SparseVectorQueryWrapper.class)); @@ -338,4 +353,27 @@ public void testThatWeCorrectlyRewriteQueryIntoVectors() { assertEquals(queryBuilder.shouldPruneTokens(), ((SparseVectorQueryBuilder) rewrittenQueryBuilder).shouldPruneTokens()); assertNotNull(((SparseVectorQueryBuilder) rewrittenQueryBuilder).getQueryVectors()); } + + public void testWeCorrectlyRewriteQueryIntoVectorsWithIndexOptions() { + wrapTestSparseVectorIndexOptions((c) -> { + SearchExecutionContext searchExecutionContext = createSearchExecutionContext(); + + TokenPruningConfig 
TokenPruningConfig = randomBoolean() ? new TokenPruningConfig(2, 0.3f, false) : null; + + SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(TokenPruningConfig); + QueryBuilder rewrittenQueryBuilder = rewriteAndFetch(queryBuilder, searchExecutionContext); + assertTrue(rewrittenQueryBuilder instanceof SparseVectorQueryBuilder); + assertEquals(queryBuilder.shouldPruneTokens(), ((SparseVectorQueryBuilder) rewrittenQueryBuilder).shouldPruneTokens()); + assertNotNull(((SparseVectorQueryBuilder) rewrittenQueryBuilder).getQueryVectors()); + }); + } + + private void wrapTestSparseVectorIndexOptions(Consumer testMethod) { + testWithSparseVectorFieldIndexOptions = true; + try { + testMethod.accept(true); + } finally { + testWithSparseVectorFieldIndexOptions = false; + } + } } From e24ab76e6efd84f43f51e186aa678f0d46123449 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Fri, 11 Apr 2025 21:34:52 -0400 Subject: [PATCH 02/86] Missing s in IndexVersions --- server/src/main/java/org/elasticsearch/index/IndexVersions.java | 1 + .../index/mapper/vectors/SparseVectorFieldMapper.java | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/index/IndexVersions.java b/server/src/main/java/org/elasticsearch/index/IndexVersions.java index 809156ee57327..b3d2f0bf00000 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexVersions.java +++ b/server/src/main/java/org/elasticsearch/index/IndexVersions.java @@ -160,6 +160,7 @@ private static Version parseUnchecked(String version) { public static final IndexVersion SYNTHETIC_SOURCE_STORE_ARRAYS_NATIVELY_SCALED_FLOAT = def(9_020_0_00, Version.LUCENE_10_1_0); public static final IndexVersion USE_LUCENE101_POSTINGS_FORMAT = def(9_021_0_00, Version.LUCENE_10_1_0); public static final IndexVersion SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT = def(9_022_0_00, Version.LUCENE_10_1_0); + /* * STOP! READ THIS FIRST! 
No, really, * ____ _____ ___ ____ _ ____ _____ _ ____ _____ _ _ ___ ____ _____ ___ ____ ____ _____ _ diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index c5be519ee3740..97d3bf6c6e726 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -70,7 +70,7 @@ public class SparseVectorFieldMapper extends FieldMapper { static final IndexVersion NEW_SPARSE_VECTOR_INDEX_VERSION = IndexVersions.NEW_SPARSE_VECTOR; static final IndexVersion SPARSE_VECTOR_IN_FIELD_NAMES_INDEX_VERSION = IndexVersions.SPARSE_VECTOR_IN_FIELD_NAMES_SUPPORT; - static final IndexVersion SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION = IndexVersion.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT; + static final IndexVersion SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION = IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT; private final SparseVectorFieldMapper.IndexOptions indexOptions; From f39b78aa9a4cf506c9304297f8c933269c19f20e Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Mon, 14 Apr 2025 20:49:55 -0400 Subject: [PATCH 03/86] add changelog and docs for index_options --- docs/changelog/126739.yaml | 5 +++ .../mapping-reference/sparse-vector.md | 31 ++++++++++++++++++- 2 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 docs/changelog/126739.yaml diff --git a/docs/changelog/126739.yaml b/docs/changelog/126739.yaml new file mode 100644 index 0000000000000..2a2d4439c3bde --- /dev/null +++ b/docs/changelog/126739.yaml @@ -0,0 +1,5 @@ +pr: 126739 +summary: Add pruning index options to sparse vector field +area: Inference +type: enhancement +issues: [] diff --git a/docs/reference/elasticsearch/mapping-reference/sparse-vector.md b/docs/reference/elasticsearch/mapping-reference/sparse-vector.md index f8bfe69310600..5864de8a0871a 100644 --- a/docs/reference/elasticsearch/mapping-reference/sparse-vector.md +++ b/docs/reference/elasticsearch/mapping-reference/sparse-vector.md @@ -17,7 +17,14 @@ PUT my-index "mappings": { "properties": { "text.tokens": { - "type": "sparse_vector" + "type": "sparse_vector", + "index_options": { + "prune": true, + "pruning_config": { + "tokens_freq_ratio_threshold": 5, + "tokens_weight_threshold": 0.4 + } + } } } } @@ -36,6 +43,28 @@ The following parameters are accepted by `sparse_vector` fields: * Exclude the field from [_source](/reference/elasticsearch/rest-apis/retrieve-selected-fields.md#source-filtering). * Use [synthetic `_source`](/reference/elasticsearch/mapping-reference/mapping-source-field.md#synthetic-source). +[index_options](...) +: (Optional, object) You can set index options for your `sparse_vector` field to determine if you should prune tokens, and the parameter configurations for the token pruning. If the pruning options are not set in your `sparse_vector` query on the field, Elasticsearch will use the values set here for the field, if any. 
The available options for the index options are: + +Parameters for `index_options` are: + +`prune` +: (Optional, boolean) [preview] Whether to perform pruning, omitting the non-significant tokens from the query to improve query performance. If `prune` is true but the `pruning_config` is not specified, pruning will occur but default values will be used. Default: false. + +`pruning_config` +: (Optional, object) [preview] Optional pruning configuration. If enabled, this will omit non-significant tokens from the query in order to improve query performance. This is only used if `prune` is set to `true`. If `prune` is set to `true` but `pruning_config` is not specified, default values will be used. + + Parameters for `pruning_config` include: + + `tokens_freq_ratio_threshold` + : (Optional, integer) [preview] Tokens whose frequency is more than `tokens_freq_ratio_threshold` times the average frequency of all tokens in the specified field are considered outliers and pruned. This value must be between 1 and 100. Default: `5`. + + `tokens_weight_threshold` + : (Optional, float) [preview] Tokens whose weight is less than `tokens_weight_threshold` are considered insignificant and pruned. This value must be between 0 and 1. Default: `0.4`. + + ::::{note} + The default values for `tokens_freq_ratio_threshold` and `tokens_weight_threshold` were chosen based on tests using ELSERv2 that provided the most optimal results. + :::: ## Multi-value sparse vectors [index-multi-value-sparse-vectors] From 983ddf108b5737bf051eb81ea37e190c114c4f85 Mon Sep 17 00:00:00 2001 From: "Mark J.
Hoy" Date: Mon, 21 Apr 2025 10:05:55 -0400 Subject: [PATCH 04/86] correct index version --- server/src/main/java/org/elasticsearch/index/IndexVersions.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/index/IndexVersions.java b/server/src/main/java/org/elasticsearch/index/IndexVersions.java index 2d5f2bd2b5b0c..9ec3be9e1770b 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexVersions.java +++ b/server/src/main/java/org/elasticsearch/index/IndexVersions.java @@ -160,7 +160,7 @@ private static Version parseUnchecked(String version) { public static final IndexVersion SYNTHETIC_SOURCE_STORE_ARRAYS_NATIVELY_SCALED_FLOAT = def(9_020_0_00, Version.LUCENE_10_1_0); public static final IndexVersion USE_LUCENE101_POSTINGS_FORMAT = def(9_021_0_00, Version.LUCENE_10_1_0); public static final IndexVersion UPGRADE_TO_LUCENE_10_2_0 = def(9_022_00_0, Version.LUCENE_10_2_0); - public static final IndexVersion SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT = def(9_023_0_00, Version.LUCENE_10_1_0); + public static final IndexVersion SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT = def(9_023_0_00, Version.LUCENE_10_2_0); /* * STOP! READ THIS FIRST! No, really, * ____ _____ ___ ____ _ ____ _____ _ ____ _____ _ _ ___ ____ _____ ___ ____ ____ _____ _ From 9545a0c5f1ade4aec2212398d85e833f332494ef Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Mon, 21 Apr 2025 13:52:47 -0400 Subject: [PATCH 05/86] update tests --- .../vectors/SparseVectorFieldMapperTests.java | 42 ++++++++++++++++++- .../search/SparseVectorQueryBuilderTests.java | 4 +- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index b2a195eed1830..8a5a7c08d7076 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -141,14 +141,52 @@ public void testWithIndexOptionsPrune() throws Exception { DocumentMapper mapper = createDocumentMapper(fieldMapping(this::mappingWithIndexOptionsPrune)); assertEquals(Strings.toString(fieldMapping(this::mappingWithIndexOptionsPrune)), mapper.mappingSource().toString()); - // TODO -- finish + ParsedDocument doc1 = mapper.parse(source(this::writeField)); + + List fields = doc1.rootDoc().getFields("field"); + assertEquals(2, fields.size()); + assertThat(fields.get(0), Matchers.instanceOf(XFeatureField.class)); + XFeatureField featureField1 = null; + XFeatureField featureField2 = null; + for (IndexableField field : fields) { + if (field.stringValue().equals("ten")) { + featureField1 = (XFeatureField) field; + } else if (field.stringValue().equals("twenty")) { + featureField2 = (XFeatureField) field; + } else { + throw new UnsupportedOperationException(); + } + } + + int freq1 = getFrequency(featureField1.tokenStream(null, null)); + int freq2 = getFrequency(featureField2.tokenStream(null, null)); + assertTrue(freq1 < freq2); } public void testWithIndexOptionsPruningConfig() throws Exception { DocumentMapper mapper = createDocumentMapper(fieldMapping(this::mappingWithIndexOptionsPruningConfig)); 
assertEquals(Strings.toString(fieldMapping(this::mappingWithIndexOptionsPruningConfig)), mapper.mappingSource().toString()); - // TODO -- finish + ParsedDocument doc1 = mapper.parse(source(this::writeField)); + + List fields = doc1.rootDoc().getFields("field"); + assertEquals(2, fields.size()); + assertThat(fields.get(0), Matchers.instanceOf(XFeatureField.class)); + XFeatureField featureField1 = null; + XFeatureField featureField2 = null; + for (IndexableField field : fields) { + if (field.stringValue().equals("ten")) { + featureField1 = (XFeatureField) field; + } else if (field.stringValue().equals("twenty")) { + featureField2 = (XFeatureField) field; + } else { + throw new UnsupportedOperationException(); + } + } + + int freq1 = getFrequency(featureField1.tokenStream(null, null)); + int freq2 = getFrequency(featureField2.tokenStream(null, null)); + assertTrue(freq1 < freq2); } public void testDotInFieldName() throws Exception { diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java index 4f1e397f0fb10..2811a974bbfff 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java @@ -358,9 +358,9 @@ public void testWeCorrectlyRewriteQueryIntoVectorsWithIndexOptions() { wrapTestSparseVectorIndexOptions((c) -> { SearchExecutionContext searchExecutionContext = createSearchExecutionContext(); - TokenPruningConfig TokenPruningConfig = randomBoolean() ? 
new TokenPruningConfig(2, 0.3f, false) : null; + TokenPruningConfig defaultTokenPruningConfig = new TokenPruningConfig(12, 0.6f, false); - SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(TokenPruningConfig); + SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(null); QueryBuilder rewrittenQueryBuilder = rewriteAndFetch(queryBuilder, searchExecutionContext); assertTrue(rewrittenQueryBuilder instanceof SparseVectorQueryBuilder); assertEquals(queryBuilder.shouldPruneTokens(), ((SparseVectorQueryBuilder) rewrittenQueryBuilder).shouldPruneTokens()); From 19fe72dea34f9f81d3e4d99bdb5e6b0588e03bd2 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Tue, 22 Apr 2025 10:21:59 -0400 Subject: [PATCH 06/86] Complete tests for SparseVectorFieldMapper --- .../vectors/SparseVectorFieldMapper.java | 47 +++++---- .../vectors/SparseVectorFieldMapperTests.java | 97 ++++++++++++++++++- 2 files changed, 125 insertions(+), 19 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index 97d3bf6c6e726..990acd96b56a1 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -89,7 +89,7 @@ public Builder(String name) { "index_options", true, () -> null, - (n, c, o) -> o == null ? null : parseIndexOptions(n, c, o), + (n, c, o) -> o == null ? 
getDefaultIndexOptions(c) : parseIndexOptions(c, o), m -> toType(m).fieldType().indexOptions, (b, n, v) -> { if (v != null) { @@ -112,11 +112,19 @@ protected Parameter[] getParameters() { @Override public SparseVectorFieldMapper build(MapperBuilderContext context) { + IndexOptions buildIndexOptions = indexOptions.getValue(); + + if (buildIndexOptions == null) { + buildIndexOptions = new IndexOptions(true, + new PruningConfig(PruningConfig.DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD, PruningConfig.DEFAULT_TOKENS_WEIGHT_THRESHOLD) + ); + } + return new SparseVectorFieldMapper( leafName(), - new SparseVectorFieldType(context.buildFullName(leafName()), stored.getValue(), meta.getValue(), indexOptions.getValue()), + new SparseVectorFieldType(context.buildFullName(leafName()), stored.getValue(), meta.getValue(), buildIndexOptions), builderParams(this, context), - indexOptions.getValue() + buildIndexOptions ); } } @@ -125,7 +133,23 @@ public IndexOptions getIndexOptions() { return this.indexOptions; } - private static SparseVectorFieldMapper.IndexOptions parseIndexOptions(String fieldName, MappingParserContext context, Object propNode) { + private static SparseVectorFieldMapper.IndexOptions getDefaultIndexOptions(MappingParserContext context) { + if (context.indexVersionCreated().before(SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION)) { + // don't set defaults if this index was created before + // we added this functionality in, so it will + // not change current index behaviour + return null; + } + + // index options are not set - for new indices, we + // need to set pruning to true by default + // with a default pruning configuration + return new IndexOptions(true, + new PruningConfig(PruningConfig.DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD, PruningConfig.DEFAULT_TOKENS_WEIGHT_THRESHOLD) + ); + } + + private static SparseVectorFieldMapper.IndexOptions parseIndexOptions(MappingParserContext context, Object propNode) { @SuppressWarnings("unchecked") Map indexOptionsMap = (Map) propNode; @@ 
-197,20 +221,7 @@ private static SparseVectorFieldMapper.IndexOptions parseIndexOptions(String fie } if (hasOneOption == false) { - if (context.indexVersionCreated().before(SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION)) { - // don't set defaults if this index was created before - // we added this functionality in, so it will - // not change current index behaviour - return null; - } - - // index options are not set - for new indices, we - // need to set pruning to true by default - // with a default pruning configuration - return new IndexOptions( - true, - new PruningConfig(PruningConfig.DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD, PruningConfig.DEFAULT_TOKENS_WEIGHT_THRESHOLD) - ); + return getDefaultIndexOptions(context); } return new SparseVectorFieldMapper.IndexOptions(prune, pruningConfig); diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index 8a5a7c08d7076..4862fc91b258c 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -67,6 +67,17 @@ protected void minimalMapping(XContentBuilder b) throws IOException { b.field("type", "sparse_vector"); } + protected void mappingWithDefaultIndexOptions(XContentBuilder b) throws IOException { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + b.field("prune", true); + b.startObject("pruning_config"); + b.field("tokens_freq_ratio_threshold", SparseVectorFieldMapper.PruningConfig.DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD); + b.field("tokens_weight_threshold", SparseVectorFieldMapper.PruningConfig.DEFAULT_TOKENS_WEIGHT_THRESHOLD); + b.endObject(); + b.endObject(); + } + protected void mappingWithIndexOptionsPrune(XContentBuilder b) throws IOException { b.field("type", "sparse_vector"); 
b.startObject("index_options"); @@ -113,7 +124,7 @@ private static int getFrequency(TokenStream tk) throws IOException { public void testDefaults() throws Exception { DocumentMapper mapper = createDocumentMapper(fieldMapping(this::minimalMapping)); - assertEquals(Strings.toString(fieldMapping(this::minimalMapping)), mapper.mappingSource().toString()); + assertEquals(Strings.toString(fieldMapping(this::mappingWithDefaultIndexOptions)), mapper.mappingSource().toString()); ParsedDocument doc1 = mapper.parse(source(this::writeField)); @@ -271,6 +282,90 @@ public void testCannotBeUsedInMultiFields() { assertThat(e.getMessage(), containsString("Field [feature] of type [sparse_vector] can't be used in multifields")); } + public void testPruneMustBeBoolean() { + Exception e = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + b.field("prune", "othervalue"); + b.endObject(); + }))); + assertThat(e.getMessage(), containsString("[index_options] field [prune] should be true or false")); + } + + public void testPruningConfigurationIsMap() { + Exception e = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + b.field("pruning_config", "this_is_not_a_map"); + b.endObject(); + }))); + assertThat(e.getMessage(), containsString("index_options] field [pruning_config] should be a map")); + } + + public void testTokensFreqRatioCorrect() { + Exception eTestInteger = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + b.startObject("pruning_config"); + b.field("tokens_freq_ratio_threshold", "notaninteger"); + b.endObject(); + b.endObject(); + }))); + assertThat(eTestInteger.getMessage(), containsString("[pruning_config] field [tokens_freq_ratio_threshold] field 
should be an integer between 1 and 100")); + + Exception eTestRangeLower = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + b.startObject("pruning_config"); + b.field("tokens_freq_ratio_threshold", -2); + b.endObject(); + b.endObject(); + }))); + assertThat(eTestRangeLower.getMessage(), containsString("[pruning_config] field [tokens_freq_ratio_threshold] field should be an integer between 1 and 100")); + + Exception eTestRangeHigher = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + b.startObject("pruning_config"); + b.field("tokens_freq_ratio_threshold", 101); + b.endObject(); + b.endObject(); + }))); + assertThat(eTestRangeHigher.getMessage(), containsString("[pruning_config] field [tokens_freq_ratio_threshold] field should be an integer between 1 and 100")); + } + + public void testTokensWeightThresholdCorrect() { + Exception eTestDouble = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + b.startObject("pruning_config"); + b.field("tokens_weight_threshold", "notadouble"); + b.endObject(); + b.endObject(); + }))); + assertThat(eTestDouble.getMessage(), containsString("[pruning_config] field [tokens_weight_threshold] field should be an number between 0.0 and 1.0")); + + Exception eTestRangeLower = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + b.startObject("pruning_config"); + b.field("tokens_weight_threshold", -0.1); + b.endObject(); + b.endObject(); + }))); + assertThat(eTestRangeLower.getMessage(), containsString("[pruning_config] field [tokens_weight_threshold] field should be an number between 0.0 and 1.0")); + + 
Exception eTestRangeHigher = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + b.startObject("pruning_config"); + b.field("tokens_weight_threshold", 1.1); + b.endObject(); + b.endObject(); + }))); + assertThat(eTestRangeHigher.getMessage(), containsString("[pruning_config] field [tokens_weight_threshold] field should be an number between 0.0 and 1.0")); + } + public void testStoreIsNotUpdateable() throws IOException { var mapperService = createMapperService(fieldMapping(this::minimalMapping)); XContentBuilder mapping = jsonBuilder().startObject() From 58f990930647e62b797ad758ff98080839b15b7d Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Tue, 22 Apr 2025 14:32:15 +0000 Subject: [PATCH 07/86] [CI] Auto commit changes from spotless --- .../vectors/SparseVectorFieldMapper.java | 6 ++-- .../vectors/SparseVectorFieldMapperTests.java | 30 +++++++++++++++---- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index 990acd96b56a1..6aa9850cfec57 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -115,7 +115,8 @@ public SparseVectorFieldMapper build(MapperBuilderContext context) { IndexOptions buildIndexOptions = indexOptions.getValue(); if (buildIndexOptions == null) { - buildIndexOptions = new IndexOptions(true, + buildIndexOptions = new IndexOptions( + true, new PruningConfig(PruningConfig.DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD, PruningConfig.DEFAULT_TOKENS_WEIGHT_THRESHOLD) ); } @@ -144,7 +145,8 @@ private static SparseVectorFieldMapper.IndexOptions getDefaultIndexOptions(Mappi // index options are not set - for new 
indices, we // need to set pruning to true by default // with a default pruning configuration - return new IndexOptions(true, + return new IndexOptions( + true, new PruningConfig(PruningConfig.DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD, PruningConfig.DEFAULT_TOKENS_WEIGHT_THRESHOLD) ); } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index 4862fc91b258c..2df763de4841c 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -311,7 +311,10 @@ public void testTokensFreqRatioCorrect() { b.endObject(); b.endObject(); }))); - assertThat(eTestInteger.getMessage(), containsString("[pruning_config] field [tokens_freq_ratio_threshold] field should be an integer between 1 and 100")); + assertThat( + eTestInteger.getMessage(), + containsString("[pruning_config] field [tokens_freq_ratio_threshold] field should be an integer between 1 and 100") + ); Exception eTestRangeLower = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { b.field("type", "sparse_vector"); @@ -321,7 +324,10 @@ public void testTokensFreqRatioCorrect() { b.endObject(); b.endObject(); }))); - assertThat(eTestRangeLower.getMessage(), containsString("[pruning_config] field [tokens_freq_ratio_threshold] field should be an integer between 1 and 100")); + assertThat( + eTestRangeLower.getMessage(), + containsString("[pruning_config] field [tokens_freq_ratio_threshold] field should be an integer between 1 and 100") + ); Exception eTestRangeHigher = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { b.field("type", "sparse_vector"); @@ -331,7 +337,10 @@ public void testTokensFreqRatioCorrect() { b.endObject(); b.endObject(); }))); - 
assertThat(eTestRangeHigher.getMessage(), containsString("[pruning_config] field [tokens_freq_ratio_threshold] field should be an integer between 1 and 100")); + assertThat( + eTestRangeHigher.getMessage(), + containsString("[pruning_config] field [tokens_freq_ratio_threshold] field should be an integer between 1 and 100") + ); } public void testTokensWeightThresholdCorrect() { @@ -343,7 +352,10 @@ public void testTokensWeightThresholdCorrect() { b.endObject(); b.endObject(); }))); - assertThat(eTestDouble.getMessage(), containsString("[pruning_config] field [tokens_weight_threshold] field should be an number between 0.0 and 1.0")); + assertThat( + eTestDouble.getMessage(), + containsString("[pruning_config] field [tokens_weight_threshold] field should be an number between 0.0 and 1.0") + ); Exception eTestRangeLower = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { b.field("type", "sparse_vector"); @@ -353,7 +365,10 @@ public void testTokensWeightThresholdCorrect() { b.endObject(); b.endObject(); }))); - assertThat(eTestRangeLower.getMessage(), containsString("[pruning_config] field [tokens_weight_threshold] field should be an number between 0.0 and 1.0")); + assertThat( + eTestRangeLower.getMessage(), + containsString("[pruning_config] field [tokens_weight_threshold] field should be an number between 0.0 and 1.0") + ); Exception eTestRangeHigher = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { b.field("type", "sparse_vector"); @@ -363,7 +378,10 @@ public void testTokensWeightThresholdCorrect() { b.endObject(); b.endObject(); }))); - assertThat(eTestRangeHigher.getMessage(), containsString("[pruning_config] field [tokens_weight_threshold] field should be an number between 0.0 and 1.0")); + assertThat( + eTestRangeHigher.getMessage(), + containsString("[pruning_config] field [tokens_weight_threshold] field should be an number between 0.0 and 1.0") + ); } public void 
testStoreIsNotUpdateable() throws IOException { From 96096babf4adf43dba5ef35e9df927489b153873 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Fri, 25 Apr 2025 15:01:17 -0400 Subject: [PATCH 08/86] fix docs --- docs/reference/elasticsearch/mapping-reference/sparse-vector.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/elasticsearch/mapping-reference/sparse-vector.md b/docs/reference/elasticsearch/mapping-reference/sparse-vector.md index 5864de8a0871a..825f89487a752 100644 --- a/docs/reference/elasticsearch/mapping-reference/sparse-vector.md +++ b/docs/reference/elasticsearch/mapping-reference/sparse-vector.md @@ -43,7 +43,7 @@ The following parameters are accepted by `sparse_vector` fields: * Exclude the field from [_source](/reference/elasticsearch/rest-apis/retrieve-selected-fields.md#source-filtering). * Use [synthetic `_source`](/reference/elasticsearch/mapping-reference/mapping-source-field.md#synthetic-source). -[index_options](...) +index_options : (Optional, object) You can set index options for your `sparse_vector` field to determine if you should prune tokens, and the parameter configurations for the token pruning. If the pruning options are not set for your `sparse_query` vector on the field, Elasticsearch will use the defaults if set here for the field. The available options for the index options are: Parameters for `index_options` are: From 6a6052ae0dccccbb466a2b48d7ec45b8f8f0e05e Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Fri, 25 Apr 2025 17:02:50 -0400 Subject: [PATCH 09/86] fix lint --- .../xpack/core/ml/search/SparseVectorQueryBuilderTests.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java index 2811a974bbfff..b8fb33feb7bf4 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java @@ -162,7 +162,9 @@ protected void initializeAdditionalMappings(MapperService mapperService) throws private void addSparseVectorIndexOptionsMapping(MapperService mapperService) throws IOException { String addIndexOptionsTemplate = "{\"properties\":{\"" + SPARSE_VECTOR_FIELD - + "\":{\"type\":\"sparse_vector\",\"index_options\":{\"prune\":true,\"pruning_config\":{\"tokens_freq_ratio_threshold\":12,\"tokens_weight_threshold\":0.6}}}}}"; + + "\":{\"type\":\"sparse_vector\",\"index_options\"" + + ":{\"prune\":true,\"pruning_config\":{\"tokens_freq_ratio_threshold\"" + + ":12,\"tokens_weight_threshold\":0.6}}}}}"; mapperService.merge("_doc", new CompressedXContent(addIndexOptionsTemplate), MapperService.MergeReason.MAPPING_UPDATE); } From 501099d7b17f031384a7ddcfdfc8727490b74c59 Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Mon, 28 Apr 2025 19:49:38 -0400 Subject: [PATCH 10/86] cleanups + refactoring; fix tests; refine docs; --- .../mapping-reference/sparse-vector.md | 16 +- .../org/elasticsearch/TransportVersions.java | 1 + .../vectors/SparseVectorFieldMapper.java | 210 +++++++++++------- .../vectors/SparseVectorFieldMapperTests.java | 14 +- .../ml/search/SparseVectorQueryBuilder.java | 76 ++++--- 5 files changed, 190 insertions(+), 127 deletions(-) diff --git a/docs/reference/elasticsearch/mapping-reference/sparse-vector.md b/docs/reference/elasticsearch/mapping-reference/sparse-vector.md index 825f89487a752..465bf55efeb6f 100644 --- a/docs/reference/elasticsearch/mapping-reference/sparse-vector.md +++ b/docs/reference/elasticsearch/mapping-reference/sparse-vector.md @@ -11,6 +11,20 @@ A `sparse_vector` field can index features and weights so that they can later be `sparse_vector` is the field type that should be used with [ELSER mappings](docs-content://solutions/search/semantic-search/semantic-search-elser-ingest-pipelines.md#elser-mappings). +```console +PUT my-index +{ + "mappings": { + "properties": { + "text.tokens": { + "type": "sparse_vector" + } + } + } +} +``` + +With optional `index_options` for pruning: ```console PUT my-index { @@ -49,7 +63,7 @@ index_options Parameters for `index_options` are: `prune` -: (Optional, boolean) [preview] Whether to perform pruning, omitting the non-significant tokens from the query to improve query performance. If `prune` is true but the `pruning_config` is not specified, pruning will occur but default values will be used. Default: false. +: (Optional, boolean) [preview] Whether to perform pruning, omitting the non-significant tokens from the query to improve query performance. If `prune` is true but the `pruning_config` is not specified, pruning will occur but default values will be used. Default: true. `pruning_config` : (Optional, object) [preview] Optional pruning configuration. 
If enabled, this will omit non-significant tokens from the query in order to improve query performance. This is only used if `prune` is set to `true`. If `prune` is set to `true` but `pruning_config` is not specified, default values will be used. diff --git a/server/src/main/java/org/elasticsearch/TransportVersions.java b/server/src/main/java/org/elasticsearch/TransportVersions.java index eaf3549bc83b1..e6007b85cafe2 100644 --- a/server/src/main/java/org/elasticsearch/TransportVersions.java +++ b/server/src/main/java/org/elasticsearch/TransportVersions.java @@ -228,6 +228,7 @@ static TransportVersion def(int id) { public static final TransportVersion DENSE_VECTOR_OFF_HEAP_STATS = def(9_062_00_0); public static final TransportVersion RANDOM_SAMPLER_QUERY_BUILDER = def(9_063_0_00); public static final TransportVersion SETTINGS_IN_DATA_STREAMS = def(9_064_0_00); + public static final TransportVersion SPARSE_VECTOR_FIELD_PRUNING_OPTIONS = def(9_065_0_00); /* * STOP! READ THIS FIRST! No, really, diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index 6aa9850cfec57..ed46b126ab324 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -22,6 +22,7 @@ import org.apache.lucene.util.BytesRef; import org.elasticsearch.common.logging.DeprecationCategory; import org.elasticsearch.common.lucene.Lucene; +import org.elasticsearch.common.xcontent.support.XContentMapValues; import org.elasticsearch.core.Nullable; import org.elasticsearch.index.IndexVersion; import org.elasticsearch.index.IndexVersions; @@ -62,6 +63,7 @@ public class SparseVectorFieldMapper extends FieldMapper { public static final String CONTENT_TYPE = "sparse_vector"; + public static final String SPARSE_VECTOR_INDEX_OPTIONS = 
"index_options"; static final String ERROR_MESSAGE_7X = "[sparse_vector] field type in old 7.x indices is allowed to " + "contain [sparse_vector] fields, but they cannot be indexed or searched."; @@ -86,7 +88,7 @@ public static class Builder extends FieldMapper.Builder { public Builder(String name) { super(name); this.indexOptions = new Parameter<>( - "index_options", + SPARSE_VECTOR_INDEX_OPTIONS, true, () -> null, (n, c, o) -> o == null ? getDefaultIndexOptions(c) : parseIndexOptions(c, o), @@ -139,7 +141,7 @@ private static SparseVectorFieldMapper.IndexOptions getDefaultIndexOptions(Mappi // don't set defaults if this index was created before // we added this functionality in, so it will // not change current index behaviour - return null; + return new IndexOptions(false, null); } // index options are not set - for new indices, we @@ -152,77 +154,12 @@ private static SparseVectorFieldMapper.IndexOptions getDefaultIndexOptions(Mappi } private static SparseVectorFieldMapper.IndexOptions parseIndexOptions(MappingParserContext context, Object propNode) { - @SuppressWarnings("unchecked") - Map indexOptionsMap = (Map) propNode; - - boolean hasOneOption = false; - Boolean prune = null; - PruningConfig pruningConfig = null; - - Object shouldPrune = indexOptionsMap.remove(IndexOptions.PRUNE_FIELD_NAME); - if (shouldPrune != null) { - if ((shouldPrune instanceof Boolean) == false) { - throw new MapperParsingException("[index_options] field [prune] should be true or false"); - } - hasOneOption = true; - prune = ((Boolean) shouldPrune); - } - - Object hasPruningConfiguration = indexOptionsMap.remove(IndexOptions.PRUNING_CONFIG_FIELD_NAME); - if (hasPruningConfiguration != null) { - if ((hasPruningConfiguration instanceof Map) == false) { - throw new MapperParsingException("[index_options] field [pruning_config] should be a map"); - } - - Integer tokensFreqRatioThreshold = null; - Double tokensWeightThreshold = null; - - @SuppressWarnings("unchecked") - Map pruningConfigMap 
= (Map) hasPruningConfiguration; - Object hasTokensFreqRatioThreshold = pruningConfigMap.remove(PruningConfig.TOKENS_FREQ_RATIO_THRESHOLD_FIELD_NAME); - Object hasTokensWeightThreshold = pruningConfigMap.remove(PruningConfig.TOKENS_WEIGHT_THRESHOLD_FIELD_NAME); - - if (pruningConfigMap.isEmpty() == false) { - throw new MapperParsingException("[index_options] field [pruning_config] has unknown fields"); - } - - if (hasTokensFreqRatioThreshold != null) { - if ((hasTokensFreqRatioThreshold instanceof Integer) == false) { - throw new MapperParsingException( - "[pruning_config] field [tokens_freq_ratio_threshold] field should be an integer between 1 and 100" - ); - } - tokensFreqRatioThreshold = (Integer) hasTokensFreqRatioThreshold; - if (tokensFreqRatioThreshold < PruningConfig.MIN_TOKENS_FREQ_RATIO_THRESHOLD - || tokensFreqRatioThreshold > PruningConfig.MAX_TOKENS_FREQ_RATIO_THRESHOLD) { - throw new MapperParsingException( - "[pruning_config] field [tokens_freq_ratio_threshold] field should be an integer between 1 and 100" - ); - } - } - - if (hasTokensWeightThreshold != null) { - if ((hasTokensWeightThreshold instanceof Double) == false) { - throw new MapperParsingException( - "[pruning_config] field [tokens_weight_threshold] field should be an number between 0.0 and 1.0" - ); - } - tokensWeightThreshold = (Double) hasTokensWeightThreshold; - if (tokensWeightThreshold < PruningConfig.MIN_TOKENS_WEIGHT_THRESHOLD - || tokensWeightThreshold > PruningConfig.MAX_TOKENS_WEIGHT_THRESHOLD) { - throw new MapperParsingException( - "[pruning_config] field [tokens_weight_threshold] field should be an number between 0.0 and 1.0" - ); - } - } + Map indexOptionsMap = XContentMapValues.nodeMapValue(propNode, SPARSE_VECTOR_INDEX_OPTIONS); - if (tokensFreqRatioThreshold != null || tokensWeightThreshold != null) { - pruningConfig = new PruningConfig(tokensFreqRatioThreshold, tokensWeightThreshold); - hasOneOption = true; - } - } + Boolean prune = 
IndexOptions.parseIndexOptionsPruneValue(indexOptionsMap); + PruningConfig pruningConfig = IndexOptions.parseIndexOptionsPruningConfig(prune, indexOptionsMap); - if (hasOneOption == false) { + if (prune == null && pruningConfig == null) { return getDefaultIndexOptions(context); } @@ -551,6 +488,10 @@ public final int hashCode() { @Override public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + if (prune == null && pruningConfig == null) { + return builder; + } + builder.startObject(); if (prune != null) { builder.field(PRUNE_FIELD_NAME, prune); @@ -561,36 +502,65 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws builder.endObject(); return builder; } + + public static Boolean parseIndexOptionsPruneValue(Map indexOptionsMap) { + Object shouldPrune = indexOptionsMap.remove(IndexOptions.PRUNE_FIELD_NAME); + if (shouldPrune == null) { + return null; + } + + if ((shouldPrune instanceof Boolean) == false) { + throw new MapperParsingException("[index_options] field [prune] should be true or false"); + } + + return ((Boolean) shouldPrune); + } + + public static PruningConfig parseIndexOptionsPruningConfig(Boolean prune, Map indexOptionsMap) { + Object pruningConfiguration = indexOptionsMap.remove(IndexOptions.PRUNING_CONFIG_FIELD_NAME); + if (pruningConfiguration == null) { + return null; + } + if ((pruningConfiguration instanceof Map) == false) { + throw new MapperParsingException("[index_options] field [pruning_config] should be a map"); + } + + if (prune != null && prune == false) { + throw new MapperParsingException("[index_options] field [pruning_config] should not be set if [prune] is false"); + } + + return PruningConfig.parsePruningConfig(pruningConfiguration); + } } public static class PruningConfig implements ToXContent { public static final String TOKENS_FREQ_RATIO_THRESHOLD_FIELD_NAME = "tokens_freq_ratio_threshold"; public static final String TOKENS_WEIGHT_THRESHOLD_FIELD_NAME = 
"tokens_weight_threshold"; - public static Integer DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD = 5; - public static Integer MIN_TOKENS_FREQ_RATIO_THRESHOLD = 1; - public static Integer MAX_TOKENS_FREQ_RATIO_THRESHOLD = 100; + public static float DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD = 5; + public static float MIN_TOKENS_FREQ_RATIO_THRESHOLD = 1; + public static float MAX_TOKENS_FREQ_RATIO_THRESHOLD = 100; - public static Double DEFAULT_TOKENS_WEIGHT_THRESHOLD = 0.4; - public static Double MIN_TOKENS_WEIGHT_THRESHOLD = 0.0; - public static Double MAX_TOKENS_WEIGHT_THRESHOLD = 1.0; + public static float DEFAULT_TOKENS_WEIGHT_THRESHOLD = 0.4f; + public static float MIN_TOKENS_WEIGHT_THRESHOLD = 0.0f; + public static float MAX_TOKENS_WEIGHT_THRESHOLD = 1.0f; - final Integer tokens_freq_ratio_threshold; - final Double tokens_weight_threshold; + final Float tokens_freq_ratio_threshold; + final Float tokens_weight_threshold; - PruningConfig(@Nullable Integer tokens_freq_ratio_threshold, @Nullable Double tokens_weight_threshold) { + PruningConfig(@Nullable Float tokens_freq_ratio_threshold, @Nullable Float tokens_weight_threshold) { this.tokens_freq_ratio_threshold = tokens_freq_ratio_threshold; this.tokens_weight_threshold = tokens_weight_threshold; } - public int getTokensFreqRatioThresholdOrDefault() { + public float getTokensFreqRatioThresholdOrDefault() { if (tokens_freq_ratio_threshold == null) { return DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD; } return tokens_freq_ratio_threshold; } - public double getTokensWeightThresholdOrDefault() { + public float getTokensWeightThresholdOrDefault() { if (tokens_weight_threshold == null) { return DEFAULT_TOKENS_WEIGHT_THRESHOLD; } @@ -626,5 +596,81 @@ public final boolean equals(Object other) { public final int hashCode() { return Objects.hash(tokens_freq_ratio_threshold, tokens_weight_threshold); } + + public static PruningConfig parsePruningConfig(Object pruningConfiguration) { + Map pruningConfigMap = 
XContentMapValues.nodeMapValue(pruningConfiguration, SPARSE_VECTOR_INDEX_OPTIONS); + + Object mappedTokensFreqRatioThreshold = pruningConfigMap.remove(TOKENS_FREQ_RATIO_THRESHOLD_FIELD_NAME); + Object mappedTokensWeightThreshold = pruningConfigMap.remove(TOKENS_WEIGHT_THRESHOLD_FIELD_NAME); + + if (pruningConfigMap.isEmpty() == false) { + throw new MapperParsingException("[index_options] field [pruning_config] has unknown fields"); + } + + Float tokensFreqRatioThreshold = parseTokensFreqRatioThreshold(mappedTokensFreqRatioThreshold); + Float tokensWeightThreshold = parseTokensWeightThreshold(mappedTokensWeightThreshold); + + if (tokensFreqRatioThreshold != null || tokensWeightThreshold != null) { + return new PruningConfig(tokensFreqRatioThreshold, tokensWeightThreshold); + } + + return null; + } + + private static Float parseFloatNumberFromObject(Object numberObject) { + if (numberObject instanceof Integer intValue) { + return (float) intValue; + } else if (numberObject instanceof Float floatValue) { + return floatValue; + } else if (numberObject instanceof Double doubleValue) { + return ((Double) numberObject).floatValue(); + } + return null; + } + + private static Float parseTokensWeightThreshold(Object mappedTokensWeightThreshold) { + if (mappedTokensWeightThreshold == null) { + return null; + } + + Float tokensWeightThreshold = parseFloatNumberFromObject(mappedTokensWeightThreshold); + + if (tokensWeightThreshold == null) { + throw new MapperParsingException( + "[pruning_config] field [tokens_weight_threshold] field should be a number between 0.0 and 1.0" + ); + } + + if (tokensWeightThreshold < PruningConfig.MIN_TOKENS_WEIGHT_THRESHOLD + || tokensWeightThreshold > PruningConfig.MAX_TOKENS_WEIGHT_THRESHOLD) { + throw new MapperParsingException( + "[pruning_config] field [tokens_weight_threshold] field should be a number between 0.0 and 1.0" + ); + } + return tokensWeightThreshold; + } + + private static Float parseTokensFreqRatioThreshold(Object 
mappedTokensFreqRatioThreshold) { + if (mappedTokensFreqRatioThreshold == null) { + return null; + } + + Float tokensFreqRatioThreshold = parseFloatNumberFromObject(mappedTokensFreqRatioThreshold); + + if (tokensFreqRatioThreshold == null) { + throw new MapperParsingException( + "[pruning_config] field [tokens_freq_ratio_threshold] field should be a number between 1 and 100" + ); + } + + if (tokensFreqRatioThreshold < PruningConfig.MIN_TOKENS_FREQ_RATIO_THRESHOLD + || tokensFreqRatioThreshold > PruningConfig.MAX_TOKENS_FREQ_RATIO_THRESHOLD) { + throw new MapperParsingException( + "[pruning_config] field [tokens_freq_ratio_threshold] field should be a number between 1 and 100" + ); + } + + return tokensFreqRatioThreshold; + } } } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index 2df763de4841c..693c5d626f15d 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -89,7 +89,7 @@ protected void mappingWithIndexOptionsPruningConfig(XContentBuilder b) throws IO b.field("type", "sparse_vector"); b.startObject("index_options"); b.startObject("pruning_config"); - b.field("tokens_freq_ratio_threshold", 5); + b.field("tokens_freq_ratio_threshold", 5.0); b.field("tokens_weight_threshold", 0.4); b.endObject(); b.endObject(); @@ -313,7 +313,7 @@ public void testTokensFreqRatioCorrect() { }))); assertThat( eTestInteger.getMessage(), - containsString("[pruning_config] field [tokens_freq_ratio_threshold] field should be an integer between 1 and 100") + containsString("[pruning_config] field [tokens_freq_ratio_threshold] field should be a number between 1 and 100") ); Exception eTestRangeLower = expectThrows(MapperParsingException.class, () -> 
createMapperService(fieldMapping(b -> { @@ -326,7 +326,7 @@ public void testTokensFreqRatioCorrect() { }))); assertThat( eTestRangeLower.getMessage(), - containsString("[pruning_config] field [tokens_freq_ratio_threshold] field should be an integer between 1 and 100") + containsString("[pruning_config] field [tokens_freq_ratio_threshold] field should be a number between 1 and 100") ); Exception eTestRangeHigher = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { @@ -339,7 +339,7 @@ public void testTokensFreqRatioCorrect() { }))); assertThat( eTestRangeHigher.getMessage(), - containsString("[pruning_config] field [tokens_freq_ratio_threshold] field should be an integer between 1 and 100") + containsString("[pruning_config] field [tokens_freq_ratio_threshold] field should be a number between 1 and 100") ); } @@ -354,7 +354,7 @@ public void testTokensWeightThresholdCorrect() { }))); assertThat( eTestDouble.getMessage(), - containsString("[pruning_config] field [tokens_weight_threshold] field should be an number between 0.0 and 1.0") + containsString("[pruning_config] field [tokens_weight_threshold] field should be a number between 0.0 and 1.0") ); Exception eTestRangeLower = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { @@ -367,7 +367,7 @@ public void testTokensWeightThresholdCorrect() { }))); assertThat( eTestRangeLower.getMessage(), - containsString("[pruning_config] field [tokens_weight_threshold] field should be an number between 0.0 and 1.0") + containsString("[pruning_config] field [tokens_weight_threshold] field should be a number between 0.0 and 1.0") ); Exception eTestRangeHigher = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { @@ -380,7 +380,7 @@ public void testTokensWeightThresholdCorrect() { }))); assertThat( eTestRangeHigher.getMessage(), - containsString("[pruning_config] field [tokens_weight_threshold] field should be an number 
between 0.0 and 1.0") + containsString("[pruning_config] field [tokens_weight_threshold] field should be a number between 0.0 and 1.0") ); } diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java index 5e79149eea6b1..389684a0d822b 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java @@ -126,7 +126,11 @@ public SparseVectorQueryBuilder( public SparseVectorQueryBuilder(StreamInput in) throws IOException { super(in); this.fieldName = in.readString(); - this.shouldPruneTokens = in.readOptionalBoolean(); + if (in.getTransportVersion().onOrAfter(TransportVersions.SPARSE_VECTOR_FIELD_PRUNING_OPTIONS)) { + this.shouldPruneTokens = in.readOptionalBoolean(); + } else { + this.shouldPruneTokens = in.readBoolean(); + } this.queryVectors = in.readOptionalCollectionAsList(WeightedToken::new); this.inferenceId = in.readOptionalString(); this.query = in.readOptionalString(); @@ -175,7 +179,11 @@ protected void doWriteTo(StreamOutput out) throws IOException { } out.writeString(fieldName); - out.writeOptionalBoolean(shouldPruneTokens); + if (out.getTransportVersion().onOrAfter(TransportVersions.SPARSE_VECTOR_FIELD_PRUNING_OPTIONS)) { + out.writeOptionalBoolean(shouldPruneTokens); + } else { + out.writeBoolean(shouldPruneTokens != null && shouldPruneTokens); + } out.writeOptionalCollection(queryVectors); out.writeOptionalString(inferenceId); out.writeOptionalString(query); @@ -229,24 +237,11 @@ protected Query doToQuery(SearchExecutionContext context) throws IOException { // if the query options for pruning are not set, // we need to check the index options for this field // and use those if set. 
- boolean doPruneTokens = false; - TokenPruningConfig setTokenPruningConfig = tokenPruningConfig; + SparseVectorFieldMapper sparseVectorFieldMapper = getSparseVectorFieldMapperForSearchExecution(fieldName, context); + TokenPruningSet pruningOptions = setPruningConfigFromIndexIfNeeded(shouldPruneTokens, tokenPruningConfig, sparseVectorFieldMapper); - if (shouldPruneTokens == null || setTokenPruningConfig == null) { - SparseVectorFieldMapper sparseVectorFieldMapper = getSparseVectorFieldMapperForSearchExecution(fieldName, context); - IndexFieldPruningSettings indexPruningSettings = getIndexFieldPruningSettings(sparseVectorFieldMapper); - if (shouldPruneTokens == null && indexPruningSettings.prune != null && indexPruningSettings.prune) { - doPruneTokens = true; - } - if (setTokenPruningConfig == null && indexPruningSettings.pruningConfig != null) { - setTokenPruningConfig = indexPruningSettings.pruningConfig; - } - } else { - doPruneTokens = shouldPruneTokens; - } - - return (doPruneTokens) - ? WeightedTokensUtils.queryBuilderWithPrunedTokens(fieldName, setTokenPruningConfig, queryVectors, ft, context) + return pruningOptions.pruneTokens + ? WeightedTokensUtils.queryBuilderWithPrunedTokens(fieldName, pruningOptions.pruningConfig, queryVectors, ft, context) : WeightedTokensUtils.queryBuilderWithAllTokens(fieldName, queryVectors, ft, context); } @@ -265,29 +260,16 @@ protected QueryBuilder doRewrite(QueryRewriteContext queryRewriteContext) { // if the query options for pruning are not set, // we need to check the index options for this field // and use those if set. 
- boolean doPruneTokens = false; - TokenPruningConfig setTokenPruningConfig = tokenPruningConfig; - - if (shouldPruneTokens == null || setTokenPruningConfig == null) { - SparseVectorFieldMapper sparseVectorFieldMapper = getSparseVectorFieldMapperForQueryRewrite(fieldName, queryRewriteContext); - IndexFieldPruningSettings indexPruningSettings = getIndexFieldPruningSettings(sparseVectorFieldMapper); - if (shouldPruneTokens == null && indexPruningSettings.prune != null && indexPruningSettings.prune) { - doPruneTokens = true; - } - if (setTokenPruningConfig == null && indexPruningSettings.pruningConfig != null) { - setTokenPruningConfig = indexPruningSettings.pruningConfig; - } - } else { - doPruneTokens = shouldPruneTokens; - } + SparseVectorFieldMapper sparseVectorFieldMapper = getSparseVectorFieldMapperForQueryRewrite(fieldName, queryRewriteContext); + TokenPruningSet pruningOptions = setPruningConfigFromIndexIfNeeded(shouldPruneTokens, tokenPruningConfig, sparseVectorFieldMapper); return new SparseVectorQueryBuilder( fieldName, textExpansionResults.getWeightedTokens(), null, null, - doPruneTokens, - setTokenPruningConfig + pruningOptions.pruneTokens, + pruningOptions.pruningConfig ); } else if (inferenceId == null) { // Edge case, where inference_id was not specified in the request, @@ -441,7 +423,7 @@ private IndexFieldPruningSettings getIndexFieldPruningSettings(SparseVectorField TokenPruningConfig indexTokenPruningConfig = indexPruningConfig != null ? 
new TokenPruningConfig( indexPruningConfig.getTokensFreqRatioThresholdOrDefault(), - (float) indexPruningConfig.getTokensWeightThresholdOrDefault(), + indexPruningConfig.getTokensWeightThresholdOrDefault(), false ) : null; @@ -466,4 +448,24 @@ private SparseVectorFieldMapper getSparseVectorFieldMapperForSearchExecution(Str private SparseVectorFieldMapper getSparseVectorFieldMapperForQueryRewrite(String fieldName, QueryRewriteContext context) { return getSparseVectorFieldMapper(fieldName, context.getMappingLookup().getMapping()); } + + private record TokenPruningSet(boolean pruneTokens, TokenPruningConfig pruningConfig) {} + + private TokenPruningSet setPruningConfigFromIndexIfNeeded(Boolean queryPruneTokens, TokenPruningConfig queryPruningConfig, SparseVectorFieldMapper fieldMapper) { + boolean doPruneTokens = false; + TokenPruningConfig setTokenPruningConfig = queryPruningConfig; + if (queryPruneTokens == null || queryPruningConfig == null) { + IndexFieldPruningSettings indexPruningSettings = getIndexFieldPruningSettings(fieldMapper); + if (shouldPruneTokens == null && indexPruningSettings.prune != null && indexPruningSettings.prune) { + doPruneTokens = true; + } + if (setTokenPruningConfig == null && indexPruningSettings.pruningConfig != null) { + setTokenPruningConfig = indexPruningSettings.pruningConfig; + } + } else { + doPruneTokens = queryPruneTokens; + } + + return new TokenPruningSet(doPruneTokens, setTokenPruningConfig); + } } From 21323e407d4188db3a2a058d1f68b1af6a17ffac Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Mon, 28 Apr 2025 23:55:47 +0000 Subject: [PATCH 11/86] [CI] Auto commit changes from spotless --- .../core/ml/search/SparseVectorQueryBuilder.java | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java 
index 389684a0d822b..64b85b0718c3d 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java @@ -261,7 +261,11 @@ protected QueryBuilder doRewrite(QueryRewriteContext queryRewriteContext) { // we need to check the index options for this field // and use those if set. SparseVectorFieldMapper sparseVectorFieldMapper = getSparseVectorFieldMapperForQueryRewrite(fieldName, queryRewriteContext); - TokenPruningSet pruningOptions = setPruningConfigFromIndexIfNeeded(shouldPruneTokens, tokenPruningConfig, sparseVectorFieldMapper); + TokenPruningSet pruningOptions = setPruningConfigFromIndexIfNeeded( + shouldPruneTokens, + tokenPruningConfig, + sparseVectorFieldMapper + ); return new SparseVectorQueryBuilder( fieldName, @@ -451,7 +455,11 @@ private SparseVectorFieldMapper getSparseVectorFieldMapperForQueryRewrite(String private record TokenPruningSet(boolean pruneTokens, TokenPruningConfig pruningConfig) {} - private TokenPruningSet setPruningConfigFromIndexIfNeeded(Boolean queryPruneTokens, TokenPruningConfig queryPruningConfig, SparseVectorFieldMapper fieldMapper) { + private TokenPruningSet setPruningConfigFromIndexIfNeeded( + Boolean queryPruneTokens, + TokenPruningConfig queryPruningConfig, + SparseVectorFieldMapper fieldMapper + ) { boolean doPruneTokens = false; TokenPruningConfig setTokenPruningConfig = queryPruningConfig; if (queryPruneTokens == null || queryPruningConfig == null) { From 9d5df849c14358e9b115cfc206d644314d017396 Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Mon, 28 Apr 2025 21:15:35 -0400 Subject: [PATCH 12/86] delete changelog - let it autocreate --- docs/changelog/126739.yaml | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 docs/changelog/126739.yaml diff --git a/docs/changelog/126739.yaml b/docs/changelog/126739.yaml deleted file mode 100644 index 2a2d4439c3bde..0000000000000 --- a/docs/changelog/126739.yaml +++ /dev/null @@ -1,5 +0,0 @@ -pr: 126739 -summary: Add pruning index options to sparse vector field -area: Inference -type: enhancement -issues: [] From 846fcff6d10fd0693fdab0e0f30f2d21263ff5e5 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Tue, 29 Apr 2025 09:02:17 -0400 Subject: [PATCH 13/86] cleanups --- .../index/mapper/vectors/SparseVectorFieldMapper.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index ed46b126ab324..3ac4bdbeebd7b 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -91,7 +91,7 @@ public Builder(String name) { SPARSE_VECTOR_INDEX_OPTIONS, true, () -> null, - (n, c, o) -> o == null ? getDefaultIndexOptions(c) : parseIndexOptions(c, o), + (n, c, o) -> parseIndexOptions(c, o), m -> toType(m).fieldType().indexOptions, (b, n, v) -> { if (v != null) { From e593f171a1fe039687cc5f41d46604c911230d0d Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Tue, 29 Apr 2025 12:45:56 -0400 Subject: [PATCH 14/86] don't explicitly set the config if not present --- .../index/mapper/vectors/SparseVectorFieldMapper.java | 11 ++++------- .../core/ml/search/SparseVectorQueryBuilder.java | 10 +++++++++- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index 3ac4bdbeebd7b..0019c38120935 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -144,13 +144,10 @@ private static SparseVectorFieldMapper.IndexOptions getDefaultIndexOptions(Mappi return new IndexOptions(false, null); } - // index options are not set - for new indices, we - // need to set pruning to true by default - // with a default pruning configuration - return new IndexOptions( - true, - new PruningConfig(PruningConfig.DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD, PruningConfig.DEFAULT_TOKENS_WEIGHT_THRESHOLD) - ); + // index options are not set - for new indices, + // if this is null, in the query will use the + // proper defaults + return null; } private static SparseVectorFieldMapper.IndexOptions parseIndexOptions(MappingParserContext context, Object propNode) { diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java index 64b85b0718c3d..5bd4d97192900 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java @@ -418,7 +418,15 @@ private IndexFieldPruningSettings 
getIndexFieldPruningSettings(SparseVectorField SparseVectorFieldMapper.IndexOptions indexOptions = sparseVectorFieldMapper.getIndexOptions(); if (indexOptions == null) { - return new IndexFieldPruningSettings(null, null); + // return the default if not set in the index options + return new IndexFieldPruningSettings( + true, + new TokenPruningConfig( + TokenPruningConfig.DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD, + TokenPruningConfig.DEFAULT_TOKENS_WEIGHT_THRESHOLD, + false + ) + ); } Boolean indexOptionsPrune = indexOptions.getPrune(); From 7a247032465d937bbe6366aecc88090dc76928df Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Tue, 29 Apr 2025 16:19:12 -0400 Subject: [PATCH 15/86] use default prune config if prune=true and not set --- .../index/mapper/vectors/SparseVectorFieldMapper.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index 0019c38120935..58a02e8a35d99 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -516,7 +516,7 @@ public static Boolean parseIndexOptionsPruneValue(Map indexOptio public static PruningConfig parseIndexOptionsPruningConfig(Boolean prune, Map indexOptionsMap) { Object pruningConfiguration = indexOptionsMap.remove(IndexOptions.PRUNING_CONFIG_FIELD_NAME); if (pruningConfiguration == null) { - return null; + return new PruningConfig(PruningConfig.DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD, PruningConfig.DEFAULT_TOKENS_WEIGHT_THRESHOLD); } if ((pruningConfiguration instanceof Map) == false) { throw new MapperParsingException("[index_options] field [pruning_config] should be a map"); From 7ceb12aeb00792bf435942b13d302098cdba9e11 Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Tue, 29 Apr 2025 17:06:49 -0400 Subject: [PATCH 16/86] fix test --- .../index/mapper/vectors/SparseVectorFieldMapperTests.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index 693c5d626f15d..ce2d5cc6f5ff7 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -150,7 +150,8 @@ public void testDefaults() throws Exception { public void testWithIndexOptionsPrune() throws Exception { DocumentMapper mapper = createDocumentMapper(fieldMapping(this::mappingWithIndexOptionsPrune)); - assertEquals(Strings.toString(fieldMapping(this::mappingWithIndexOptionsPrune)), mapper.mappingSource().toString()); + String expectedMapping = "{\"_doc\":{\"properties\":{\"field\":{\"type\":\"sparse_vector\",\"index_options\":{\"prune\":true,\"pruning_config\":{\"tokens_freq_ratio_threshold\":5.0,\"tokens_weight_threshold\":0.4}}}}}}" + assertEquals(expectedMapping, mapper.mappingSource().toString()); ParsedDocument doc1 = mapper.parse(source(this::writeField)); From 5dd4728d30d44cc0c9940fb6c65fdf50f9873a31 Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Tue, 29 Apr 2025 18:25:54 -0400 Subject: [PATCH 17/86] really fix test :/ --- .../index/mapper/vectors/SparseVectorFieldMapperTests.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index ce2d5cc6f5ff7..b2272d489ed63 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -150,7 +150,7 @@ public void testDefaults() throws Exception { public void testWithIndexOptionsPrune() throws Exception { DocumentMapper mapper = createDocumentMapper(fieldMapping(this::mappingWithIndexOptionsPrune)); - String expectedMapping = "{\"_doc\":{\"properties\":{\"field\":{\"type\":\"sparse_vector\",\"index_options\":{\"prune\":true,\"pruning_config\":{\"tokens_freq_ratio_threshold\":5.0,\"tokens_weight_threshold\":0.4}}}}}}" + String expectedMapping = "{\"_doc\":{\"properties\":{\"field\":{\"type\":\"sparse_vector\",\"index_options\":{\"prune\":true,\"pruning_config\":{\"tokens_freq_ratio_threshold\":5.0,\"tokens_weight_threshold\":0.4}}}}}}"; assertEquals(expectedMapping, mapper.mappingSource().toString()); ParsedDocument doc1 = mapper.parse(source(this::writeField)); From f3b4a98ba0bc46cbadb926ca687a30b3d8a40117 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Tue, 29 Apr 2025 22:34:50 +0000 Subject: [PATCH 18/86] [CI] Auto commit changes from spotless --- .../index/mapper/vectors/SparseVectorFieldMapperTests.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index b2272d489ed63..4c9fd359d0d25 
100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -150,7 +150,8 @@ public void testDefaults() throws Exception { public void testWithIndexOptionsPrune() throws Exception { DocumentMapper mapper = createDocumentMapper(fieldMapping(this::mappingWithIndexOptionsPrune)); - String expectedMapping = "{\"_doc\":{\"properties\":{\"field\":{\"type\":\"sparse_vector\",\"index_options\":{\"prune\":true,\"pruning_config\":{\"tokens_freq_ratio_threshold\":5.0,\"tokens_weight_threshold\":0.4}}}}}}"; + String expectedMapping = + "{\"_doc\":{\"properties\":{\"field\":{\"type\":\"sparse_vector\",\"index_options\":{\"prune\":true,\"pruning_config\":{\"tokens_freq_ratio_threshold\":5.0,\"tokens_weight_threshold\":0.4}}}}}}"; assertEquals(expectedMapping, mapper.mappingSource().toString()); ParsedDocument doc1 = mapper.parse(source(this::writeField)); From 7ddb77a5c477d49f9230f487717eb414884eabee Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Wed, 30 Apr 2025 08:41:38 -0400 Subject: [PATCH 19/86] rename index to test3 / previous not cleaned up --- .../resources/rest-api-spec/test/cluster.stats/10_basic.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.stats/10_basic.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.stats/10_basic.yml index 7a952190205c5..e25ff56a0dc4d 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.stats/10_basic.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.stats/10_basic.yml @@ -375,7 +375,7 @@ - do: indices.create: - index: test2 + index: test3 body: settings: number_of_replicas: 0 @@ -424,7 +424,7 @@ - do: index: - index: test2 + index: test3 id: "1" body: vector: @@ -438,7 +438,7 @@ - do: index: - index: test2 + index: test3 id: "2" body: vector: From 3625a37cc5ea47ca15574399f4a9c57c91d90bae Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Wed, 30 Apr 2025 14:53:44 -0400 Subject: [PATCH 20/86] fix lint --- .../index/mapper/vectors/SparseVectorFieldMapperTests.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index 4c9fd359d0d25..6b33a118235aa 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -151,7 +151,9 @@ public void testDefaults() throws Exception { public void testWithIndexOptionsPrune() throws Exception { DocumentMapper mapper = createDocumentMapper(fieldMapping(this::mappingWithIndexOptionsPrune)); String expectedMapping = - "{\"_doc\":{\"properties\":{\"field\":{\"type\":\"sparse_vector\",\"index_options\":{\"prune\":true,\"pruning_config\":{\"tokens_freq_ratio_threshold\":5.0,\"tokens_weight_threshold\":0.4}}}}}}"; + "{\"_doc\":{\"properties\":{\"field\":{\"type\":\"sparse_vector\",\"index_options\":" + + "{\"prune\":true,\"pruning_config\":{\"tokens_freq_ratio_threshold\":5.0,\"tokens_weight_threshold\":0.4}}" + +"}}}}"; assertEquals(expectedMapping, mapper.mappingSource().toString()); ParsedDocument doc1 = mapper.parse(source(this::writeField)); From 92db1c64f48709222f87a1964327ee3c90a9d102 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Wed, 30 Apr 2025 19:02:00 +0000 Subject: [PATCH 21/86] [CI] Auto commit changes from spotless --- .../index/mapper/vectors/SparseVectorFieldMapperTests.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index 6b33a118235aa..50326205b7226 100644 --- 
a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -150,10 +150,9 @@ public void testDefaults() throws Exception { public void testWithIndexOptionsPrune() throws Exception { DocumentMapper mapper = createDocumentMapper(fieldMapping(this::mappingWithIndexOptionsPrune)); - String expectedMapping = - "{\"_doc\":{\"properties\":{\"field\":{\"type\":\"sparse_vector\",\"index_options\":" + String expectedMapping = "{\"_doc\":{\"properties\":{\"field\":{\"type\":\"sparse_vector\",\"index_options\":" + "{\"prune\":true,\"pruning_config\":{\"tokens_freq_ratio_threshold\":5.0,\"tokens_weight_threshold\":0.4}}" - +"}}}}"; + + "}}}}"; assertEquals(expectedMapping, mapper.mappingSource().toString()); ParsedDocument doc1 = mapper.parse(source(this::writeField)); From 6a7f46c9248a7d0d4260abf40602d7987af07f68 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Thu, 1 May 2025 08:54:08 -0400 Subject: [PATCH 22/86] fix the current yaml tests/ensure cleanup --- .../test/cluster.stats/10_basic.yml | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.stats/10_basic.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.stats/10_basic.yml index e25ff56a0dc4d..a1a8fa206a38b 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.stats/10_basic.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.stats/10_basic.yml @@ -353,11 +353,22 @@ - match: { indices.dense_vector.value_count: 8 } - is_false: indices.dense_vector.off_heap + # ensure indices are cleaned up + - do: + indices.delete: + index: test1 + - do: + indices.delete: + index: test2 + - do: + indices.refresh: { } + --- "Sparse vector stats": - requires: cluster_features: [ "gte_v8.15.0" ] reason: "sparse vector stats 
added in 8.15" + - do: indices.create: index: test1 @@ -375,7 +386,7 @@ - do: indices.create: - index: test3 + index: test2 body: settings: number_of_replicas: 0 @@ -424,7 +435,7 @@ - do: index: - index: test3 + index: test2 id: "1" body: vector: @@ -438,7 +449,7 @@ - do: index: - index: test3 + index: test2 id: "2" body: vector: From e95033cac65e8151fca5e51de2f12b6023ed196a Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Thu, 1 May 2025 09:07:46 -0400 Subject: [PATCH 23/86] light cleanups --- .../index/mapper/vectors/SparseVectorFieldMapper.java | 2 +- .../index/mapper/vectors/SparseVectorFieldMapperTests.java | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index 58a02e8a35d99..0019c38120935 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -516,7 +516,7 @@ public static Boolean parseIndexOptionsPruneValue(Map indexOptio public static PruningConfig parseIndexOptionsPruningConfig(Boolean prune, Map indexOptionsMap) { Object pruningConfiguration = indexOptionsMap.remove(IndexOptions.PRUNING_CONFIG_FIELD_NAME); if (pruningConfiguration == null) { - return new PruningConfig(PruningConfig.DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD, PruningConfig.DEFAULT_TOKENS_WEIGHT_THRESHOLD); + return null; } if ((pruningConfiguration instanceof Map) == false) { throw new MapperParsingException("[index_options] field [pruning_config] should be a map"); diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index 50326205b7226..2cec3a9182f82 100644 --- 
a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -151,8 +151,7 @@ public void testDefaults() throws Exception { public void testWithIndexOptionsPrune() throws Exception { DocumentMapper mapper = createDocumentMapper(fieldMapping(this::mappingWithIndexOptionsPrune)); String expectedMapping = "{\"_doc\":{\"properties\":{\"field\":{\"type\":\"sparse_vector\",\"index_options\":" - + "{\"prune\":true,\"pruning_config\":{\"tokens_freq_ratio_threshold\":5.0,\"tokens_weight_threshold\":0.4}}" - + "}}}}"; + + "{\"prune\":true}}}}}"; assertEquals(expectedMapping, mapper.mappingSource().toString()); ParsedDocument doc1 = mapper.parse(source(this::writeField)); From bdfc9b80581b66117effc5e57c61f5d9b6a727b7 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Thu, 1 May 2025 09:35:45 -0400 Subject: [PATCH 24/86] clean docs --- .../elasticsearch/mapping-reference/sparse-vector.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/reference/elasticsearch/mapping-reference/sparse-vector.md b/docs/reference/elasticsearch/mapping-reference/sparse-vector.md index 465bf55efeb6f..0f3d691e490ba 100644 --- a/docs/reference/elasticsearch/mapping-reference/sparse-vector.md +++ b/docs/reference/elasticsearch/mapping-reference/sparse-vector.md @@ -36,7 +36,7 @@ PUT my-index "prune": true, "pruning_config": { "tokens_freq_ratio_threshold": 5, - "tokens_weight_threshold: 0.4 + "tokens_weight_threshold": 0.4 } } } @@ -66,7 +66,7 @@ Parameters for `index_options` are: : (Optional, boolean) [preview] Whether to perform pruning, omitting the non-significant tokens from the query to improve query performance. If `prune` is true but the `pruning_config` is not specified, pruning will occur but default values will be used. Default: true. `pruning_config` -: (Optional, object) [preview] Optional pruning configuration. 
If enabled, this will omit non-significant tokens from the query in order to improve query performance. This is only used if `prune` is set to `true`. If `prune` is set to `true` but `pruning_config` is not specified, default values will be used. +: (Optional, object) [preview] Optional pruning configuration. If enabled, this will omit non-significant tokens from the query in order to improve query performance. This is only used if `prune` is set to `true`. If `prune` is set to `true` but `pruning_config` is not specified, default values will be used. If `prune` is set to false, an exception will occur. Parameters for `pruning_config` include: From 20bcf20eacd36a009d3f33e424924fbb2d1e46b8 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Thu, 1 May 2025 10:59:26 -0400 Subject: [PATCH 25/86] refactor/move TokenPruningConfig into server --- .../vectors/SparseVectorFieldMapper.java | 22 ++- .../mapper/vectors}/TokenPruningConfig.java | 131 +++++++++++++++++- .../vectors/SparseVectorFieldMapperTests.java | 4 +- .../ml/search/SparseVectorQueryBuilder.java | 7 +- .../ml/search/TextExpansionQueryBuilder.java | 1 + .../ml/search/WeightedTokensQueryBuilder.java | 1 + .../core/ml/search/WeightedTokensUtils.java | 1 + .../search/SparseVectorQueryBuilderTests.java | 1 + .../TextExpansionQueryBuilderTests.java | 1 + .../ml/search/TokenPruningConfigTests.java | 1 + .../WeightedTokensQueryBuilderTests.java | 1 + 11 files changed, 153 insertions(+), 18 deletions(-) rename {x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search => server/src/main/java/org/elasticsearch/index/mapper/vectors}/TokenPruningConfig.java (59%) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index 0019c38120935..10191942ced4f 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ 
b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -119,7 +119,11 @@ public SparseVectorFieldMapper build(MapperBuilderContext context) { if (buildIndexOptions == null) { buildIndexOptions = new IndexOptions( true, - new PruningConfig(PruningConfig.DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD, PruningConfig.DEFAULT_TOKENS_WEIGHT_THRESHOLD) + new TokenPruningConfig( + TokenPruningConfig.DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD, + TokenPruningConfig.DEFAULT_TOKENS_WEIGHT_THRESHOLD, + false + ) ); } @@ -154,7 +158,7 @@ private static SparseVectorFieldMapper.IndexOptions parseIndexOptions(MappingPar Map indexOptionsMap = XContentMapValues.nodeMapValue(propNode, SPARSE_VECTOR_INDEX_OPTIONS); Boolean prune = IndexOptions.parseIndexOptionsPruneValue(indexOptionsMap); - PruningConfig pruningConfig = IndexOptions.parseIndexOptionsPruningConfig(prune, indexOptionsMap); + TokenPruningConfig pruningConfig = IndexOptions.parseIndexOptionsPruningConfig(prune, indexOptionsMap); if (prune == null && pruningConfig == null) { return getDefaultIndexOptions(context); @@ -452,9 +456,9 @@ public static class IndexOptions implements ToXContent { public static final String PRUNING_CONFIG_FIELD_NAME = "pruning_config"; final Boolean prune; - final PruningConfig pruningConfig; + final TokenPruningConfig pruningConfig; - IndexOptions(@Nullable Boolean prune, @Nullable PruningConfig pruningConfig) { + IndexOptions(@Nullable Boolean prune, @Nullable TokenPruningConfig pruningConfig) { this.prune = prune; this.pruningConfig = pruningConfig; } @@ -463,7 +467,7 @@ public Boolean getPrune() { return prune; } - public PruningConfig getPruningConfig() { + public TokenPruningConfig getPruningConfig() { return pruningConfig; } @@ -513,7 +517,7 @@ public static Boolean parseIndexOptionsPruneValue(Map indexOptio return ((Boolean) shouldPrune); } - public static PruningConfig parseIndexOptionsPruningConfig(Boolean prune, Map indexOptionsMap) { + public static 
TokenPruningConfig parseIndexOptionsPruningConfig(Boolean prune, Map indexOptionsMap) { Object pruningConfiguration = indexOptionsMap.remove(IndexOptions.PRUNING_CONFIG_FIELD_NAME); if (pruningConfiguration == null) { return null; @@ -526,10 +530,13 @@ public static PruningConfig parseIndexOptionsPruningConfig(Boolean prune, Map pruningConfigurationMap = XContentMapValues.nodeMapValue(pruningConfiguration, PRUNING_CONFIG_FIELD_NAME); + + return TokenPruningConfig.parseFromMap(pruningConfigurationMap); } } + /* public static class PruningConfig implements ToXContent { public static final String TOKENS_FREQ_RATIO_THRESHOLD_FIELD_NAME = "tokens_freq_ratio_threshold"; public static final String TOKENS_WEIGHT_THRESHOLD_FIELD_NAME = "tokens_weight_threshold"; @@ -670,4 +677,5 @@ private static Float parseTokensFreqRatioThreshold(Object mappedTokensFreqRatioT return tokensFreqRatioThreshold; } } + */ } diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/TokenPruningConfig.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java similarity index 59% rename from x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/TokenPruningConfig.java rename to server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java index 13358839830ed..43bf2d81ec433 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/TokenPruningConfig.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java @@ -1,3 +1,12 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + /* * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one * or more contributor license agreements. Licensed under the Elastic License @@ -5,12 +14,13 @@ * 2.0. */ -package org.elasticsearch.xpack.core.ml.search; +package org.elasticsearch.index.mapper.vectors; import org.elasticsearch.common.ParsingException; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.index.mapper.MapperParsingException; import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.xcontent.ParseField; import org.elasticsearch.xcontent.ToXContentObject; @@ -19,21 +29,26 @@ import java.io.IOException; import java.util.Locale; +import java.util.Map; import java.util.Objects; import java.util.Set; -import static org.elasticsearch.xpack.core.ml.search.WeightedTokensQueryBuilder.PRUNING_CONFIG; - public class TokenPruningConfig implements Writeable, ToXContentObject { + public static final String PRUNING_CONFIG_FIELD = "pruning_config"; + public static final ParseField TOKENS_FREQ_RATIO_THRESHOLD = new ParseField("tokens_freq_ratio_threshold"); public static final ParseField TOKENS_WEIGHT_THRESHOLD = new ParseField("tokens_weight_threshold"); public static final ParseField ONLY_SCORE_PRUNED_TOKENS_FIELD = new ParseField("only_score_pruned_tokens"); // Tokens whose frequency is more than 5 times the average frequency of all tokens in the specified field are considered outliers. 
public static final float DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD = 5; + public static final float MIN_TOKENS_FREQ_RATIO_THRESHOLD = 1; public static final float MAX_TOKENS_FREQ_RATIO_THRESHOLD = 100; + // A token's weight should be > 40% of the best weight in the query to be considered significant. public static final float DEFAULT_TOKENS_WEIGHT_THRESHOLD = 0.4f; + public static final float MIN_TOKENS_WEIGHT_THRESHOLD = 0.0f; + public static final float MAX_TOKENS_WEIGHT_THRESHOLD = 1.0f; private final float tokensFreqRatioThreshold; private final float tokensWeightThreshold; @@ -150,7 +165,7 @@ public static TokenPruningConfig fromXContent(XContentParser parser) throws IOEx ).contains(currentFieldName) == false) { throw new ParsingException( parser.getTokenLocation(), - "[" + PRUNING_CONFIG.getPreferredName() + "] unknown token [" + currentFieldName + "]" + "[" + PRUNING_CONFIG_FIELD + "] unknown token [" + currentFieldName + "]" ); } } else if (token.isValue()) { @@ -163,16 +178,120 @@ public static TokenPruningConfig fromXContent(XContentParser parser) throws IOEx } else { throw new ParsingException( parser.getTokenLocation(), - "[" + PRUNING_CONFIG.getPreferredName() + "] does not support [" + currentFieldName + "]" + "[" + PRUNING_CONFIG_FIELD + "] does not support [" + currentFieldName + "]" ); } } else { throw new ParsingException( parser.getTokenLocation(), - "[" + PRUNING_CONFIG.getPreferredName() + "] unknown token [" + token + "] after [" + currentFieldName + "]" + "[" + PRUNING_CONFIG_FIELD + "] unknown token [" + token + "] after [" + currentFieldName + "]" ); } } return new TokenPruningConfig(ratioThreshold, weightThreshold, onlyScorePrunedTokens); } + + public static TokenPruningConfig parseFromMap(Map pruningConfigMap) { + Object mappedTokensFreqRatioThreshold = pruningConfigMap.remove(TOKENS_FREQ_RATIO_THRESHOLD.getPreferredName()); + Object mappedTokensWeightThreshold = pruningConfigMap.remove(TOKENS_WEIGHT_THRESHOLD.getPreferredName()); + Object 
mappedOnlyScorePrunedTokens = pruningConfigMap.remove(ONLY_SCORE_PRUNED_TOKENS_FIELD.getPreferredName()); + + if (pruningConfigMap.isEmpty() == false) { + throw new MapperParsingException("[" + PRUNING_CONFIG_FIELD + "] has unknown fields"); + } + + Float tokensFreqRatioThreshold = parseTokensFreqRatioThreshold(mappedTokensFreqRatioThreshold); + Float tokensWeightThreshold = parseTokensWeightThreshold(mappedTokensWeightThreshold); + boolean onlyScorePrunedTokens = mappedOnlyScorePrunedTokens != null ? parseScorePrunedTokens(mappedOnlyScorePrunedTokens) : false; + + if (tokensFreqRatioThreshold != null || tokensWeightThreshold != null) { + return new TokenPruningConfig(tokensFreqRatioThreshold, tokensWeightThreshold, onlyScorePrunedTokens); + } + + return null; + } + + private static Float parseFloatNumberFromObject(Object numberObject) { + if (numberObject instanceof Integer intValue) { + return (float) intValue; + } else if (numberObject instanceof Float floatValue) { + return floatValue; + } else if (numberObject instanceof Double doubleValue) { + return ((Double) numberObject).floatValue(); + } + return null; + } + + private static Float parseTokensWeightThreshold(Object mappedTokensWeightThreshold) { + if (mappedTokensWeightThreshold == null) { + return DEFAULT_TOKENS_WEIGHT_THRESHOLD; + } + + Float tokensWeightThreshold = parseFloatNumberFromObject(mappedTokensWeightThreshold); + + if (tokensWeightThreshold == null) { + throw new MapperParsingException( + "[" + + PRUNING_CONFIG_FIELD + + "] field [" + + TOKENS_WEIGHT_THRESHOLD.getPreferredName() + + "] field should be a number between 0.0 and 1.0" + ); + } + + if (tokensWeightThreshold < MIN_TOKENS_WEIGHT_THRESHOLD || tokensWeightThreshold > MAX_TOKENS_WEIGHT_THRESHOLD) { + throw new MapperParsingException( + "[" + + PRUNING_CONFIG_FIELD + + "] field [" + + TOKENS_WEIGHT_THRESHOLD.getPreferredName() + + "] field should be a number between 0.0 and 1.0" + ); + } + return tokensWeightThreshold; + } + + private 
static Float parseTokensFreqRatioThreshold(Object mappedTokensFreqRatioThreshold) { + if (mappedTokensFreqRatioThreshold == null) { + return DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD; + } + + Float tokensFreqRatioThreshold = parseFloatNumberFromObject(mappedTokensFreqRatioThreshold); + + if (tokensFreqRatioThreshold == null) { + throw new MapperParsingException( + "[" + + PRUNING_CONFIG_FIELD + + "] field [" + + TOKENS_FREQ_RATIO_THRESHOLD.getPreferredName() + + "] field should be a number between 1 and 100" + ); + } + + if (tokensFreqRatioThreshold < MIN_TOKENS_FREQ_RATIO_THRESHOLD || tokensFreqRatioThreshold > MAX_TOKENS_FREQ_RATIO_THRESHOLD) { + throw new MapperParsingException( + "[" + + PRUNING_CONFIG_FIELD + + "] field [" + + TOKENS_FREQ_RATIO_THRESHOLD.getPreferredName() + + "] field should be a number between 1 and 100" + ); + } + + return tokensFreqRatioThreshold; + } + + private static boolean parseScorePrunedTokens(Object mappedScorePrunedTokens) { + if (mappedScorePrunedTokens == null) { + return false; + } + + if (mappedScorePrunedTokens instanceof Boolean boolValue) { + return boolValue; + } + + throw new MapperParsingException( + "[" + PRUNING_CONFIG_FIELD + "] field [" + ONLY_SCORE_PRUNED_TOKENS_FIELD.getPreferredName() + "] field should be true or false" + ); + } } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index 2cec3a9182f82..fbfe65128a769 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -72,8 +72,8 @@ protected void mappingWithDefaultIndexOptions(XContentBuilder b) throws IOExcept b.startObject("index_options"); b.field("prune", true); b.startObject("pruning_config"); - b.field("tokens_freq_ratio_threshold", 
SparseVectorFieldMapper.PruningConfig.DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD); - b.field("tokens_weight_threshold", SparseVectorFieldMapper.PruningConfig.DEFAULT_TOKENS_WEIGHT_THRESHOLD); + b.field("tokens_freq_ratio_threshold", TokenPruningConfig.DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD); + b.field("tokens_weight_threshold", TokenPruningConfig.DEFAULT_TOKENS_WEIGHT_THRESHOLD); b.endObject(); b.endObject(); } diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java index 5bd4d97192900..0a5f31f1ba7d9 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java @@ -22,6 +22,7 @@ import org.elasticsearch.index.mapper.Mapping; import org.elasticsearch.index.mapper.RootObjectMapper; import org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper; +import org.elasticsearch.index.mapper.vectors.TokenPruningConfig; import org.elasticsearch.index.query.AbstractQueryBuilder; import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.index.query.QueryRewriteContext; @@ -431,11 +432,11 @@ private IndexFieldPruningSettings getIndexFieldPruningSettings(SparseVectorField Boolean indexOptionsPrune = indexOptions.getPrune(); - SparseVectorFieldMapper.PruningConfig indexPruningConfig = indexOptions.getPruningConfig(); + TokenPruningConfig indexPruningConfig = indexOptions.getPruningConfig(); TokenPruningConfig indexTokenPruningConfig = indexPruningConfig != null ? 
new TokenPruningConfig( - indexPruningConfig.getTokensFreqRatioThresholdOrDefault(), - indexPruningConfig.getTokensWeightThresholdOrDefault(), + indexPruningConfig.getTokensFreqRatioThreshold(), + indexPruningConfig.getTokensWeightThreshold(), false ) : null; diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/TextExpansionQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/TextExpansionQueryBuilder.java index 81758ec5f9342..472cb5e1c7012 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/TextExpansionQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/TextExpansionQueryBuilder.java @@ -18,6 +18,7 @@ import org.elasticsearch.common.logging.DeprecationCategory; import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.core.Nullable; +import org.elasticsearch.index.mapper.vectors.TokenPruningConfig; import org.elasticsearch.index.query.AbstractQueryBuilder; import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.index.query.QueryBuilders; diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/WeightedTokensQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/WeightedTokensQueryBuilder.java index f41fcd77ce627..133b84cbb85a8 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/WeightedTokensQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/WeightedTokensQueryBuilder.java @@ -19,6 +19,7 @@ import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.core.Nullable; import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.vectors.TokenPruningConfig; import org.elasticsearch.index.query.AbstractQueryBuilder; import org.elasticsearch.index.query.SearchExecutionContext; import 
org.elasticsearch.xcontent.ParseField; diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/WeightedTokensUtils.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/WeightedTokensUtils.java index 1c2ac23151e6e..87a548a553365 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/WeightedTokensUtils.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/WeightedTokensUtils.java @@ -15,6 +15,7 @@ import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.Query; import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.vectors.TokenPruningConfig; import org.elasticsearch.index.query.SearchExecutionContext; import java.io.IOException; diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java index b8fb33feb7bf4..e3058853cf139 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java @@ -30,6 +30,7 @@ import org.elasticsearch.index.IndexVersions; import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.extras.MapperExtrasPlugin; +import org.elasticsearch.index.mapper.vectors.TokenPruningConfig; import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.index.query.SearchExecutionContext; import org.elasticsearch.plugins.Plugin; diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/TextExpansionQueryBuilderTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/TextExpansionQueryBuilderTests.java index a0263003b72db..5e1a3fd5ed0ce 100644 --- 
a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/TextExpansionQueryBuilderTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/TextExpansionQueryBuilderTests.java @@ -25,6 +25,7 @@ import org.elasticsearch.common.compress.CompressedXContent; import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.extras.MapperExtrasPlugin; +import org.elasticsearch.index.mapper.vectors.TokenPruningConfig; import org.elasticsearch.index.query.BoolQueryBuilder; import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.index.query.SearchExecutionContext; diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/TokenPruningConfigTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/TokenPruningConfigTests.java index 8cdf44ae51dd4..095c88923bb5a 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/TokenPruningConfigTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/TokenPruningConfigTests.java @@ -8,6 +8,7 @@ package org.elasticsearch.xpack.core.ml.search; import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.index.mapper.vectors.TokenPruningConfig; import org.elasticsearch.test.AbstractXContentSerializingTestCase; import org.elasticsearch.xcontent.XContentParser; diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/WeightedTokensQueryBuilderTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/WeightedTokensQueryBuilderTests.java index cded9b8dce5e2..76a782c94f9e0 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/WeightedTokensQueryBuilderTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/WeightedTokensQueryBuilderTests.java @@ -27,6 +27,7 @@ import 
org.elasticsearch.common.compress.CompressedXContent; import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.extras.MapperExtrasPlugin; +import org.elasticsearch.index.mapper.vectors.TokenPruningConfig; import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.index.query.SearchExecutionContext; import org.elasticsearch.plugins.Plugin; From 1f0718de86b1fd913b4925bb596782ca6f1f382f Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Thu, 1 May 2025 15:08:24 +0000 Subject: [PATCH 26/86] [CI] Auto commit changes from spotless --- .../xpack/core/ml/search/SparseVectorQueryBuilder.java | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java index 0a5f31f1ba7d9..e97a2f442b62e 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java @@ -434,11 +434,7 @@ private IndexFieldPruningSettings getIndexFieldPruningSettings(SparseVectorField TokenPruningConfig indexPruningConfig = indexOptions.getPruningConfig(); TokenPruningConfig indexTokenPruningConfig = indexPruningConfig != null - ? new TokenPruningConfig( - indexPruningConfig.getTokensFreqRatioThreshold(), - indexPruningConfig.getTokensWeightThreshold(), - false - ) + ? new TokenPruningConfig(indexPruningConfig.getTokensFreqRatioThreshold(), indexPruningConfig.getTokensWeightThreshold(), false) : null; return new IndexFieldPruningSettings(indexOptionsPrune, indexTokenPruningConfig); From 404e6453d29a423570337348c8ec7ab72ee98fd3 Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Thu, 1 May 2025 12:10:06 -0400 Subject: [PATCH 27/86] cleanup existing Yaml tests via teardown --- .../test/cluster.stats/10_basic.yml | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.stats/10_basic.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.stats/10_basic.yml index a1a8fa206a38b..c14026257bdd9 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.stats/10_basic.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.stats/10_basic.yml @@ -1,3 +1,15 @@ + +--- +teardown: + # ensure indices are cleaned up after each test + # mainly for the sparse vector tests + - do: + indices.delete: + index: ["test1", "test2"] + ignore: 404 + - do: + indices.refresh: { } + --- "cluster stats test": - do: @@ -353,16 +365,6 @@ - match: { indices.dense_vector.value_count: 8 } - is_false: indices.dense_vector.off_heap - # ensure indices are cleaned up - - do: - indices.delete: - index: test1 - - do: - indices.delete: - index: test2 - - do: - indices.refresh: { } - --- "Sparse vector stats": - requires: From bdfcf5ecf6f6a8a2cd4609f370eb82aad6f95863 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Thu, 1 May 2025 13:03:42 -0400 Subject: [PATCH 28/86] fix lint --- .../index/mapper/vectors/TokenPruningConfig.java | 7 ------- 1 file changed, 7 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java index 43bf2d81ec433..c0441af87a447 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java @@ -7,13 +7,6 @@ * License v3.0 only", or the "Server Side Public License, v 1". */ -/* - * Copyright Elasticsearch B.V. 
and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License - * 2.0; you may not use this file except in compliance with the Elastic License - * 2.0. - */ - package org.elasticsearch.index.mapper.vectors; import org.elasticsearch.common.ParsingException; From f27dfb82bc8d3cc50ab8749c716b6798a9af3cdd Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Thu, 1 May 2025 15:15:23 -0400 Subject: [PATCH 29/86] add node feature and simple yml test --- .../test/search.vectors/90_sparse_vector.yml | 53 +++++++++++++++++++ .../vectors/SparseVectorFieldMapper.java | 3 ++ 2 files changed, 56 insertions(+) diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/90_sparse_vector.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/90_sparse_vector.yml index 0b65a69bf500e..a76e40edcaef5 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/90_sparse_vector.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/90_sparse_vector.yml @@ -589,3 +589,56 @@ - match: { hits.hits.0.fields.ml\\.tokens.0.run: 5.0 } - match: { hits.hits.0.fields.ml\\.tokens.0.race: 7.0 } - match: { hits.hits.0.fields.ml\\.tokens.0.for: 9.0 } + +--- +"sparse_vector token pruning index options support": + - requires: + cluster_features: 'sparse_vector_index_options_supported' + reason: "sparse_vector token pruning index options added support in 9.1" + + - do: + indices.create: + index: test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + prune: true + pruning_config: + tokens_freq_ratio_threshold: 1.0 + tokens_weight_threshold: 0.4 + embeddings: + type: sparse_vector + + - match: { acknowledged: true } + + - do: + index: + index: test + id: "1" + body: + text: "running is good for you" + ml: + tokens: + running: 2.4097164 + good: 2.170997 + run: 2.052153 + race: 
1.4575411 + for: 1.1908325 + runner: 1.1803857 + exercise: 1.1652642 + you: 0.9654308 + training: 0.94999343 + sports: 0.93650943 + fitness: 0.83129317 + best: 0.820365 + bad: 0.7385934 + health: 0.7098149 + marathon: 0.61555296 + gym: 0.5652374 + + - match: { result: "created" } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index 10191942ced4f..1e9b8b9963a50 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -24,6 +24,7 @@ import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.xcontent.support.XContentMapValues; import org.elasticsearch.core.Nullable; +import org.elasticsearch.features.NodeFeature; import org.elasticsearch.index.IndexVersion; import org.elasticsearch.index.IndexVersions; import org.elasticsearch.index.analysis.NamedAnalyzer; @@ -76,6 +77,8 @@ public class SparseVectorFieldMapper extends FieldMapper { private final SparseVectorFieldMapper.IndexOptions indexOptions; + public static final NodeFeature SPARSE_VECTOR_INDEX_OPTIONS_FEATURE = new NodeFeature("sparse_vector_index_options_supported"); + private static SparseVectorFieldMapper toType(FieldMapper in) { return (SparseVectorFieldMapper) in; } From 65c514786a271e7781af66141825d0f408157880 Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Fri, 2 May 2025 09:10:19 -0400 Subject: [PATCH 30/86] remove checked-in test - moving it / add elsewhere --- .../test/search.vectors/90_sparse_vector.yml | 53 ------------------- 1 file changed, 53 deletions(-) diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/90_sparse_vector.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/90_sparse_vector.yml index a76e40edcaef5..0b65a69bf500e 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/90_sparse_vector.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/90_sparse_vector.yml @@ -589,56 +589,3 @@ - match: { hits.hits.0.fields.ml\\.tokens.0.run: 5.0 } - match: { hits.hits.0.fields.ml\\.tokens.0.race: 7.0 } - match: { hits.hits.0.fields.ml\\.tokens.0.for: 9.0 } - ---- -"sparse_vector token pruning index options support": - - requires: - cluster_features: 'sparse_vector_index_options_supported' - reason: "sparse_vector token pruning index options added support in 9.1" - - - do: - indices.create: - index: test - body: - mappings: - properties: - text: - type: text - ml.tokens: - type: sparse_vector - index_options: - prune: true - pruning_config: - tokens_freq_ratio_threshold: 1.0 - tokens_weight_threshold: 0.4 - embeddings: - type: sparse_vector - - - match: { acknowledged: true } - - - do: - index: - index: test - id: "1" - body: - text: "running is good for you" - ml: - tokens: - running: 2.4097164 - good: 2.170997 - run: 2.052153 - race: 1.4575411 - for: 1.1908325 - runner: 1.1803857 - exercise: 1.1652642 - you: 0.9654308 - training: 0.94999343 - sports: 0.93650943 - fitness: 0.83129317 - best: 0.820365 - bad: 0.7385934 - health: 0.7098149 - marathon: 0.61555296 - gym: 0.5652374 - - - match: { result: "created" } From fc78b0f469ac58d9492602029c3bdf22e5391ff9 Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Fri, 2 May 2025 13:10:23 -0400 Subject: [PATCH 31/86] cleanups; start of yamlRestTests --- .../vectors/SparseVectorFieldMapper.java | 184 ++----------- .../vectors/SparseVectorFieldMapperTests.java | 8 +- .../xpack/core/XPackFeatures.java | 7 + .../test/ml/sparse_vector_search.yml | 251 ++++++++++++++++++ 4 files changed, 276 insertions(+), 174 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index 1e9b8b9963a50..0b63a64d559d0 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -117,24 +117,10 @@ protected Parameter[] getParameters() { @Override public SparseVectorFieldMapper build(MapperBuilderContext context) { - IndexOptions buildIndexOptions = indexOptions.getValue(); - - if (buildIndexOptions == null) { - buildIndexOptions = new IndexOptions( - true, - new TokenPruningConfig( - TokenPruningConfig.DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD, - TokenPruningConfig.DEFAULT_TOKENS_WEIGHT_THRESHOLD, - false - ) - ); - } - return new SparseVectorFieldMapper( leafName(), - new SparseVectorFieldType(context.buildFullName(leafName()), stored.getValue(), meta.getValue(), buildIndexOptions), - builderParams(this, context), - buildIndexOptions + new SparseVectorFieldType(context.buildFullName(leafName()), stored.getValue(), meta.getValue(), indexOptions.getValue()), + builderParams(this, context) ); } } @@ -158,6 +144,10 @@ private static SparseVectorFieldMapper.IndexOptions getDefaultIndexOptions(Mappi } private static SparseVectorFieldMapper.IndexOptions parseIndexOptions(MappingParserContext context, Object propNode) { + if (propNode == null) { + return getDefaultIndexOptions(context); + } + Map indexOptionsMap = XContentMapValues.nodeMapValue(propNode, 
SPARSE_VECTOR_INDEX_OPTIONS); Boolean prune = IndexOptions.parseIndexOptionsPruneValue(indexOptionsMap); @@ -198,6 +188,10 @@ public SparseVectorFieldType( this.indexOptions = indexOptions; } + public IndexOptions getIndexOptions() { + return indexOptions; + } + @Override public String typeName() { return CONTENT_TYPE; @@ -241,14 +235,9 @@ private static String indexedValueForSearch(Object value) { } } - private SparseVectorFieldMapper( - String simpleName, - MappedFieldType mappedFieldType, - BuilderParams builderParams, - @Nullable IndexOptions indexOptions - ) { + private SparseVectorFieldMapper(String simpleName, MappedFieldType mappedFieldType, BuilderParams builderParams) { super(simpleName, mappedFieldType, builderParams); - this.indexOptions = indexOptions; + this.indexOptions = ((SparseVectorFieldType) mappedFieldType).getIndexOptions(); } @Override @@ -513,11 +502,11 @@ public static Boolean parseIndexOptionsPruneValue(Map indexOptio return null; } - if ((shouldPrune instanceof Boolean) == false) { - throw new MapperParsingException("[index_options] field [prune] should be true or false"); + if (shouldPrune instanceof Boolean boolValue) { + return boolValue; } - return ((Boolean) shouldPrune); + throw new MapperParsingException("[index_options] field [prune] should be true or false"); } public static TokenPruningConfig parseIndexOptionsPruningConfig(Boolean prune, Map indexOptionsMap) { @@ -538,147 +527,4 @@ public static TokenPruningConfig parseIndexOptionsPruningConfig(Boolean prune, M return TokenPruningConfig.parseFromMap(pruningConfigurationMap); } } - - /* - public static class PruningConfig implements ToXContent { - public static final String TOKENS_FREQ_RATIO_THRESHOLD_FIELD_NAME = "tokens_freq_ratio_threshold"; - public static final String TOKENS_WEIGHT_THRESHOLD_FIELD_NAME = "tokens_weight_threshold"; - - public static float DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD = 5; - public static float MIN_TOKENS_FREQ_RATIO_THRESHOLD = 1; - public static float 
MAX_TOKENS_FREQ_RATIO_THRESHOLD = 100; - - public static float DEFAULT_TOKENS_WEIGHT_THRESHOLD = 0.4f; - public static float MIN_TOKENS_WEIGHT_THRESHOLD = 0.0f; - public static float MAX_TOKENS_WEIGHT_THRESHOLD = 1.0f; - - final Float tokens_freq_ratio_threshold; - final Float tokens_weight_threshold; - - PruningConfig(@Nullable Float tokens_freq_ratio_threshold, @Nullable Float tokens_weight_threshold) { - this.tokens_freq_ratio_threshold = tokens_freq_ratio_threshold; - this.tokens_weight_threshold = tokens_weight_threshold; - } - - public float getTokensFreqRatioThresholdOrDefault() { - if (tokens_freq_ratio_threshold == null) { - return DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD; - } - return tokens_freq_ratio_threshold; - } - - public float getTokensWeightThresholdOrDefault() { - if (tokens_weight_threshold == null) { - return DEFAULT_TOKENS_WEIGHT_THRESHOLD; - } - return tokens_weight_threshold; - } - - @Override - public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { - builder.startObject(); - if (tokens_freq_ratio_threshold != null) { - builder.field(TOKENS_FREQ_RATIO_THRESHOLD_FIELD_NAME, tokens_freq_ratio_threshold); - } - if (tokens_weight_threshold != null) { - builder.field(TOKENS_WEIGHT_THRESHOLD_FIELD_NAME, tokens_weight_threshold); - } - builder.endObject(); - return builder; - } - - @Override - public final boolean equals(Object other) { - if (other == this) { - return true; - } - if (other instanceof PruningConfig otherConfig) { - return Objects.equals(tokens_freq_ratio_threshold, otherConfig.tokens_freq_ratio_threshold) - && Objects.equals(tokens_weight_threshold, otherConfig.tokens_weight_threshold); - } - return false; - } - - @Override - public final int hashCode() { - return Objects.hash(tokens_freq_ratio_threshold, tokens_weight_threshold); - } - - public static PruningConfig parsePruningConfig(Object pruningConfiguration) { - Map pruningConfigMap = XContentMapValues.nodeMapValue(pruningConfiguration, 
SPARSE_VECTOR_INDEX_OPTIONS); - - Object mappedTokensFreqRatioThreshold = pruningConfigMap.remove(TOKENS_FREQ_RATIO_THRESHOLD_FIELD_NAME); - Object mappedTokensWeightThreshold = pruningConfigMap.remove(TOKENS_WEIGHT_THRESHOLD_FIELD_NAME); - - if (pruningConfigMap.isEmpty() == false) { - throw new MapperParsingException("[index_options] field [pruning_config] has unknown fields"); - } - - Float tokensFreqRatioThreshold = parseTokensFreqRatioThreshold(mappedTokensFreqRatioThreshold); - Float tokensWeightThreshold = parseTokensWeightThreshold(mappedTokensWeightThreshold); - - if (tokensFreqRatioThreshold != null || tokensWeightThreshold != null) { - return new PruningConfig(tokensFreqRatioThreshold, tokensWeightThreshold); - } - - return null; - } - - private static Float parseFloatNumberFromObject(Object numberObject) { - if (numberObject instanceof Integer intValue) { - return (float) intValue; - } else if (numberObject instanceof Float floatValue) { - return floatValue; - } else if (numberObject instanceof Double doubleValue) { - return ((Double) numberObject).floatValue(); - } - return null; - } - - private static Float parseTokensWeightThreshold(Object mappedTokensWeightThreshold) { - if (mappedTokensWeightThreshold == null) { - return null; - } - - Float tokensWeightThreshold = parseFloatNumberFromObject(mappedTokensWeightThreshold); - - if (tokensWeightThreshold == null) { - throw new MapperParsingException( - "[pruning_config] field [tokens_weight_threshold] field should be a number between 0.0 and 1.0" - ); - } - - if (tokensWeightThreshold < PruningConfig.MIN_TOKENS_WEIGHT_THRESHOLD - || tokensWeightThreshold > PruningConfig.MAX_TOKENS_WEIGHT_THRESHOLD) { - throw new MapperParsingException( - "[pruning_config] field [tokens_weight_threshold] field should be a number between 0.0 and 1.0" - ); - } - return tokensWeightThreshold; - } - - private static Float parseTokensFreqRatioThreshold(Object mappedTokensFreqRatioThreshold) { - if 
(mappedTokensFreqRatioThreshold == null) { - return null; - } - - Float tokensFreqRatioThreshold = parseFloatNumberFromObject(mappedTokensFreqRatioThreshold); - - if (tokensFreqRatioThreshold == null) { - throw new MapperParsingException( - "[pruning_config] field [tokens_freq_ratio_threshold] field should be a number between 1 and 100" - ); - } - - if (tokensFreqRatioThreshold < PruningConfig.MIN_TOKENS_FREQ_RATIO_THRESHOLD - || tokensFreqRatioThreshold > PruningConfig.MAX_TOKENS_FREQ_RATIO_THRESHOLD) { - throw new MapperParsingException( - "[pruning_config] field [tokens_freq_ratio_threshold] field should be a number between 1 and 100" - ); - } - - return tokensFreqRatioThreshold; - } - } - */ } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index fbfe65128a769..e52f12f1489ed 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -124,7 +124,7 @@ private static int getFrequency(TokenStream tk) throws IOException { public void testDefaults() throws Exception { DocumentMapper mapper = createDocumentMapper(fieldMapping(this::minimalMapping)); - assertEquals(Strings.toString(fieldMapping(this::mappingWithDefaultIndexOptions)), mapper.mappingSource().toString()); + assertEquals(Strings.toString(fieldMapping(this::minimalMapping)), mapper.mappingSource().toString()); ParsedDocument doc1 = mapper.parse(source(this::writeField)); @@ -150,9 +150,7 @@ public void testDefaults() throws Exception { public void testWithIndexOptionsPrune() throws Exception { DocumentMapper mapper = createDocumentMapper(fieldMapping(this::mappingWithIndexOptionsPrune)); - String expectedMapping = "{\"_doc\":{\"properties\":{\"field\":{\"type\":\"sparse_vector\",\"index_options\":" - + 
"{\"prune\":true}}}}}"; - assertEquals(expectedMapping, mapper.mappingSource().toString()); + assertEquals(Strings.toString(fieldMapping(this::mappingWithIndexOptionsPrune)), mapper.mappingSource().toString()); ParsedDocument doc1 = mapper.parse(source(this::writeField)); @@ -176,7 +174,7 @@ public void testWithIndexOptionsPrune() throws Exception { assertTrue(freq1 < freq2); } - public void testWithIndexOptionsPruningConfig() throws Exception { + public void testWithIndexOptionsPruningConfigOnly() throws Exception { DocumentMapper mapper = createDocumentMapper(fieldMapping(this::mappingWithIndexOptionsPruningConfig)); assertEquals(Strings.toString(fieldMapping(this::mappingWithIndexOptionsPruningConfig)), mapper.mappingSource().toString()); diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackFeatures.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackFeatures.java index 42824a553d2bd..ad0f38d975b86 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackFeatures.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackFeatures.java @@ -12,6 +12,8 @@ import java.util.Set; +import static org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper.SPARSE_VECTOR_INDEX_OPTIONS_FEATURE; + /** * Provides the XPack features that this version of the code supports */ @@ -20,4 +22,9 @@ public class XPackFeatures implements FeatureSpecification { public Set getFeatures() { return Set.of(); } + + @Override + public Set getTestFeatures() { + return Set.of(SPARSE_VECTOR_INDEX_OPTIONS_FEATURE); + } } diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml index 3481773b0bab3..83d623a7f2aeb 100644 --- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml +++ 
b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml @@ -89,6 +89,29 @@ setup: model_id: text_expansion_model wait_for: started +--- +teardown: + - requires: + cluster_features: 'sparse_vector_index_options_supported' + reason: "sparse_vector token pruning index options added support in 9.1" + - skip: + features: headers + + # ensure indices are cleaned up after each test + # mainly for the sparse vector token pruning tests + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser + Content-Type: application/json + indices.delete: + index: ["sparse_vector_pruning_test"] + ignore: 404 + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser + Content-Type: application/json + indices.refresh: { } + --- "Test sparse_vector search": - do: @@ -510,3 +533,231 @@ setup: - match: { hits.hits.0._score: 4.0 } - match: { hits.hits.1._id: "parent-foo-bar" } - match: { hits.hits.1._score: 2.0 } + +--- +"Check sparse_vector token pruning index_options mappings": + + - requires: + cluster_features: 'sparse_vector_index_options_supported' + reason: "sparse_vector token pruning index options added support in 9.1" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. 
the test setup superuser + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + prune: true + pruning_config: + tokens_freq_ratio_threshold: 1.0 + tokens_weight_threshold: 0.4 + embeddings: + type: sparse_vector + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser + Content-Type: application/json + indices.get_mapping: + index: sparse_vector_pruning_test + + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.prune: true } + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_freq_ratio_threshold: 1.0 } + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_weight_threshold: 0.4 } + +--- +"Check sparse_vector token pruning index_options mappings defaults": + + - requires: + cluster_features: 'sparse_vector_index_options_supported' + reason: "sparse_vector token pruning index options added support in 9.1" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + embeddings: + type: sparse_vector + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. 
the test setup superuser + Content-Type: application/json + indices.get_mapping: + index: sparse_vector_pruning_test + + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.prune: null } + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_freq_ratio_threshold: null } + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_weight_threshold: null } + +--- +"Check sparse_vector token pruning index_options prune default true allow config": + + - requires: + cluster_features: 'sparse_vector_index_options_supported' + reason: "sparse_vector token pruning index options added support in 9.1" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + pruning_config: + tokens_freq_ratio_threshold: 16.5 + tokens_weight_threshold: 0.2 + embeddings: + type: sparse_vector + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. 
the test setup superuser + Content-Type: application/json + indices.get_mapping: + index: sparse_vector_pruning_test + + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.prune: null } + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_freq_ratio_threshold: 16.5 } + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_weight_threshold: 0.2 } + +--- +"Check sparse_vector token pruning index_options prune false do not allow config": + + - requires: + cluster_features: 'sparse_vector_index_options_supported' + reason: "sparse_vector token pruning index options added support in 9.1" + - skip: + features: headers + + - do: + catch: bad_request + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + prune: false + pruning_config: + tokens_freq_ratio_threshold: 1.0 + tokens_weight_threshold: 0.4 + embeddings: + type: sparse_vector + + - match: { status: 400 } + - match: { error.type: "mapper_parsing_exception" } + - match: { error.reason: "Failed to parse mapping: [index_options] field [pruning_config] should not be set if [prune] is false" } + +--- +"Check sparse_vector token pruning index_options tokens freq out of bounds": + + - requires: + cluster_features: 'sparse_vector_index_options_supported' + reason: "sparse_vector token pruning index options added support in 9.1" + - skip: + features: headers + + - do: + catch: bad_request + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. 
the test setup superuser + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + prune: true + pruning_config: + tokens_freq_ratio_threshold: 101.0 + tokens_weight_threshold: 0.4 + embeddings: + type: sparse_vector + + - match: { status: 400 } + - match: { error.type: "mapper_parsing_exception" } + - match: { error.reason: "Failed to parse mapping: [pruning_config] field [tokens_freq_ratio_threshold] field should be a number between 1 and 100" } + +--- +"Check sparse_vector token pruning index_options tokens weight out of bounds": + + - requires: + cluster_features: 'sparse_vector_index_options_supported' + reason: "sparse_vector token pruning index options added support in 9.1" + - skip: + features: headers + + - do: + catch: bad_request + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + prune: true + pruning_config: + tokens_freq_ratio_threshold: 5.0 + tokens_weight_threshold: 3.5 + embeddings: + type: sparse_vector + + - match: { status: 400 } + - match: { error.type: "mapper_parsing_exception" } + - match: { error.reason: "Failed to parse mapping: [pruning_config] field [tokens_weight_threshold] field should be a number between 0.0 and 1.0" } From 0b17c16b3784a8f78357a3ec1f0885b7d1d174eb Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Mon, 5 May 2025 11:41:06 -0400 Subject: [PATCH 32/86] ensure query index version; add test index options --- .../vectors/SparseVectorFieldMapper.java | 2 +- .../ml/search/SparseVectorQueryBuilder.java | 20 ++- .../test/ml/sparse_vector_search.yml | 117 +++++++++++++++++- 3 files changed, 131 insertions(+), 8 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index 0b63a64d559d0..da81e20a3bd02 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -73,7 +73,7 @@ public class SparseVectorFieldMapper extends FieldMapper { static final IndexVersion NEW_SPARSE_VECTOR_INDEX_VERSION = IndexVersions.NEW_SPARSE_VECTOR; static final IndexVersion SPARSE_VECTOR_IN_FIELD_NAMES_INDEX_VERSION = IndexVersions.SPARSE_VECTOR_IN_FIELD_NAMES_SUPPORT; - static final IndexVersion SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION = IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT; + public static final IndexVersion SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION = IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT; private final SparseVectorFieldMapper.IndexOptions indexOptions; diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java index e97a2f442b62e..2edf1e4ddee67 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java @@ -237,13 +237,21 @@ protected Query doToQuery(SearchExecutionContext context) throws IOException { // if the query options for 
pruning are not set, // we need to check the index options for this field - // and use those if set. - SparseVectorFieldMapper sparseVectorFieldMapper = getSparseVectorFieldMapperForSearchExecution(fieldName, context); - TokenPruningSet pruningOptions = setPruningConfigFromIndexIfNeeded(shouldPruneTokens, tokenPruningConfig, sparseVectorFieldMapper); + // and use those if set - however, only if the index + // was created after we added this support. + if (context.indexVersionCreated().onOrAfter(SparseVectorFieldMapper.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION)) { + SparseVectorFieldMapper sparseVectorFieldMapper = getSparseVectorFieldMapperForSearchExecution(fieldName, context); + TokenPruningSet pruningOptions = + setPruningConfigFromIndexIfNeeded(shouldPruneTokens, tokenPruningConfig, sparseVectorFieldMapper); + + return pruningOptions.pruneTokens + ? WeightedTokensUtils.queryBuilderWithPrunedTokens(fieldName, pruningOptions.pruningConfig, queryVectors, ft, context) + : WeightedTokensUtils.queryBuilderWithAllTokens(fieldName, queryVectors, ft, context); + } - return pruningOptions.pruneTokens - ? WeightedTokensUtils.queryBuilderWithPrunedTokens(fieldName, pruningOptions.pruningConfig, queryVectors, ft, context) - : WeightedTokensUtils.queryBuilderWithAllTokens(fieldName, queryVectors, ft, context); + return (shouldPruneTokens != null && shouldPruneTokens) + ? 
WeightedTokensUtils.queryBuilderWithPrunedTokens(fieldName, tokenPruningConfig, queryVectors, ft, context) + : WeightedTokensUtils.queryBuilderWithAllTokens(fieldName, queryVectors, ft, context); } @Override diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml index 83d623a7f2aeb..086886b4fc131 100644 --- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml +++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml @@ -104,7 +104,7 @@ teardown: Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser Content-Type: application/json indices.delete: - index: ["sparse_vector_pruning_test"] + index: ["sparse_vector_pruning_test", "test-sparse-vector-without-pruning", "test-sparse-vector-with-pruning"] ignore: 404 - do: headers: @@ -761,3 +761,118 @@ teardown: - match: { status: 400 } - match: { error.type: "mapper_parsing_exception" } - match: { error.reason: "Failed to parse mapping: [pruning_config] field [tokens_weight_threshold] field should be a number between 0.0 and 1.0" } + +--- +"Check sparse_vector token pruning index_options in query": + + - requires: + cluster_features: 'sparse_vector_index_options_supported' + reason: "sparse_vector token pruning index options added support in 9.1" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. 
the test setup superuser + Content-Type: application/json + indices.create: + index: test-sparse-vector-with-pruning + body: + mappings: + properties: + content_embedding: + type: sparse_vector + index_options: + prune: true + pruning_config: + tokens_freq_ratio_threshold: 1 + tokens_weight_threshold: 1.0 + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser + Content-Type: application/json + indices.create: + index: test-sparse-vector-without-pruning + body: + mappings: + properties: + content_embedding: + type: sparse_vector + index_options: + prune: false + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser + Content-Type: application/json + bulk: + index: test-sparse-vector-with-pruning + refresh: true + body: | + {"index": { "_id": "1" }} + {"content_embedding":{"cheese": 2.671405,"is": 0.11809908,"comet": 0.26088917}} + {"index": { "_id": "2" }} + {"content_embedding":{"planet": 2.3438394,"is": 0.54600334,"astronomy": 0.36015007,"moon": 0.20022368}} + {"index": { "_id": "3" }} + {"content_embedding":{"is": 0.6891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. 
the test setup superuser + Content-Type: application/json + bulk: + index: test-sparse-vector-without-pruning + refresh: true + body: | + {"index": { "_id": "1" }} + {"content_embedding":{"cheese": 2.671405,"is": 0.11809908,"comet": 0.26088917}} + {"index": { "_id": "2" }} + {"content_embedding":{"planet": 2.3438394,"is": 0.54600334,"astronomy": 0.36015007,"moon": 0.20022368}} + {"index": { "_id": "3" }} + {"content_embedding":{"is": 0.6891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + + - do: + search: + index: test-sparse-vector-without-pruning + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + + - match: { hits.total.value: 3 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } + - match: { hits.hits.2._id: "2" } + + - do: + search: + index: test-sparse-vector-with-pruning + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } + + From 08d51c93038ad0c0ad22d2a40be5ce5863696b4a Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Mon, 5 May 2025 15:49:22 +0000 Subject: [PATCH 33/86] [CI] Auto commit changes from spotless --- .../mapper/vectors/SparseVectorFieldMapper.java | 3 ++- .../core/ml/search/SparseVectorQueryBuilder.java | 15 +++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index da81e20a3bd02..95a215cc622a2 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ 
b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -73,7 +73,8 @@ public class SparseVectorFieldMapper extends FieldMapper { static final IndexVersion NEW_SPARSE_VECTOR_INDEX_VERSION = IndexVersions.NEW_SPARSE_VECTOR; static final IndexVersion SPARSE_VECTOR_IN_FIELD_NAMES_INDEX_VERSION = IndexVersions.SPARSE_VECTOR_IN_FIELD_NAMES_SUPPORT; - public static final IndexVersion SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION = IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT; + public static final IndexVersion SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION = + IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT; private final SparseVectorFieldMapper.IndexOptions indexOptions; diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java index 2edf1e4ddee67..9500139fe18ec 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java @@ -241,17 +241,20 @@ protected Query doToQuery(SearchExecutionContext context) throws IOException { // was created after we added this support. if (context.indexVersionCreated().onOrAfter(SparseVectorFieldMapper.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION)) { SparseVectorFieldMapper sparseVectorFieldMapper = getSparseVectorFieldMapperForSearchExecution(fieldName, context); - TokenPruningSet pruningOptions = - setPruningConfigFromIndexIfNeeded(shouldPruneTokens, tokenPruningConfig, sparseVectorFieldMapper); + TokenPruningSet pruningOptions = setPruningConfigFromIndexIfNeeded( + shouldPruneTokens, + tokenPruningConfig, + sparseVectorFieldMapper + ); return pruningOptions.pruneTokens - ? 
WeightedTokensUtils.queryBuilderWithPrunedTokens(fieldName, pruningOptions.pruningConfig, queryVectors, ft, context) - : WeightedTokensUtils.queryBuilderWithAllTokens(fieldName, queryVectors, ft, context); + ? WeightedTokensUtils.queryBuilderWithPrunedTokens(fieldName, pruningOptions.pruningConfig, queryVectors, ft, context) + : WeightedTokensUtils.queryBuilderWithAllTokens(fieldName, queryVectors, ft, context); } return (shouldPruneTokens != null && shouldPruneTokens) - ? WeightedTokensUtils.queryBuilderWithPrunedTokens(fieldName, tokenPruningConfig, queryVectors, ft, context) - : WeightedTokensUtils.queryBuilderWithAllTokens(fieldName, queryVectors, ft, context); + ? WeightedTokensUtils.queryBuilderWithPrunedTokens(fieldName, tokenPruningConfig, queryVectors, ft, context) + : WeightedTokensUtils.queryBuilderWithAllTokens(fieldName, queryVectors, ft, context); } @Override From ae3484157e959ebdad3e877cfdd51aff18e32043 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Mon, 5 May 2025 13:57:34 -0400 Subject: [PATCH 34/86] add yaml tests for ml multi/remote clusters --- .../test/multi_cluster/50_sparse_vector.yml | 357 ++++++++++++++++++ .../test/remote_cluster/50_sparse_vector.yml | 357 ++++++++++++++++++ 2 files changed, 714 insertions(+) diff --git a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml index 20ab78a48ae1b..6f5e97e23441a 100644 --- a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml +++ b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml @@ -113,6 +113,20 @@ teardown: model_id: "text_expansion_model" ignore: 404 + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" 
#test_user credentials + Content-Type: application/json + indices.delete: + index: ["sparse_vector_pruning_test", "test-sparse-vector-without-pruning", "test-sparse-vector-with-pruning"] + ignore: 404 + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.refresh: { } + --- "Test sparse_vector search": - do: @@ -184,3 +198,346 @@ teardown: - match: { hits.total.value: 5 } - match: { hits.hits.0._source.source_text: "the octopus comforter smells" } + +--- +"Check sparse_vector token pruning index_options mappings": + + - requires: + cluster_features: 'sparse_vector_index_options_supported' + reason: "sparse_vector token pruning index options added support in 9.1" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + prune: true + pruning_config: + tokens_freq_ratio_threshold: 1.0 + tokens_weight_threshold: 0.4 + embeddings: + type: sparse_vector + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.get_mapping: + index: sparse_vector_pruning_test + + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.prune: true } + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_freq_ratio_threshold: 1.0 } + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_weight_threshold: 0.4 } + +--- +"Check sparse_vector token pruning index_options mappings defaults": + + - requires: + cluster_features: 
'sparse_vector_index_options_supported' + reason: "sparse_vector token pruning index options added support in 9.1" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + embeddings: + type: sparse_vector + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.get_mapping: + index: sparse_vector_pruning_test + + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.prune: null } + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_freq_ratio_threshold: null } + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_weight_threshold: null } + +--- +"Check sparse_vector token pruning index_options prune default true allow config": + + - requires: + cluster_features: 'sparse_vector_index_options_supported' + reason: "sparse_vector token pruning index options added support in 9.1" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + pruning_config: + tokens_freq_ratio_threshold: 16.5 + tokens_weight_threshold: 0.2 + embeddings: + type: sparse_vector + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + 
indices.get_mapping: + index: sparse_vector_pruning_test + + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.prune: null } + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_freq_ratio_threshold: 16.5 } + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_weight_threshold: 0.2 } + +--- +"Check sparse_vector token pruning index_options prune false do not allow config": + + - requires: + cluster_features: 'sparse_vector_index_options_supported' + reason: "sparse_vector token pruning index options added support in 9.1" + - skip: + features: headers + + - do: + catch: bad_request + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + prune: false + pruning_config: + tokens_freq_ratio_threshold: 1.0 + tokens_weight_threshold: 0.4 + embeddings: + type: sparse_vector + + - match: { status: 400 } + - match: { error.type: "mapper_parsing_exception" } + - match: { error.reason: "Failed to parse mapping: [index_options] field [pruning_config] should not be set if [prune] is false" } + +--- +"Check sparse_vector token pruning index_options tokens freq out of bounds": + + - requires: + cluster_features: 'sparse_vector_index_options_supported' + reason: "sparse_vector token pruning index options added support in 9.1" + - skip: + features: headers + + - do: + catch: bad_request + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + 
prune: true + pruning_config: + tokens_freq_ratio_threshold: 101.0 + tokens_weight_threshold: 0.4 + embeddings: + type: sparse_vector + + - match: { status: 400 } + - match: { error.type: "mapper_parsing_exception" } + - match: { error.reason: "Failed to parse mapping: [pruning_config] field [tokens_freq_ratio_threshold] field should be a number between 1 and 100" } + +--- +"Check sparse_vector token pruning index_options tokens weight out of bounds": + + - requires: + cluster_features: 'sparse_vector_index_options_supported' + reason: "sparse_vector token pruning index options added support in 9.1" + - skip: + features: headers + + - do: + catch: bad_request + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + prune: true + pruning_config: + tokens_freq_ratio_threshold: 5.0 + tokens_weight_threshold: 3.5 + embeddings: + type: sparse_vector + + - match: { status: 400 } + - match: { error.type: "mapper_parsing_exception" } + - match: { error.reason: "Failed to parse mapping: [pruning_config] field [tokens_weight_threshold] field should be a number between 0.0 and 1.0" } + +--- +"Check sparse_vector token pruning index_options in query": + + - requires: + cluster_features: 'sparse_vector_index_options_supported' + reason: "sparse_vector token pruning index options added support in 9.1" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: test-sparse-vector-with-pruning + body: + mappings: + properties: + content_embedding: + type: sparse_vector + index_options: + prune: true + pruning_config: + tokens_freq_ratio_threshold: 1 + tokens_weight_threshold: 1.0 + + - match: { acknowledged: 
true } + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: test-sparse-vector-without-pruning + body: + mappings: + properties: + content_embedding: + type: sparse_vector + index_options: + prune: false + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + bulk: + index: test-sparse-vector-with-pruning + refresh: true + body: | + {"index": { "_id": "1" }} + {"content_embedding":{"cheese": 2.671405,"is": 0.11809908,"comet": 0.26088917}} + {"index": { "_id": "2" }} + {"content_embedding":{"planet": 2.3438394,"is": 0.54600334,"astronomy": 0.36015007,"moon": 0.20022368}} + {"index": { "_id": "3" }} + {"content_embedding":{"is": 0.6891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + bulk: + index: test-sparse-vector-without-pruning + refresh: true + body: | + {"index": { "_id": "1" }} + {"content_embedding":{"cheese": 2.671405,"is": 0.11809908,"comet": 0.26088917}} + {"index": { "_id": "2" }} + {"content_embedding":{"planet": 2.3438394,"is": 0.54600334,"astronomy": 0.36015007,"moon": 0.20022368}} + {"index": { "_id": "3" }} + {"content_embedding":{"is": 0.6891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + + - do: + search: + index: test-sparse-vector-without-pruning + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + + - match: { hits.total.value: 3 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } + - match: { hits.hits.2._id: "2" } + + - do: + search: + index: 
test-sparse-vector-with-pruning + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } + + diff --git a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml index e3b59bdaf50bd..776a88692b11d 100644 --- a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml +++ b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml @@ -112,6 +112,20 @@ teardown: model_id: "text_expansion_model" ignore: 404 + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Content-Type: application/json + indices.delete: + index: ["sparse_vector_pruning_test", "test-sparse-vector-without-pruning", "test-sparse-vector-with-pruning"] + ignore: 404 + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Content-Type: application/json + indices.refresh: { } + --- "Test sparse_vector search": - do: @@ -183,3 +197,346 @@ teardown: - match: { hits.total.value: 5 } - match: { hits.hits.0._source.source_text: "the octopus comforter smells" } + +--- +"Check sparse_vector token pruning index_options mappings": + + - requires: + cluster_features: 'sparse_vector_index_options_supported' + reason: "sparse_vector token pruning index options added support in 9.1" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Content-Type: application/json + indices.create: + index: 
sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + prune: true + pruning_config: + tokens_freq_ratio_threshold: 1.0 + tokens_weight_threshold: 0.4 + embeddings: + type: sparse_vector + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Content-Type: application/json + indices.get_mapping: + index: sparse_vector_pruning_test + + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.prune: true } + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_freq_ratio_threshold: 1.0 } + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_weight_threshold: 0.4 } + +--- +"Check sparse_vector token pruning index_options mappings defaults": + + - requires: + cluster_features: 'sparse_vector_index_options_supported' + reason: "sparse_vector token pruning index options added support in 9.1" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + embeddings: + type: sparse_vector + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Content-Type: application/json + indices.get_mapping: + index: sparse_vector_pruning_test + + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.prune: null } + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_freq_ratio_threshold: null } + - match: { 
sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_weight_threshold: null } + +--- +"Check sparse_vector token pruning index_options prune default true allow config": + + - requires: + cluster_features: 'sparse_vector_index_options_supported' + reason: "sparse_vector token pruning index options added support in 9.1" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + pruning_config: + tokens_freq_ratio_threshold: 16.5 + tokens_weight_threshold: 0.2 + embeddings: + type: sparse_vector + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Content-Type: application/json + indices.get_mapping: + index: sparse_vector_pruning_test + + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.prune: null } + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_freq_ratio_threshold: 16.5 } + - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_weight_threshold: 0.2 } + +--- +"Check sparse_vector token pruning index_options prune false do not allow config": + + - requires: + cluster_features: 'sparse_vector_index_options_supported' + reason: "sparse_vector token pruning index options added support in 9.1" + - skip: + features: headers + + - do: + catch: bad_request + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + prune: 
false + pruning_config: + tokens_freq_ratio_threshold: 1.0 + tokens_weight_threshold: 0.4 + embeddings: + type: sparse_vector + + - match: { status: 400 } + - match: { error.type: "mapper_parsing_exception" } + - match: { error.reason: "Failed to parse mapping: [index_options] field [pruning_config] should not be set if [prune] is false" } + +--- +"Check sparse_vector token pruning index_options tokens freq out of bounds": + + - requires: + cluster_features: 'sparse_vector_index_options_supported' + reason: "sparse_vector token pruning index options added support in 9.1" + - skip: + features: headers + + - do: + catch: bad_request + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + prune: true + pruning_config: + tokens_freq_ratio_threshold: 101.0 + tokens_weight_threshold: 0.4 + embeddings: + type: sparse_vector + + - match: { status: 400 } + - match: { error.type: "mapper_parsing_exception" } + - match: { error.reason: "Failed to parse mapping: [pruning_config] field [tokens_freq_ratio_threshold] field should be a number between 1 and 100" } + +--- +"Check sparse_vector token pruning index_options tokens weight out of bounds": + + - requires: + cluster_features: 'sparse_vector_index_options_supported' + reason: "sparse_vector token pruning index options added support in 9.1" + - skip: + features: headers + + - do: + catch: bad_request + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Content-Type: application/json + indices.create: + index: sparse_vector_pruning_test + body: + mappings: + properties: + text: + type: text + ml.tokens: + type: sparse_vector + index_options: + prune: true + pruning_config: + tokens_freq_ratio_threshold: 5.0 + tokens_weight_threshold: 3.5 + embeddings: + type: sparse_vector + + - match: { 
status: 400 } + - match: { error.type: "mapper_parsing_exception" } + - match: { error.reason: "Failed to parse mapping: [pruning_config] field [tokens_weight_threshold] field should be a number between 0.0 and 1.0" } + +--- +"Check sparse_vector token pruning index_options in query": + + - requires: + cluster_features: 'sparse_vector_index_options_supported' + reason: "sparse_vector token pruning index options added support in 9.1" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Content-Type: application/json + indices.create: + index: test-sparse-vector-with-pruning + body: + mappings: + properties: + content_embedding: + type: sparse_vector + index_options: + prune: true + pruning_config: + tokens_freq_ratio_threshold: 1 + tokens_weight_threshold: 1.0 + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Content-Type: application/json + indices.create: + index: test-sparse-vector-without-pruning + body: + mappings: + properties: + content_embedding: + type: sparse_vector + index_options: + prune: false + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Content-Type: application/json + bulk: + index: test-sparse-vector-with-pruning + refresh: true + body: | + {"index": { "_id": "1" }} + {"content_embedding":{"cheese": 2.671405,"is": 0.11809908,"comet": 0.26088917}} + {"index": { "_id": "2" }} + {"content_embedding":{"planet": 2.3438394,"is": 0.54600334,"astronomy": 0.36015007,"moon": 0.20022368}} + {"index": { "_id": "3" }} + {"content_embedding":{"is": 0.6891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Content-Type: application/json + bulk: + index: test-sparse-vector-without-pruning + refresh: true + body: | + {"index": { "_id": "1" }} 
+ {"content_embedding":{"cheese": 2.671405,"is": 0.11809908,"comet": 0.26088917}} + {"index": { "_id": "2" }} + {"content_embedding":{"planet": 2.3438394,"is": 0.54600334,"astronomy": 0.36015007,"moon": 0.20022368}} + {"index": { "_id": "3" }} + {"content_embedding":{"is": 0.6891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + + - do: + search: + index: test-sparse-vector-without-pruning + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + + - match: { hits.total.value: 3 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } + - match: { hits.hits.2._id: "2" } + + - do: + search: + index: test-sparse-vector-with-pruning + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } + + From 74b19caee9127713b7ea247daac46992c36d2528 Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Mon, 5 May 2025 15:42:46 -0400 Subject: [PATCH 35/86] fix yaml test --- .../resources/rest-api-spec/test/ml/sparse_vector_search.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml index 086886b4fc131..96d42d56029fe 100644 --- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml +++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml @@ -871,8 +871,7 @@ teardown: underground: 0.053516876 is: 0.54600334 - - match: { hits.total.value: 2 } + - match: { hits.total.value: 1 } - match: { hits.hits.0._id: "1" } - - match: { hits.hits.1._id: "3" } From a341322a16b5c07562ad127cce01fb9d8982e0ee Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Mon, 5 May 2025 17:57:21 -0400 Subject: [PATCH 36/86] finally fix yaml tests? --- .../test/ml/sparse_vector_search.yml | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml index 96d42d56029fe..1e0b95cf9e588 100644 --- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml +++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml @@ -821,6 +821,13 @@ teardown: {"index": { "_id": "3" }} {"content_embedding":{"is": 0.6891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. 
the test setup superuser + Content-Type: application/json + indices.refresh: + index: test-sparse-vector-with-pruning + - do: headers: Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser @@ -836,6 +843,13 @@ teardown: {"index": { "_id": "3" }} {"content_embedding":{"is": 0.6891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + - do: + headers: + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser + Content-Type: application/json + indices.refresh: + index: test-sparse-vector-without-pruning + - do: search: index: test-sparse-vector-without-pruning @@ -871,7 +885,8 @@ teardown: underground: 0.053516876 is: 0.54600334 - - match: { hits.total.value: 1 } + - match: { hits.total.value: 2 } - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } From a47b9151d4fd271051348691268207c97c8ab1bd Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Mon, 5 May 2025 18:01:55 -0400 Subject: [PATCH 37/86] update docs --- .../reference/elasticsearch/mapping-reference/sparse-vector.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/reference/elasticsearch/mapping-reference/sparse-vector.md b/docs/reference/elasticsearch/mapping-reference/sparse-vector.md index 0f3d691e490ba..aa5ce5a463df8 100644 --- a/docs/reference/elasticsearch/mapping-reference/sparse-vector.md +++ b/docs/reference/elasticsearch/mapping-reference/sparse-vector.md @@ -24,7 +24,8 @@ PUT my-index } ``` -With optional `index_options` for pruning: +Also, with optional `index_options` for pruning: + ```console PUT my-index { From 4e681bdba479198c565399981844dba397cc70c6 Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Mon, 5 May 2025 18:56:24 -0400 Subject: [PATCH 38/86] add 8.x tx version; fix yaml tests; optimizations --- .../org/elasticsearch/TransportVersions.java | 1 + .../vectors/SparseVectorFieldMapper.java | 7 +- .../mapper/vectors/TokenPruningConfig.java | 37 ++---- .../vectors/SparseVectorFieldMapperTests.java | 44 ++++++- .../ml/search/SparseVectorQueryBuilder.java | 12 +- .../test/multi_cluster/50_sparse_vector.yml | 93 +++++++++----- .../test/remote_cluster/50_sparse_vector.yml | 117 +++++++++++------- .../test/ml/sparse_vector_search.yml | 112 +++++++++-------- 8 files changed, 258 insertions(+), 165 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/TransportVersions.java b/server/src/main/java/org/elasticsearch/TransportVersions.java index 6b6c780aaf922..c070d93597b94 100644 --- a/server/src/main/java/org/elasticsearch/TransportVersions.java +++ b/server/src/main/java/org/elasticsearch/TransportVersions.java @@ -168,6 +168,7 @@ static TransportVersion def(int id) { public static final TransportVersion ESQL_AGGREGATE_METRIC_DOUBLE_BLOCK_8_19 = def(8_841_0_24); public static final TransportVersion INTRODUCE_FAILURES_LIFECYCLE_BACKPORT_8_19 = def(8_841_0_25); public static final TransportVersion INTRODUCE_FAILURES_DEFAULT_RETENTION_BACKPORT_8_19 = def(8_841_0_26); + public static final TransportVersion SPARSE_VECTOR_FIELD_PRUNING_OPTIONS_8_19 = def(8_841_0_27); public static final TransportVersion V_9_0_0 = def(9_000_0_09); public static final TransportVersion INITIAL_ELASTICSEARCH_9_0_1 = def(9_000_0_10); public static final TransportVersion COHERE_BIT_EMBEDDING_TYPE_SUPPORT_ADDED = def(9_001_0_00); diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index 95a215cc622a2..f65da08a601b3 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ 
b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -515,11 +515,16 @@ public static TokenPruningConfig parseIndexOptionsPruningConfig(Boolean prune, M if (pruningConfiguration == null) { return null; } + + if (prune == null) { + throw new MapperParsingException("[index_options] field [pruning_config] should only be set if [prune] is set to true"); + } + if ((pruningConfiguration instanceof Map) == false) { throw new MapperParsingException("[index_options] field [pruning_config] should be a map"); } - if (prune != null && prune == false) { + if (prune == false) { throw new MapperParsingException("[index_options] field [pruning_config] should not be set if [prune] is false"); } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java index c0441af87a447..687dacdd72cb1 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java @@ -204,7 +204,7 @@ public static TokenPruningConfig parseFromMap(Map pruningConfigM return null; } - private static Float parseFloatNumberFromObject(Object numberObject) { + private static Float parseFloatNumberFromObject(Object numberObject, String fieldName, String exceptionDetails) { if (numberObject instanceof Integer intValue) { return (float) intValue; } else if (numberObject instanceof Float floatValue) { @@ -212,7 +212,8 @@ private static Float parseFloatNumberFromObject(Object numberObject) { } else if (numberObject instanceof Double doubleValue) { return ((Double) numberObject).floatValue(); } - return null; + + throw new MapperParsingException("[" + PRUNING_CONFIG_FIELD + "] field [" + fieldName + "]" + exceptionDetails); } private static Float parseTokensWeightThreshold(Object mappedTokensWeightThreshold) { @@ -220,17 +221,11 @@ private 
static Float parseTokensWeightThreshold(Object mappedTokensWeightThresho return DEFAULT_TOKENS_WEIGHT_THRESHOLD; } - Float tokensWeightThreshold = parseFloatNumberFromObject(mappedTokensWeightThreshold); - - if (tokensWeightThreshold == null) { - throw new MapperParsingException( - "[" - + PRUNING_CONFIG_FIELD - + "] field [" - + TOKENS_WEIGHT_THRESHOLD.getPreferredName() - + "] field should be a number between 0.0 and 1.0" - ); - } + Float tokensWeightThreshold = parseFloatNumberFromObject( + mappedTokensWeightThreshold, + TOKENS_WEIGHT_THRESHOLD.getPreferredName(), + "field should be a number between 0.0 and 1.0" + ); if (tokensWeightThreshold < MIN_TOKENS_WEIGHT_THRESHOLD || tokensWeightThreshold > MAX_TOKENS_WEIGHT_THRESHOLD) { throw new MapperParsingException( @@ -249,17 +244,11 @@ private static Float parseTokensFreqRatioThreshold(Object mappedTokensFreqRatioT return DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD; } - Float tokensFreqRatioThreshold = parseFloatNumberFromObject(mappedTokensFreqRatioThreshold); - - if (tokensFreqRatioThreshold == null) { - throw new MapperParsingException( - "[" - + PRUNING_CONFIG_FIELD - + "] field [" - + TOKENS_FREQ_RATIO_THRESHOLD.getPreferredName() - + "] field should be a number between 1 and 100" - ); - } + Float tokensFreqRatioThreshold = parseFloatNumberFromObject( + mappedTokensFreqRatioThreshold, + TOKENS_FREQ_RATIO_THRESHOLD.getPreferredName(), + "field should be a number between 1 and 100" + ); if (tokensFreqRatioThreshold < MIN_TOKENS_FREQ_RATIO_THRESHOLD || tokensFreqRatioThreshold > MAX_TOKENS_FREQ_RATIO_THRESHOLD) { throw new MapperParsingException( diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index e52f12f1489ed..92fe7121c477f 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ 
b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -88,6 +88,7 @@ protected void mappingWithIndexOptionsPrune(XContentBuilder b) throws IOExceptio protected void mappingWithIndexOptionsPruningConfig(XContentBuilder b) throws IOException { b.field("type", "sparse_vector"); b.startObject("index_options"); + b.field("prune", true); b.startObject("pruning_config"); b.field("tokens_freq_ratio_threshold", 5.0); b.field("tokens_weight_threshold", 0.4); @@ -296,16 +297,50 @@ public void testPruningConfigurationIsMap() { Exception e = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { b.field("type", "sparse_vector"); b.startObject("index_options"); + b.field("prune", true); b.field("pruning_config", "this_is_not_a_map"); b.endObject(); }))); assertThat(e.getMessage(), containsString("index_options] field [pruning_config] should be a map")); } + public void testWithIndexOptionsPruningConfigPruneRequired() throws Exception { + + Exception eTestPruneIsFalse = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + b.field("prune", false); + b.startObject("pruning_config"); + b.field("tokens_freq_ratio_threshold", 5.0); + b.field("tokens_weight_threshold", 0.4); + b.endObject(); + b.endObject(); + }))); + assertThat( + eTestPruneIsFalse.getMessage(), + containsString("Failed to parse mapping: [index_options] field [pruning_config] should not be set if [prune] is false") + ); + + Exception eTestPruneIsMissing = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { + b.field("type", "sparse_vector"); + b.startObject("index_options"); + b.startObject("pruning_config"); + b.field("tokens_freq_ratio_threshold", 5.0); + b.field("tokens_weight_threshold", 0.4); + b.endObject(); + b.endObject(); + }))); + assertThat( + eTestPruneIsMissing.getMessage(), + 
containsString("[index_options] field [pruning_config] should only be set if [prune] is set to true") + ); + } + public void testTokensFreqRatioCorrect() { Exception eTestInteger = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { b.field("type", "sparse_vector"); b.startObject("index_options"); + b.field("prune", true); b.startObject("pruning_config"); b.field("tokens_freq_ratio_threshold", "notaninteger"); b.endObject(); @@ -313,12 +348,13 @@ public void testTokensFreqRatioCorrect() { }))); assertThat( eTestInteger.getMessage(), - containsString("[pruning_config] field [tokens_freq_ratio_threshold] field should be a number between 1 and 100") + containsString("Failed to parse mapping: [pruning_config] field [tokens_freq_ratio_threshold]field should be a number between 1 and 100") ); Exception eTestRangeLower = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { b.field("type", "sparse_vector"); b.startObject("index_options"); + b.field("prune", true); b.startObject("pruning_config"); b.field("tokens_freq_ratio_threshold", -2); b.endObject(); @@ -332,6 +368,7 @@ public void testTokensFreqRatioCorrect() { Exception eTestRangeHigher = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { b.field("type", "sparse_vector"); b.startObject("index_options"); + b.field("prune", true); b.startObject("pruning_config"); b.field("tokens_freq_ratio_threshold", 101); b.endObject(); @@ -347,6 +384,7 @@ public void testTokensWeightThresholdCorrect() { Exception eTestDouble = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { b.field("type", "sparse_vector"); b.startObject("index_options"); + b.field("prune", true); b.startObject("pruning_config"); b.field("tokens_weight_threshold", "notadouble"); b.endObject(); @@ -354,12 +392,13 @@ public void testTokensWeightThresholdCorrect() { }))); assertThat( eTestDouble.getMessage(), - 
containsString("[pruning_config] field [tokens_weight_threshold] field should be a number between 0.0 and 1.0") + containsString("Failed to parse mapping: [pruning_config] field [tokens_weight_threshold]field should be a number between 0.0 and 1.0") ); Exception eTestRangeLower = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { b.field("type", "sparse_vector"); b.startObject("index_options"); + b.field("prune", true); b.startObject("pruning_config"); b.field("tokens_weight_threshold", -0.1); b.endObject(); @@ -373,6 +412,7 @@ public void testTokensWeightThresholdCorrect() { Exception eTestRangeHigher = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { b.field("type", "sparse_vector"); b.startObject("index_options"); + b.field("prune", true); b.startObject("pruning_config"); b.field("tokens_weight_threshold", 1.1); b.endObject(); diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java index 9500139fe18ec..80cec284189ff 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java @@ -235,6 +235,9 @@ protected Query doToQuery(SearchExecutionContext context) throws IOException { ); } + Boolean pruneTokensToUse = shouldPruneTokens; + TokenPruningConfig pruningConfigToUse = tokenPruningConfig; + // if the query options for pruning are not set, // we need to check the index options for this field // and use those if set - however, only if the index @@ -247,13 +250,12 @@ protected Query doToQuery(SearchExecutionContext context) throws IOException { sparseVectorFieldMapper ); - return pruningOptions.pruneTokens - ? 
WeightedTokensUtils.queryBuilderWithPrunedTokens(fieldName, pruningOptions.pruningConfig, queryVectors, ft, context) - : WeightedTokensUtils.queryBuilderWithAllTokens(fieldName, queryVectors, ft, context); + pruneTokensToUse = pruningOptions.pruneTokens; + pruningConfigToUse = pruningOptions.pruningConfig; } - return (shouldPruneTokens != null && shouldPruneTokens) - ? WeightedTokensUtils.queryBuilderWithPrunedTokens(fieldName, tokenPruningConfig, queryVectors, ft, context) + return (pruneTokensToUse != null && pruneTokensToUse) + ? WeightedTokensUtils.queryBuilderWithPrunedTokens(fieldName, pruningConfigToUse, queryVectors, ft, context) : WeightedTokensUtils.queryBuilderWithAllTokens(fieldName, queryVectors, ft, context); } diff --git a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml index 6f5e97e23441a..3fd7406bec573 100644 --- a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml +++ b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml @@ -204,7 +204,7 @@ teardown: - requires: cluster_features: 'sparse_vector_index_options_supported' - reason: "sparse_vector token pruning index options added support in 9.1" + reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers @@ -247,7 +247,7 @@ teardown: - requires: cluster_features: 'sparse_vector_index_options_supported' - reason: "sparse_vector token pruning index options added support in 9.1" + reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers @@ -276,20 +276,20 @@ teardown: indices.get_mapping: index: sparse_vector_pruning_test - - match: { 
sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.prune: null } - - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_freq_ratio_threshold: null } - - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_weight_threshold: null } + - not_exists: sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options + - not_exists: sparse_vector_pruning_test.mappings.properties.embeddings.index_options --- -"Check sparse_vector token pruning index_options prune default true allow config": +"Check sparse_vector token pruning index_options prune missing do not allow config": - requires: cluster_features: 'sparse_vector_index_options_supported' - reason: "sparse_vector token pruning index options added support in 9.1" + reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers - do: + catch: /\[index_options\] field \[pruning_config\] should only be set if \[prune\] is set to true/ headers: Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials Content-Type: application/json @@ -304,35 +304,24 @@ teardown: type: sparse_vector index_options: pruning_config: - tokens_freq_ratio_threshold: 16.5 - tokens_weight_threshold: 0.2 + tokens_freq_ratio_threshold: 1.0 + tokens_weight_threshold: 0.4 embeddings: type: sparse_vector - - match: { acknowledged: true } - - - do: - headers: - Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials - Content-Type: application/json - indices.get_mapping: - index: sparse_vector_pruning_test - - - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.prune: null } - - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_freq_ratio_threshold: 16.5 } - - match: { 
sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_weight_threshold: 0.2 } + - match: { status: 400 } --- "Check sparse_vector token pruning index_options prune false do not allow config": - requires: cluster_features: 'sparse_vector_index_options_supported' - reason: "sparse_vector token pruning index options added support in 9.1" + reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers - do: - catch: bad_request + catch: /\[index_options\] field \[pruning_config\] should not be set if \[prune\] is false/ headers: Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials Content-Type: application/json @@ -354,20 +343,18 @@ teardown: type: sparse_vector - match: { status: 400 } - - match: { error.type: "mapper_parsing_exception" } - - match: { error.reason: "Failed to parse mapping: [index_options] field [pruning_config] should not be set if [prune] is false" } --- "Check sparse_vector token pruning index_options tokens freq out of bounds": - requires: cluster_features: 'sparse_vector_index_options_supported' - reason: "sparse_vector token pruning index options added support in 9.1" + reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers - do: - catch: bad_request + catch: /\[pruning_config\] field \[tokens_freq_ratio_threshold\] field should be a number between 1 and 100/ headers: Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials Content-Type: application/json @@ -389,20 +376,18 @@ teardown: type: sparse_vector - match: { status: 400 } - - match: { error.type: "mapper_parsing_exception" } - - match: { error.reason: "Failed to parse mapping: [pruning_config] field [tokens_freq_ratio_threshold] field should be a number between 1 and 100" } --- "Check sparse_vector token pruning index_options tokens weight out of bounds": - requires: cluster_features: 
'sparse_vector_index_options_supported' - reason: "sparse_vector token pruning index options added support in 9.1" + reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers - do: - catch: bad_request + catch: /\[pruning_config\] field \[tokens_weight_threshold\] field should be a number between 0.0 and 1.0/ headers: Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials Content-Type: application/json @@ -424,15 +409,13 @@ teardown: type: sparse_vector - match: { status: 400 } - - match: { error.type: "mapper_parsing_exception" } - - match: { error.reason: "Failed to parse mapping: [pruning_config] field [tokens_weight_threshold] field should be a number between 0.0 and 1.0" } --- "Check sparse_vector token pruning index_options in query": - requires: cluster_features: 'sparse_vector_index_options_supported' - reason: "sparse_vector token pruning index options added support in 9.1" + reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers @@ -540,4 +523,46 @@ teardown: - match: { hits.hits.0._id: "1" } - match: { hits.hits.1._id: "3" } + - do: + search: + index: test-sparse-vector-without-pruning + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + prune: true + pruning_config: + tokens_freq_ratio_threshold: 1 + tokens_weight_threshold: 1.0 + + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } + + - do: + search: + index: test-sparse-vector-with-pruning + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + prune: false + - match: { hits.total.value: 3 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } + - 
match: { hits.hits.2._id: "2" } diff --git a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml index 776a88692b11d..1d5c62d30a134 100644 --- a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml +++ b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml @@ -203,13 +203,13 @@ teardown: - requires: cluster_features: 'sparse_vector_index_options_supported' - reason: "sparse_vector token pruning index options added support in 9.1" + reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers - do: headers: - Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials Content-Type: application/json indices.create: index: sparse_vector_pruning_test @@ -232,7 +232,7 @@ teardown: - do: headers: - Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials Content-Type: application/json indices.get_mapping: index: sparse_vector_pruning_test @@ -246,13 +246,13 @@ teardown: - requires: cluster_features: 'sparse_vector_index_options_supported' - reason: "sparse_vector token pruning index options added support in 9.1" + reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers - do: headers: - Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials Content-Type: application/json indices.create: index: sparse_vector_pruning_test @@ -270,27 +270,27 @@ teardown: - do: 
headers: - Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials Content-Type: application/json indices.get_mapping: index: sparse_vector_pruning_test - - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.prune: null } - - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_freq_ratio_threshold: null } - - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_weight_threshold: null } + - not_exists: sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options + - not_exists: sparse_vector_pruning_test.mappings.properties.embeddings.index_options --- -"Check sparse_vector token pruning index_options prune default true allow config": +"Check sparse_vector token pruning index_options prune missing do not allow config": - requires: cluster_features: 'sparse_vector_index_options_supported' - reason: "sparse_vector token pruning index options added support in 9.1" + reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers - do: + catch: /\[index_options\] field \[pruning_config\] should only be set if \[prune\] is set to true/ headers: - Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials Content-Type: application/json indices.create: index: sparse_vector_pruning_test @@ -303,37 +303,26 @@ teardown: type: sparse_vector index_options: pruning_config: - tokens_freq_ratio_threshold: 16.5 - tokens_weight_threshold: 0.2 + tokens_freq_ratio_threshold: 1.0 + tokens_weight_threshold: 0.4 embeddings: type: sparse_vector - - match: { acknowledged: true } - - - do: - headers: - Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" - Content-Type: 
application/json - indices.get_mapping: - index: sparse_vector_pruning_test - - - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.prune: null } - - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_freq_ratio_threshold: 16.5 } - - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_weight_threshold: 0.2 } + - match: { status: 400 } --- "Check sparse_vector token pruning index_options prune false do not allow config": - requires: cluster_features: 'sparse_vector_index_options_supported' - reason: "sparse_vector token pruning index options added support in 9.1" + reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers - do: - catch: bad_request + catch: /\[index_options\] field \[pruning_config\] should not be set if \[prune\] is false/ headers: - Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials Content-Type: application/json indices.create: index: sparse_vector_pruning_test @@ -353,22 +342,20 @@ teardown: type: sparse_vector - match: { status: 400 } - - match: { error.type: "mapper_parsing_exception" } - - match: { error.reason: "Failed to parse mapping: [index_options] field [pruning_config] should not be set if [prune] is false" } --- "Check sparse_vector token pruning index_options tokens freq out of bounds": - requires: cluster_features: 'sparse_vector_index_options_supported' - reason: "sparse_vector token pruning index options added support in 9.1" + reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers - do: - catch: bad_request + catch: /\[pruning_config\] field \[tokens_freq_ratio_threshold\] field should be a number between 1 and 100/ headers: - Authorization: "Basic 
dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials Content-Type: application/json indices.create: index: sparse_vector_pruning_test @@ -388,22 +375,20 @@ teardown: type: sparse_vector - match: { status: 400 } - - match: { error.type: "mapper_parsing_exception" } - - match: { error.reason: "Failed to parse mapping: [pruning_config] field [tokens_freq_ratio_threshold] field should be a number between 1 and 100" } --- "Check sparse_vector token pruning index_options tokens weight out of bounds": - requires: cluster_features: 'sparse_vector_index_options_supported' - reason: "sparse_vector token pruning index options added support in 9.1" + reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers - do: - catch: bad_request + catch: /\[pruning_config\] field \[tokens_weight_threshold\] field should be a number between 0.0 and 1.0/ headers: - Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials Content-Type: application/json indices.create: index: sparse_vector_pruning_test @@ -423,21 +408,19 @@ teardown: type: sparse_vector - match: { status: 400 } - - match: { error.type: "mapper_parsing_exception" } - - match: { error.reason: "Failed to parse mapping: [pruning_config] field [tokens_weight_threshold] field should be a number between 0.0 and 1.0" } --- "Check sparse_vector token pruning index_options in query": - requires: cluster_features: 'sparse_vector_index_options_supported' - reason: "sparse_vector token pruning index options added support in 9.1" + reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers - do: headers: - Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials Content-Type: application/json 
indices.create: index: test-sparse-vector-with-pruning @@ -456,7 +439,7 @@ teardown: - do: headers: - Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials Content-Type: application/json indices.create: index: test-sparse-vector-without-pruning @@ -472,7 +455,7 @@ teardown: - do: headers: - Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials Content-Type: application/json bulk: index: test-sparse-vector-with-pruning @@ -487,7 +470,7 @@ teardown: - do: headers: - Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials Content-Type: application/json bulk: index: test-sparse-vector-without-pruning @@ -539,4 +522,46 @@ teardown: - match: { hits.hits.0._id: "1" } - match: { hits.hits.1._id: "3" } + - do: + search: + index: test-sparse-vector-without-pruning + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + prune: true + pruning_config: + tokens_freq_ratio_threshold: 1 + tokens_weight_threshold: 1.0 + + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } + + - do: + search: + index: test-sparse-vector-with-pruning + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + prune: false + - match: { hits.total.value: 3 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } + - match: { hits.hits.2._id: "2" } diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml 
b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml index 1e0b95cf9e588..e6b09c5ee4e20 100644 --- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml +++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml @@ -91,14 +91,9 @@ setup: --- teardown: - - requires: - cluster_features: 'sparse_vector_index_options_supported' - reason: "sparse_vector token pruning index options added support in 9.1" - skip: features: headers - # ensure indices are cleaned up after each test - # mainly for the sparse vector token pruning tests - do: headers: Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser @@ -539,7 +534,7 @@ teardown: - requires: cluster_features: 'sparse_vector_index_options_supported' - reason: "sparse_vector token pruning index options added support in 9.1" + reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers @@ -582,7 +577,7 @@ teardown: - requires: cluster_features: 'sparse_vector_index_options_supported' - reason: "sparse_vector token pruning index options added support in 9.1" + reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers @@ -611,20 +606,20 @@ teardown: indices.get_mapping: index: sparse_vector_pruning_test - - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.prune: null } - - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_freq_ratio_threshold: null } - - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_weight_threshold: null } + - not_exists: sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options + - not_exists: 
sparse_vector_pruning_test.mappings.properties.embeddings.index_options --- -"Check sparse_vector token pruning index_options prune default true allow config": +"Check sparse_vector token pruning index_options prune missing do not allow config": - requires: cluster_features: 'sparse_vector_index_options_supported' - reason: "sparse_vector token pruning index options added support in 9.1" + reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers - do: + catch: /\[index_options\] field \[pruning_config\] should only be set if \[prune\] is set to true/ headers: Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser Content-Type: application/json @@ -639,35 +634,24 @@ teardown: type: sparse_vector index_options: pruning_config: - tokens_freq_ratio_threshold: 16.5 - tokens_weight_threshold: 0.2 + tokens_freq_ratio_threshold: 1.0 + tokens_weight_threshold: 0.4 embeddings: type: sparse_vector - - match: { acknowledged: true } - - - do: - headers: - Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. 
the test setup superuser - Content-Type: application/json - indices.get_mapping: - index: sparse_vector_pruning_test - - - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.prune: null } - - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_freq_ratio_threshold: 16.5 } - - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_weight_threshold: 0.2 } + - match: { status: 400 } --- "Check sparse_vector token pruning index_options prune false do not allow config": - requires: cluster_features: 'sparse_vector_index_options_supported' - reason: "sparse_vector token pruning index options added support in 9.1" + reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers - do: - catch: bad_request + catch: /\[index_options\] field \[pruning_config\] should not be set if \[prune\] is false/ headers: Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. 
the test setup superuser Content-Type: application/json @@ -689,20 +673,18 @@ teardown: type: sparse_vector - match: { status: 400 } - - match: { error.type: "mapper_parsing_exception" } - - match: { error.reason: "Failed to parse mapping: [index_options] field [pruning_config] should not be set if [prune] is false" } --- "Check sparse_vector token pruning index_options tokens freq out of bounds": - requires: cluster_features: 'sparse_vector_index_options_supported' - reason: "sparse_vector token pruning index options added support in 9.1" + reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers - do: - catch: bad_request + catch: /\[pruning_config\] field \[tokens_freq_ratio_threshold\] field should be a number between 1 and 100/ headers: Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser Content-Type: application/json @@ -724,20 +706,18 @@ teardown: type: sparse_vector - match: { status: 400 } - - match: { error.type: "mapper_parsing_exception" } - - match: { error.reason: "Failed to parse mapping: [pruning_config] field [tokens_freq_ratio_threshold] field should be a number between 1 and 100" } --- "Check sparse_vector token pruning index_options tokens weight out of bounds": - requires: cluster_features: 'sparse_vector_index_options_supported' - reason: "sparse_vector token pruning index options added support in 9.1" + reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers - do: - catch: bad_request + catch: /\[pruning_config\] field \[tokens_weight_threshold\] field should be a number between 0.0 and 1.0/ headers: Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. 
the test setup superuser Content-Type: application/json @@ -759,15 +739,13 @@ teardown: type: sparse_vector - match: { status: 400 } - - match: { error.type: "mapper_parsing_exception" } - - match: { error.reason: "Failed to parse mapping: [pruning_config] field [tokens_weight_threshold] field should be a number between 0.0 and 1.0" } --- "Check sparse_vector token pruning index_options in query": - requires: cluster_features: 'sparse_vector_index_options_supported' - reason: "sparse_vector token pruning index options added support in 9.1" + reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers @@ -821,13 +799,6 @@ teardown: {"index": { "_id": "3" }} {"content_embedding":{"is": 0.6891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} - - do: - headers: - Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser - Content-Type: application/json - indices.refresh: - index: test-sparse-vector-with-pruning - - do: headers: Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser @@ -843,13 +814,6 @@ teardown: {"index": { "_id": "3" }} {"content_embedding":{"is": 0.6891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} - - do: - headers: - Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. 
the test setup superuser - Content-Type: application/json - indices.refresh: - index: test-sparse-vector-without-pruning - - do: search: index: test-sparse-vector-without-pruning @@ -889,4 +853,46 @@ teardown: - match: { hits.hits.0._id: "1" } - match: { hits.hits.1._id: "3" } + - do: + search: + index: test-sparse-vector-without-pruning + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + prune: true + pruning_config: + tokens_freq_ratio_threshold: 1 + tokens_weight_threshold: 1.0 + + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } + + - do: + search: + index: test-sparse-vector-with-pruning + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + prune: false + - match: { hits.total.value: 3 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } + - match: { hits.hits.2._id: "2" } From 6e50539aecc05b5c9f6a87dab71259de49a18f1e Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Mon, 5 May 2025 23:03:48 +0000 Subject: [PATCH 39/86] [CI] Auto commit changes from spotless --- .../mapper/vectors/SparseVectorFieldMapperTests.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index 92fe7121c477f..296654faba0ea 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -348,7 +348,9 @@ public void testTokensFreqRatioCorrect() { }))); assertThat( 
eTestInteger.getMessage(), - containsString("Failed to parse mapping: [pruning_config] field [tokens_freq_ratio_threshold]field should be a number between 1 and 100") + containsString( + "Failed to parse mapping: [pruning_config] field [tokens_freq_ratio_threshold]field should be a number between 1 and 100" + ) ); Exception eTestRangeLower = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { @@ -392,7 +394,9 @@ public void testTokensWeightThresholdCorrect() { }))); assertThat( eTestDouble.getMessage(), - containsString("Failed to parse mapping: [pruning_config] field [tokens_weight_threshold]field should be a number between 0.0 and 1.0") + containsString( + "Failed to parse mapping: [pruning_config] field [tokens_weight_threshold]field should be a number between 0.0 and 1.0" + ) ); Exception eTestRangeLower = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { From 095bb28b9c1aa5b36c793ef10230b219977bd339 Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Tue, 6 May 2025 10:29:02 -0400 Subject: [PATCH 40/86] explicitly set # of shards for indices for test --- .../resources/rest-api-spec/test/ml/sparse_vector_search.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml index e6b09c5ee4e20..99ac948e77a7b 100644 --- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml +++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml @@ -765,6 +765,8 @@ teardown: pruning_config: tokens_freq_ratio_threshold: 1 tokens_weight_threshold: 1.0 + settings: + number_of_shards: 1 - match: { acknowledged: true } @@ -781,6 +783,8 @@ teardown: type: sparse_vector index_options: prune: false + settings: + number_of_shards: 1 - match: { acknowledged: true } From 5bb6561bca2d2a0022b72745ed7aa7a613a3ff94 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Tue, 6 May 2025 11:12:03 -0400 Subject: [PATCH 41/86] fix docs; add backport 8.x index version --- docs/reference/elasticsearch/mapping-reference/sparse-vector.md | 2 +- server/src/main/java/org/elasticsearch/index/IndexVersions.java | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/reference/elasticsearch/mapping-reference/sparse-vector.md b/docs/reference/elasticsearch/mapping-reference/sparse-vector.md index aa5ce5a463df8..e831a27fac868 100644 --- a/docs/reference/elasticsearch/mapping-reference/sparse-vector.md +++ b/docs/reference/elasticsearch/mapping-reference/sparse-vector.md @@ -59,7 +59,7 @@ The following parameters are accepted by `sparse_vector` fields: * Use [synthetic `_source`](/reference/elasticsearch/mapping-reference/mapping-source-field.md#synthetic-source). 
index_options -: (Optional, object) You can set index options for your `sparse_vector` field to determine if you should prune tokens, and the parameter configurations for the token pruning. If the pruning options are not set for your `sparse_query` vector on the field, Elasticsearch will use the defaults if set here for the field. The available options for the index options are: +: (Optional, object) You can set index options for your `sparse_vector` field to determine if you should prune tokens, and the parameter configurations for the token pruning. If pruning options are not set in your `sparse_vector` query, Elasticsearch will use the default options configured for the field, if any. The available options for the index options are: Parameters for `index_options` are: diff --git a/server/src/main/java/org/elasticsearch/index/IndexVersions.java b/server/src/main/java/org/elasticsearch/index/IndexVersions.java index 35ef3e23c887c..ae9232fb7e110 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexVersions.java +++ b/server/src/main/java/org/elasticsearch/index/IndexVersions.java @@ -140,6 +140,7 @@ private static Version parseUnchecked(String version) { public static final IndexVersion ADD_RESCORE_PARAMS_TO_QUANTIZED_VECTORS_BACKPORT_8_X = def(8_528_0_00, Version.LUCENE_9_12_1); public static final IndexVersion RESCORE_PARAMS_ALLOW_ZERO_TO_QUANTIZED_VECTORS_BACKPORT_8_X = def(8_529_0_00, Version.LUCENE_9_12_1); public static final IndexVersion DEFAULT_OVERSAMPLE_VALUE_FOR_BBQ_BACKPORT_8_X = def(8_530_0_00, Version.LUCENE_9_12_1); + public static final IndexVersion SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT_BACKPORT_8_X = def(8_531_0_00, Version.LUCENE_9_12_1); public static final IndexVersion UPGRADE_TO_LUCENE_10_0_0 = def(9_000_0_00, Version.LUCENE_10_0_0); public static final IndexVersion LOGSDB_DEFAULT_IGNORE_DYNAMIC_BEYOND_LIMIT = def(9_001_0_00, Version.LUCENE_10_0_0); public static final IndexVersion TIME_BASED_K_ORDERED_DOC_ID = def(9_002_0_00,
Version.LUCENE_10_0_0); From 0b1c1d29a71b18f9832066991125308fe5243df8 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Tue, 6 May 2025 19:45:37 -0400 Subject: [PATCH 42/86] cleanups/optimizations --- .../vectors/SparseVectorFieldMapper.java | 37 +++---------------- .../vectors/SparseVectorFieldMapperTests.java | 4 +- .../test/multi_cluster/50_sparse_vector.yml | 2 +- .../test/remote_cluster/50_sparse_vector.yml | 2 +- .../test/ml/sparse_vector_search.yml | 2 +- 5 files changed, 11 insertions(+), 36 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index f65da08a601b3..9bf04f8bece34 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -130,23 +130,9 @@ public IndexOptions getIndexOptions() { return this.indexOptions; } - private static SparseVectorFieldMapper.IndexOptions getDefaultIndexOptions(MappingParserContext context) { - if (context.indexVersionCreated().before(SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION)) { - // don't set defaults if this index was created before - // we added this functionality in, so it will - // not change current index behaviour - return new IndexOptions(false, null); - } - - // index options are not set - for new indices, - // if this is null, in the query will use the - // proper defaults - return null; - } - private static SparseVectorFieldMapper.IndexOptions parseIndexOptions(MappingParserContext context, Object propNode) { if (propNode == null) { - return getDefaultIndexOptions(context); + return null; } Map indexOptionsMap = XContentMapValues.nodeMapValue(propNode, SPARSE_VECTOR_INDEX_OPTIONS); @@ -155,7 +141,7 @@ private static SparseVectorFieldMapper.IndexOptions parseIndexOptions(MappingPar TokenPruningConfig pruningConfig = 
IndexOptions.parseIndexOptionsPruningConfig(prune, indexOptionsMap); if (prune == null && pruningConfig == null) { - return getDefaultIndexOptions(context); + return null; } return new SparseVectorFieldMapper.IndexOptions(prune, pruningConfig); @@ -175,8 +161,7 @@ public static final class SparseVectorFieldType extends MappedFieldType { private final IndexOptions indexOptions; public SparseVectorFieldType(String name, boolean isStored, Map meta) { - super(name, true, isStored, false, TextSearchInfo.SIMPLE_MATCH_ONLY, meta); - this.indexOptions = null; + this(name, isStored, meta, null); } public SparseVectorFieldType( @@ -482,17 +467,15 @@ public final int hashCode() { @Override public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { - if (prune == null && pruningConfig == null) { - return builder; - } - builder.startObject(); + if (prune != null) { builder.field(PRUNE_FIELD_NAME, prune); } if (pruningConfig != null) { builder.field(PRUNING_CONFIG_FIELD_NAME, pruningConfig); } + builder.endObject(); return builder; } @@ -516,18 +499,10 @@ public static TokenPruningConfig parseIndexOptionsPruningConfig(Boolean prune, M return null; } - if (prune == null) { + if (prune == null || prune == false) { throw new MapperParsingException("[index_options] field [pruning_config] should only be set if [prune] is set to true"); } - if ((pruningConfiguration instanceof Map) == false) { - throw new MapperParsingException("[index_options] field [pruning_config] should be a map"); - } - - if (prune == false) { - throw new MapperParsingException("[index_options] field [pruning_config] should not be set if [prune] is false"); - } - Map pruningConfigurationMap = XContentMapValues.nodeMapValue(pruningConfiguration, PRUNING_CONFIG_FIELD_NAME); return TokenPruningConfig.parseFromMap(pruningConfigurationMap); diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java 
b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index 296654faba0ea..2302017762cff 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -301,7 +301,7 @@ public void testPruningConfigurationIsMap() { b.field("pruning_config", "this_is_not_a_map"); b.endObject(); }))); - assertThat(e.getMessage(), containsString("index_options] field [pruning_config] should be a map")); + assertThat(e.getMessage(), containsString("pruning_config should be a map but was of type:")); } public void testWithIndexOptionsPruningConfigPruneRequired() throws Exception { @@ -318,7 +318,7 @@ public void testWithIndexOptionsPruningConfigPruneRequired() throws Exception { }))); assertThat( eTestPruneIsFalse.getMessage(), - containsString("Failed to parse mapping: [index_options] field [pruning_config] should not be set if [prune] is false") + containsString("[index_options] field [pruning_config] should only be set if [prune] is set to true") ); Exception eTestPruneIsMissing = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { diff --git a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml index 3fd7406bec573..6c2f75c149ddf 100644 --- a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml +++ b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml @@ -321,7 +321,7 @@ teardown: features: headers - do: - catch: /\[index_options\] field \[pruning_config\] should not be set if \[prune\] is false/ + catch: 
/\[index_options\] field \[pruning_config\] should only be set if \[prune\] is set to true/ headers: Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials Content-Type: application/json diff --git a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml index 1d5c62d30a134..a7260461f7a01 100644 --- a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml +++ b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml @@ -320,7 +320,7 @@ teardown: features: headers - do: - catch: /\[index_options\] field \[pruning_config\] should not be set if \[prune\] is false/ + catch: /\[index_options\] field \[pruning_config\] should only be set if \[prune\] is set to true/ headers: Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials Content-Type: application/json diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml index 99ac948e77a7b..cd102aebef62c 100644 --- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml +++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml @@ -651,7 +651,7 @@ teardown: features: headers - do: - catch: /\[index_options\] field \[pruning_config\] should not be set if \[prune\] is false/ + catch: /\[index_options\] field \[pruning_config\] should only be set if \[prune\] is set to true/ headers: Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. 
the test setup superuser Content-Type: application/json From 7f60eca701a0f79894abb7fa24c5f8fa40c18262 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Wed, 7 May 2025 09:00:30 -0400 Subject: [PATCH 43/86] Update docs/changelog/126739.yaml --- docs/changelog/126739.yaml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 docs/changelog/126739.yaml diff --git a/docs/changelog/126739.yaml b/docs/changelog/126739.yaml new file mode 100644 index 0000000000000..e33c598422e28 --- /dev/null +++ b/docs/changelog/126739.yaml @@ -0,0 +1,6 @@ +pr: 126739 +summary: Update `sparse_vector` field mapping to include default setting for token + pruning +area: Relevance +type: enhancement +issues: [] From d8f3c6328a3a429117fdcb7334665a89bafa3f57 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Wed, 7 May 2025 10:31:24 -0400 Subject: [PATCH 44/86] refactor equals for IndexOptions --- .../index/mapper/vectors/SparseVectorFieldMapper.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index 9bf04f8bece34..514a12cde40bd 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -454,10 +454,13 @@ public final boolean equals(Object other) { if (other == this) { return true; } - if (other instanceof IndexOptions otherOptions) { - return Objects.equals(prune, otherOptions.prune) && Objects.equals(pruningConfig, otherOptions.pruningConfig); + + if (other == null || getClass() != other.getClass()) { + return false; } - return false; + + IndexOptions otherAsIndexOptions = (IndexOptions) other; + return Objects.equals(prune, otherAsIndexOptions.prune) && Objects.equals(pruningConfig, otherAsIndexOptions.pruningConfig); } @Override From 
be783310c2a068c746ed501664990f06f8b143a0 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Wed, 21 May 2025 14:42:48 -0400 Subject: [PATCH 45/86] some cleanups; refactoring --- docs/changelog/126739.yaml | 2 +- .../xcontent/AbstractObjectParser.java | 9 ++ .../vectors/SparseVectorFieldMapper.java | 24 +++- .../mapper/vectors/TokenPruningConfig.java | 47 ++++++++ .../vectors/SparseVectorFieldMapperTests.java | 107 +++++++----------- .../ml/search/SparseVectorQueryBuilder.java | 9 +- .../test/multi_cluster/50_sparse_vector.yml | 23 ++-- .../test/remote_cluster/50_sparse_vector.yml | 23 ++-- .../test/ml/sparse_vector_search.yml | 14 +-- 9 files changed, 149 insertions(+), 109 deletions(-) diff --git a/docs/changelog/126739.yaml b/docs/changelog/126739.yaml index e33c598422e28..ff61327f9878e 100644 --- a/docs/changelog/126739.yaml +++ b/docs/changelog/126739.yaml @@ -1,6 +1,6 @@ pr: 126739 summary: Update `sparse_vector` field mapping to include default setting for token pruning -area: Relevance +area: Mapping type: enhancement issues: [] diff --git a/libs/x-content/src/main/java/org/elasticsearch/xcontent/AbstractObjectParser.java b/libs/x-content/src/main/java/org/elasticsearch/xcontent/AbstractObjectParser.java index 244e1270fe530..1a6bf51aea896 100644 --- a/libs/x-content/src/main/java/org/elasticsearch/xcontent/AbstractObjectParser.java +++ b/libs/x-content/src/main/java/org/elasticsearch/xcontent/AbstractObjectParser.java @@ -287,6 +287,15 @@ public void declareBoolean(BiConsumer consumer, ParseField field declareField(consumer, XContentParser::booleanValue, field, ValueType.BOOLEAN); } + public void declareBooleanOrNull(BiConsumer consumer, boolean nullValue, ParseField field) { + declareField( + consumer, + p -> p.currentToken() == XContentParser.Token.VALUE_NULL ? 
nullValue : p.booleanValue(), + field, + ValueType.BOOLEAN_OR_NULL + ); + } + public void declareObjectArray(BiConsumer> consumer, ContextParser objectParser, ParseField field) { declareFieldArray(consumer, objectParser, field, ValueType.OBJECT_ARRAY); } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index 514a12cde40bd..0ebfee833ee5a 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -78,7 +78,7 @@ public class SparseVectorFieldMapper extends FieldMapper { private final SparseVectorFieldMapper.IndexOptions indexOptions; - public static final NodeFeature SPARSE_VECTOR_INDEX_OPTIONS_FEATURE = new NodeFeature("sparse_vector_index_options_supported"); + public static final NodeFeature SPARSE_VECTOR_INDEX_OPTIONS_FEATURE = new NodeFeature("sparse_vector.index_options_supported"); private static SparseVectorFieldMapper toType(FieldMapper in) { return (SparseVectorFieldMapper) in; @@ -484,7 +484,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws } public static Boolean parseIndexOptionsPruneValue(Map indexOptionsMap) { - Object shouldPrune = indexOptionsMap.remove(IndexOptions.PRUNE_FIELD_NAME); + Object shouldPrune = indexOptionsMap.get(IndexOptions.PRUNE_FIELD_NAME); if (shouldPrune == null) { return null; } @@ -493,17 +493,31 @@ public static Boolean parseIndexOptionsPruneValue(Map indexOptio return boolValue; } - throw new MapperParsingException("[index_options] field [prune] should be true or false"); + throw new MapperParsingException( + "[" + + SPARSE_VECTOR_INDEX_OPTIONS + + "] field [" + + PRUNE_FIELD_NAME + + "] should be true or false" + ); } public static TokenPruningConfig parseIndexOptionsPruningConfig(Boolean prune, Map 
indexOptionsMap) { - Object pruningConfiguration = indexOptionsMap.remove(IndexOptions.PRUNING_CONFIG_FIELD_NAME); + Object pruningConfiguration = indexOptionsMap.get(IndexOptions.PRUNING_CONFIG_FIELD_NAME); if (pruningConfiguration == null) { return null; } if (prune == null || prune == false) { - throw new MapperParsingException("[index_options] field [pruning_config] should only be set if [prune] is set to true"); + throw new MapperParsingException( + "[" + + SPARSE_VECTOR_INDEX_OPTIONS + + "] field [" + + PRUNING_CONFIG_FIELD_NAME + +"] should only be set if [" + + PRUNE_FIELD_NAME + + "] is set to true" + ); } Map pruningConfigurationMap = XContentMapValues.nodeMapValue(pruningConfiguration, PRUNING_CONFIG_FIELD_NAME); diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java index 687dacdd72cb1..e0144e7856689 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java @@ -9,16 +9,22 @@ package org.elasticsearch.index.mapper.vectors; +import org.elasticsearch.ElasticsearchException; import org.elasticsearch.common.ParsingException; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.io.stream.Writeable; import org.elasticsearch.index.mapper.MapperParsingException; import org.elasticsearch.index.query.QueryBuilder; +import org.elasticsearch.xcontent.ConstructingObjectParser; +import org.elasticsearch.xcontent.DeprecationHandler; +import org.elasticsearch.xcontent.NamedXContentRegistry; import org.elasticsearch.xcontent.ParseField; import org.elasticsearch.xcontent.ToXContentObject; import org.elasticsearch.xcontent.XContentBuilder; import org.elasticsearch.xcontent.XContentParser; +import 
org.elasticsearch.xcontent.XContentType; +import org.elasticsearch.xcontent.support.MapXContentParser; import java.io.IOException; import java.util.Locale; @@ -26,6 +32,9 @@ import java.util.Objects; import java.util.Set; +import static org.elasticsearch.xcontent.ConstructingObjectParser.constructorArg; +import static org.elasticsearch.xcontent.ConstructingObjectParser.optionalConstructorArg; + public class TokenPruningConfig implements Writeable, ToXContentObject { public static final String PRUNING_CONFIG_FIELD = "pruning_config"; @@ -184,7 +193,42 @@ public static TokenPruningConfig fromXContent(XContentParser parser) throws IOEx return new TokenPruningConfig(ratioThreshold, weightThreshold, onlyScorePrunedTokens); } + private static final ConstructingObjectParser PARSER = new ConstructingObjectParser<>( + PRUNING_CONFIG_FIELD, + args -> new TokenPruningConfig( + args[0] == null ? DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD : (Float)args[0], + args[1] == null ? DEFAULT_TOKENS_WEIGHT_THRESHOLD : (Float)args[1], + args[2] != null && (Boolean) args[2] + ) + ); + + static { + PARSER.declareFloatOrNull(optionalConstructorArg(), DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD, TOKENS_FREQ_RATIO_THRESHOLD); + PARSER.declareFloatOrNull(optionalConstructorArg(), DEFAULT_TOKENS_WEIGHT_THRESHOLD, TOKENS_WEIGHT_THRESHOLD); + PARSER.declareBooleanOrNull(optionalConstructorArg(), false, ONLY_SCORE_PRUNED_TOKENS_FIELD); + } + public static TokenPruningConfig parseFromMap(Map pruningConfigMap) { + if (pruningConfigMap == null) { + return null; + } + + try { + XContentParser parser = new MapXContentParser(NamedXContentRegistry.EMPTY, + DeprecationHandler.IGNORE_DEPRECATIONS, + pruningConfigMap, + XContentType.JSON + ); + + return PARSER.parse(parser, null); + } catch (Exception exc) { + if (exc.getCause() != null && exc.getCause().getClass().equals(IllegalArgumentException.class)) { + throw new ElasticsearchException(exc.getCause()); + } + throw new ElasticsearchException(exc); + } + + /* Object 
mappedTokensFreqRatioThreshold = pruningConfigMap.remove(TOKENS_FREQ_RATIO_THRESHOLD.getPreferredName()); Object mappedTokensWeightThreshold = pruningConfigMap.remove(TOKENS_WEIGHT_THRESHOLD.getPreferredName()); Object mappedOnlyScorePrunedTokens = pruningConfigMap.remove(ONLY_SCORE_PRUNED_TOKENS_FIELD.getPreferredName()); @@ -202,8 +246,10 @@ public static TokenPruningConfig parseFromMap(Map pruningConfigM } return null; + */ } + /* private static Float parseFloatNumberFromObject(Object numberObject, String fieldName, String exceptionDetails) { if (numberObject instanceof Integer intValue) { return (float) intValue; @@ -276,4 +322,5 @@ private static boolean parseScorePrunedTokens(Object mappedScorePrunedTokens) { "[" + PRUNING_CONFIG_FIELD + "] field [" + ONLY_SCORE_PRUNED_TOKENS_FIELD.getPreferredName() + "] field should be true or false" ); } + */ } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index 2302017762cff..9906940a38841 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -127,78 +127,28 @@ public void testDefaults() throws Exception { DocumentMapper mapper = createDocumentMapper(fieldMapping(this::minimalMapping)); assertEquals(Strings.toString(fieldMapping(this::minimalMapping)), mapper.mappingSource().toString()); - ParsedDocument doc1 = mapper.parse(source(this::writeField)); + checkParsedDocument(mapper); + } - List fields = doc1.rootDoc().getFields("field"); - assertEquals(2, fields.size()); - assertThat(fields.get(0), Matchers.instanceOf(XFeatureField.class)); - XFeatureField featureField1 = null; - XFeatureField featureField2 = null; - for (IndexableField field : fields) { - if (field.stringValue().equals("ten")) { - featureField1 = 
(XFeatureField) field; - } else if (field.stringValue().equals("twenty")) { - featureField2 = (XFeatureField) field; - } else { - throw new UnsupportedOperationException(); - } - } + public void testDefaultsPreIndexOptions() throws Exception { + DocumentMapper mapper = getDocumentMapperPreviousVersion(fieldMapping(this::minimalMapping)); + assertEquals(Strings.toString(fieldMapping(this::minimalMapping)), mapper.mappingSource().toString()); - int freq1 = getFrequency(featureField1.tokenStream(null, null)); - int freq2 = getFrequency(featureField2.tokenStream(null, null)); - assertTrue(freq1 < freq2); + checkParsedDocument(mapper); } public void testWithIndexOptionsPrune() throws Exception { DocumentMapper mapper = createDocumentMapper(fieldMapping(this::mappingWithIndexOptionsPrune)); assertEquals(Strings.toString(fieldMapping(this::mappingWithIndexOptionsPrune)), mapper.mappingSource().toString()); - ParsedDocument doc1 = mapper.parse(source(this::writeField)); - - List fields = doc1.rootDoc().getFields("field"); - assertEquals(2, fields.size()); - assertThat(fields.get(0), Matchers.instanceOf(XFeatureField.class)); - XFeatureField featureField1 = null; - XFeatureField featureField2 = null; - for (IndexableField field : fields) { - if (field.stringValue().equals("ten")) { - featureField1 = (XFeatureField) field; - } else if (field.stringValue().equals("twenty")) { - featureField2 = (XFeatureField) field; - } else { - throw new UnsupportedOperationException(); - } - } - - int freq1 = getFrequency(featureField1.tokenStream(null, null)); - int freq2 = getFrequency(featureField2.tokenStream(null, null)); - assertTrue(freq1 < freq2); + checkParsedDocument(mapper); } public void testWithIndexOptionsPruningConfigOnly() throws Exception { DocumentMapper mapper = createDocumentMapper(fieldMapping(this::mappingWithIndexOptionsPruningConfig)); assertEquals(Strings.toString(fieldMapping(this::mappingWithIndexOptionsPruningConfig)), mapper.mappingSource().toString()); - 
ParsedDocument doc1 = mapper.parse(source(this::writeField)); - - List fields = doc1.rootDoc().getFields("field"); - assertEquals(2, fields.size()); - assertThat(fields.get(0), Matchers.instanceOf(XFeatureField.class)); - XFeatureField featureField1 = null; - XFeatureField featureField2 = null; - for (IndexableField field : fields) { - if (field.stringValue().equals("ten")) { - featureField1 = (XFeatureField) field; - } else if (field.stringValue().equals("twenty")) { - featureField2 = (XFeatureField) field; - } else { - throw new UnsupportedOperationException(); - } - } - - int freq1 = getFrequency(featureField1.tokenStream(null, null)); - int freq2 = getFrequency(featureField2.tokenStream(null, null)); - assertTrue(freq1 < freq2); + checkParsedDocument(mapper); } public void testDotInFieldName() throws Exception { @@ -349,7 +299,7 @@ public void testTokensFreqRatioCorrect() { assertThat( eTestInteger.getMessage(), containsString( - "Failed to parse mapping: [pruning_config] field [tokens_freq_ratio_threshold]field should be a number between 1 and 100" + "Failed to parse mapping: org.elasticsearch.xcontent.XContentParseException: [0:0] [pruning_config] failed to parse field [tokens_freq_ratio_threshold]" ) ); @@ -364,7 +314,7 @@ public void testTokensFreqRatioCorrect() { }))); assertThat( eTestRangeLower.getMessage(), - containsString("[pruning_config] field [tokens_freq_ratio_threshold] field should be a number between 1 and 100") + containsString("Failed to parse mapping: java.lang.IllegalArgumentException: [tokens_freq_ratio_threshold] must be between [1] and [100], got -2.0") ); Exception eTestRangeHigher = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { @@ -378,7 +328,7 @@ public void testTokensFreqRatioCorrect() { }))); assertThat( eTestRangeHigher.getMessage(), - containsString("[pruning_config] field [tokens_freq_ratio_threshold] field should be a number between 1 and 100") + containsString("Failed to parse mapping: 
java.lang.IllegalArgumentException: [tokens_freq_ratio_threshold] must be between [1] and [100], got 101") ); } @@ -395,7 +345,7 @@ public void testTokensWeightThresholdCorrect() { assertThat( eTestDouble.getMessage(), containsString( - "Failed to parse mapping: [pruning_config] field [tokens_weight_threshold]field should be a number between 0.0 and 1.0" + "Failed to parse mapping: org.elasticsearch.xcontent.XContentParseException: [0:0] [pruning_config] failed to parse field [tokens_weight_threshold]" ) ); @@ -410,7 +360,7 @@ public void testTokensWeightThresholdCorrect() { }))); assertThat( eTestRangeLower.getMessage(), - containsString("[pruning_config] field [tokens_weight_threshold] field should be a number between 0.0 and 1.0") + containsString("Failed to parse mapping: java.lang.IllegalArgumentException: [tokens_weight_threshold] must be between 0 and 1") ); Exception eTestRangeHigher = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { @@ -424,7 +374,7 @@ public void testTokensWeightThresholdCorrect() { }))); assertThat( eTestRangeHigher.getMessage(), - containsString("[pruning_config] field [tokens_weight_threshold] field should be a number between 0.0 and 1.0") + containsString("Failed to parse mapping: java.lang.IllegalArgumentException: [tokens_weight_threshold] must be between 0 and 1") ); } @@ -559,4 +509,33 @@ private Map toFloats(Map value) { } return result; } + + private void checkParsedDocument(DocumentMapper mapper) throws IOException { + ParsedDocument doc1 = mapper.parse(source(this::writeField)); + + List fields = doc1.rootDoc().getFields("field"); + assertEquals(2, fields.size()); + assertThat(fields.get(0), Matchers.instanceOf(XFeatureField.class)); + XFeatureField featureField1 = null; + XFeatureField featureField2 = null; + for (IndexableField field : fields) { + if (field.stringValue().equals("ten")) { + featureField1 = (XFeatureField) field; + } else if (field.stringValue().equals("twenty")) { + 
featureField2 = (XFeatureField) field; + } else { + throw new UnsupportedOperationException(); + } + } + + int freq1 = getFrequency(featureField1.tokenStream(null, null)); + int freq2 = getFrequency(featureField2.tokenStream(null, null)); + assertTrue(freq1 < freq2); + } + + private final IndexVersion PRE_SPARSE_VECTOR_INDEX_OPTIONS_VERSION = IndexVersions.DEFAULT_TO_ACORN_HNSW_FILTER_HEURISTIC; + + private DocumentMapper getDocumentMapperPreviousVersion(XContentBuilder mappings) throws IOException { + return createMapperService(PRE_SPARSE_VECTOR_INDEX_OPTIONS_VERSION, mappings).documentMapper(); + } } diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java index 80cec284189ff..4854c35e44334 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java @@ -165,8 +165,8 @@ public String getQuery() { return query; } - public boolean shouldPruneTokens() { - return shouldPruneTokens != null ? 
shouldPruneTokens : DEFAULT_PRUNE; + public Boolean shouldPruneTokens() { + return shouldPruneTokens; } public TokenPruningConfig getTokenPruningConfig() { @@ -481,10 +481,15 @@ private TokenPruningSet setPruningConfigFromIndexIfNeeded( boolean doPruneTokens = false; TokenPruningConfig setTokenPruningConfig = queryPruningConfig; if (queryPruneTokens == null || queryPruningConfig == null) { + if (queryPruneTokens != null) { + doPruneTokens = queryPruneTokens; + } + IndexFieldPruningSettings indexPruningSettings = getIndexFieldPruningSettings(fieldMapper); if (shouldPruneTokens == null && indexPruningSettings.prune != null && indexPruningSettings.prune) { doPruneTokens = true; } + if (setTokenPruningConfig == null && indexPruningSettings.pruningConfig != null) { setTokenPruningConfig = indexPruningSettings.pruningConfig; } diff --git a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml index 6c2f75c149ddf..d9b03619ac021 100644 --- a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml +++ b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml @@ -203,7 +203,7 @@ teardown: "Check sparse_vector token pruning index_options mappings": - requires: - cluster_features: 'sparse_vector_index_options_supported' + cluster_features: 'sparse_vector.index_options_supported' reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers @@ -241,12 +241,13 @@ teardown: - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.prune: true } - match: { 
sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_freq_ratio_threshold: 1.0 } - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_weight_threshold: 0.4 } + - not_exists: sparse_vector_pruning_test.mappings.properties.embeddings.index_options --- "Check sparse_vector token pruning index_options mappings defaults": - requires: - cluster_features: 'sparse_vector_index_options_supported' + cluster_features: 'sparse_vector.index_options_supported' reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers @@ -283,7 +284,7 @@ teardown: "Check sparse_vector token pruning index_options prune missing do not allow config": - requires: - cluster_features: 'sparse_vector_index_options_supported' + cluster_features: 'sparse_vector.index_options_supported' reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers @@ -306,8 +307,6 @@ teardown: pruning_config: tokens_freq_ratio_threshold: 1.0 tokens_weight_threshold: 0.4 - embeddings: - type: sparse_vector - match: { status: 400 } @@ -315,7 +314,7 @@ teardown: "Check sparse_vector token pruning index_options prune false do not allow config": - requires: - cluster_features: 'sparse_vector_index_options_supported' + cluster_features: 'sparse_vector.index_options_supported' reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers @@ -339,8 +338,6 @@ teardown: pruning_config: tokens_freq_ratio_threshold: 1.0 tokens_weight_threshold: 0.4 - embeddings: - type: sparse_vector - match: { status: 400 } @@ -348,7 +345,7 @@ teardown: "Check sparse_vector token pruning index_options tokens freq out of bounds": - requires: - cluster_features: 'sparse_vector_index_options_supported' + cluster_features: 'sparse_vector.index_options_supported' reason: "sparse_vector token pruning index options added support 
in 8.19" - skip: features: headers @@ -372,8 +369,6 @@ teardown: pruning_config: tokens_freq_ratio_threshold: 101.0 tokens_weight_threshold: 0.4 - embeddings: - type: sparse_vector - match: { status: 400 } @@ -381,7 +376,7 @@ teardown: "Check sparse_vector token pruning index_options tokens weight out of bounds": - requires: - cluster_features: 'sparse_vector_index_options_supported' + cluster_features: 'sparse_vector.index_options_supported' reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers @@ -405,8 +400,6 @@ teardown: pruning_config: tokens_freq_ratio_threshold: 5.0 tokens_weight_threshold: 3.5 - embeddings: - type: sparse_vector - match: { status: 400 } @@ -414,7 +407,7 @@ teardown: "Check sparse_vector token pruning index_options in query": - requires: - cluster_features: 'sparse_vector_index_options_supported' + cluster_features: 'sparse_vector.index_options_supported' reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers diff --git a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml index a7260461f7a01..1f7e7f6dbabc3 100644 --- a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml +++ b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml @@ -202,7 +202,7 @@ teardown: "Check sparse_vector token pruning index_options mappings": - requires: - cluster_features: 'sparse_vector_index_options_supported' + cluster_features: 'sparse_vector.index_options_supported' reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers @@ -240,12 +240,13 @@ teardown: - match: { 
sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.prune: true } - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_freq_ratio_threshold: 1.0 } - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_weight_threshold: 0.4 } + - not_exists: sparse_vector_pruning_test.mappings.properties.embeddings.index_options --- "Check sparse_vector token pruning index_options mappings defaults": - requires: - cluster_features: 'sparse_vector_index_options_supported' + cluster_features: 'sparse_vector.index_options_supported' reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers @@ -282,7 +283,7 @@ teardown: "Check sparse_vector token pruning index_options prune missing do not allow config": - requires: - cluster_features: 'sparse_vector_index_options_supported' + cluster_features: 'sparse_vector.index_options_supported' reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers @@ -305,8 +306,6 @@ teardown: pruning_config: tokens_freq_ratio_threshold: 1.0 tokens_weight_threshold: 0.4 - embeddings: - type: sparse_vector - match: { status: 400 } @@ -314,7 +313,7 @@ teardown: "Check sparse_vector token pruning index_options prune false do not allow config": - requires: - cluster_features: 'sparse_vector_index_options_supported' + cluster_features: 'sparse_vector.index_options_supported' reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers @@ -338,8 +337,6 @@ teardown: pruning_config: tokens_freq_ratio_threshold: 1.0 tokens_weight_threshold: 0.4 - embeddings: - type: sparse_vector - match: { status: 400 } @@ -347,7 +344,7 @@ teardown: "Check sparse_vector token pruning index_options tokens freq out of bounds": - requires: - cluster_features: 'sparse_vector_index_options_supported' + cluster_features: 
'sparse_vector.index_options_supported' reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers @@ -371,8 +368,6 @@ teardown: pruning_config: tokens_freq_ratio_threshold: 101.0 tokens_weight_threshold: 0.4 - embeddings: - type: sparse_vector - match: { status: 400 } @@ -380,7 +375,7 @@ teardown: "Check sparse_vector token pruning index_options tokens weight out of bounds": - requires: - cluster_features: 'sparse_vector_index_options_supported' + cluster_features: 'sparse_vector.index_options_supported' reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers @@ -404,8 +399,6 @@ teardown: pruning_config: tokens_freq_ratio_threshold: 5.0 tokens_weight_threshold: 3.5 - embeddings: - type: sparse_vector - match: { status: 400 } @@ -413,7 +406,7 @@ teardown: "Check sparse_vector token pruning index_options in query": - requires: - cluster_features: 'sparse_vector_index_options_supported' + cluster_features: 'sparse_vector.index_options_supported' reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml index cd102aebef62c..ce63fbd75869d 100644 --- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml +++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml @@ -533,7 +533,7 @@ teardown: "Check sparse_vector token pruning index_options mappings": - requires: - cluster_features: 'sparse_vector_index_options_supported' + cluster_features: 'sparse_vector.index_options_supported' reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers @@ -576,7 +576,7 @@ teardown: "Check sparse_vector token pruning index_options mappings defaults": - requires: - 
cluster_features: 'sparse_vector_index_options_supported' + cluster_features: 'sparse_vector.index_options_supported' reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers @@ -613,7 +613,7 @@ teardown: "Check sparse_vector token pruning index_options prune missing do not allow config": - requires: - cluster_features: 'sparse_vector_index_options_supported' + cluster_features: 'sparse_vector.index_options_supported' reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers @@ -645,7 +645,7 @@ teardown: "Check sparse_vector token pruning index_options prune false do not allow config": - requires: - cluster_features: 'sparse_vector_index_options_supported' + cluster_features: 'sparse_vector.index_options_supported' reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers @@ -678,7 +678,7 @@ teardown: "Check sparse_vector token pruning index_options tokens freq out of bounds": - requires: - cluster_features: 'sparse_vector_index_options_supported' + cluster_features: 'sparse_vector.index_options_supported' reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers @@ -711,7 +711,7 @@ teardown: "Check sparse_vector token pruning index_options tokens weight out of bounds": - requires: - cluster_features: 'sparse_vector_index_options_supported' + cluster_features: 'sparse_vector.index_options_supported' reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers @@ -744,7 +744,7 @@ teardown: "Check sparse_vector token pruning index_options in query": - requires: - cluster_features: 'sparse_vector_index_options_supported' + cluster_features: 'sparse_vector.index_options_supported' reason: "sparse_vector token pruning index options added support in 8.19" - skip: features: headers From a0cc202965bdc3eb40e31bc4a230febc53e9cc66 Mon Sep 17 00:00:00 2001 From: 
elasticsearchmachine Date: Wed, 21 May 2025 18:52:10 +0000 Subject: [PATCH 46/86] [CI] Auto commit changes from spotless --- .../vectors/SparseVectorFieldMapper.java | 18 +++++++----------- .../mapper/vectors/TokenPruningConfig.java | 9 ++++----- .../vectors/SparseVectorFieldMapperTests.java | 8 ++++++-- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index 0ebfee833ee5a..a083f1cd9c72e 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -494,11 +494,7 @@ public static Boolean parseIndexOptionsPruneValue(Map indexOptio } throw new MapperParsingException( - "[" - + SPARSE_VECTOR_INDEX_OPTIONS - + "] field [" - + PRUNE_FIELD_NAME - + "] should be true or false" + "[" + SPARSE_VECTOR_INDEX_OPTIONS + "] field [" + PRUNE_FIELD_NAME + "] should be true or false" ); } @@ -511,12 +507,12 @@ public static TokenPruningConfig parseIndexOptionsPruningConfig(Boolean prune, M if (prune == null || prune == false) { throw new MapperParsingException( "[" - + SPARSE_VECTOR_INDEX_OPTIONS - + "] field [" - + PRUNING_CONFIG_FIELD_NAME - +"] should only be set if [" - + PRUNE_FIELD_NAME - + "] is set to true" + + SPARSE_VECTOR_INDEX_OPTIONS + + "] field [" + + PRUNING_CONFIG_FIELD_NAME + + "] should only be set if [" + + PRUNE_FIELD_NAME + + "] is set to true" ); } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java index e0144e7856689..f02ad7fde6301 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java +++ 
b/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java @@ -14,7 +14,6 @@ import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.io.stream.Writeable; -import org.elasticsearch.index.mapper.MapperParsingException; import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.xcontent.ConstructingObjectParser; import org.elasticsearch.xcontent.DeprecationHandler; @@ -32,7 +31,6 @@ import java.util.Objects; import java.util.Set; -import static org.elasticsearch.xcontent.ConstructingObjectParser.constructorArg; import static org.elasticsearch.xcontent.ConstructingObjectParser.optionalConstructorArg; public class TokenPruningConfig implements Writeable, ToXContentObject { @@ -196,8 +194,8 @@ public static TokenPruningConfig fromXContent(XContentParser parser) throws IOEx private static final ConstructingObjectParser PARSER = new ConstructingObjectParser<>( PRUNING_CONFIG_FIELD, args -> new TokenPruningConfig( - args[0] == null ? DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD : (Float)args[0], - args[1] == null ? DEFAULT_TOKENS_WEIGHT_THRESHOLD : (Float)args[1], + args[0] == null ? DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD : (Float) args[0], + args[1] == null ? 
DEFAULT_TOKENS_WEIGHT_THRESHOLD : (Float) args[1], args[2] != null && (Boolean) args[2] ) ); @@ -214,7 +212,8 @@ public static TokenPruningConfig parseFromMap(Map pruningConfigM } try { - XContentParser parser = new MapXContentParser(NamedXContentRegistry.EMPTY, + XContentParser parser = new MapXContentParser( + NamedXContentRegistry.EMPTY, DeprecationHandler.IGNORE_DEPRECATIONS, pruningConfigMap, XContentType.JSON diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index 9906940a38841..a6d50b5ada95e 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -314,7 +314,9 @@ public void testTokensFreqRatioCorrect() { }))); assertThat( eTestRangeLower.getMessage(), - containsString("Failed to parse mapping: java.lang.IllegalArgumentException: [tokens_freq_ratio_threshold] must be between [1] and [100], got -2.0") + containsString( + "Failed to parse mapping: java.lang.IllegalArgumentException: [tokens_freq_ratio_threshold] must be between [1] and [100], got -2.0" + ) ); Exception eTestRangeHigher = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { @@ -328,7 +330,9 @@ public void testTokensFreqRatioCorrect() { }))); assertThat( eTestRangeHigher.getMessage(), - containsString("Failed to parse mapping: java.lang.IllegalArgumentException: [tokens_freq_ratio_threshold] must be between [1] and [100], got 101") + containsString( + "Failed to parse mapping: java.lang.IllegalArgumentException: [tokens_freq_ratio_threshold] must be between [1] and [100], got 101" + ) ); } From 223a7945a1c89af0707d5e6a5cb8b14381173bd4 Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Wed, 21 May 2025 16:12:54 -0400 Subject: [PATCH 47/86] fix yaml tests --- .../mapper/vectors/TokenPruningConfig.java | 96 ------------------- .../test/multi_cluster/50_sparse_vector.yml | 4 +- .../test/remote_cluster/50_sparse_vector.yml | 4 +- .../test/ml/sparse_vector_search.yml | 4 +- 4 files changed, 6 insertions(+), 102 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java index f02ad7fde6301..de9250b4b1d41 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java @@ -226,100 +226,4 @@ public static TokenPruningConfig parseFromMap(Map pruningConfigM } throw new ElasticsearchException(exc); } - - /* - Object mappedTokensFreqRatioThreshold = pruningConfigMap.remove(TOKENS_FREQ_RATIO_THRESHOLD.getPreferredName()); - Object mappedTokensWeightThreshold = pruningConfigMap.remove(TOKENS_WEIGHT_THRESHOLD.getPreferredName()); - Object mappedOnlyScorePrunedTokens = pruningConfigMap.remove(ONLY_SCORE_PRUNED_TOKENS_FIELD.getPreferredName()); - - if (pruningConfigMap.isEmpty() == false) { - throw new MapperParsingException("[" + PRUNING_CONFIG_FIELD + "] has unknown fields"); - } - - Float tokensFreqRatioThreshold = parseTokensFreqRatioThreshold(mappedTokensFreqRatioThreshold); - Float tokensWeightThreshold = parseTokensWeightThreshold(mappedTokensWeightThreshold); - boolean onlyScorePrunedTokens = mappedOnlyScorePrunedTokens != null ? 
parseScorePrunedTokens(mappedOnlyScorePrunedTokens) : false; - - if (tokensFreqRatioThreshold != null || tokensWeightThreshold != null) { - return new TokenPruningConfig(tokensFreqRatioThreshold, tokensWeightThreshold, onlyScorePrunedTokens); - } - - return null; - */ - } - - /* - private static Float parseFloatNumberFromObject(Object numberObject, String fieldName, String exceptionDetails) { - if (numberObject instanceof Integer intValue) { - return (float) intValue; - } else if (numberObject instanceof Float floatValue) { - return floatValue; - } else if (numberObject instanceof Double doubleValue) { - return ((Double) numberObject).floatValue(); - } - - throw new MapperParsingException("[" + PRUNING_CONFIG_FIELD + "] field [" + fieldName + "]" + exceptionDetails); - } - - private static Float parseTokensWeightThreshold(Object mappedTokensWeightThreshold) { - if (mappedTokensWeightThreshold == null) { - return DEFAULT_TOKENS_WEIGHT_THRESHOLD; - } - - Float tokensWeightThreshold = parseFloatNumberFromObject( - mappedTokensWeightThreshold, - TOKENS_WEIGHT_THRESHOLD.getPreferredName(), - "field should be a number between 0.0 and 1.0" - ); - - if (tokensWeightThreshold < MIN_TOKENS_WEIGHT_THRESHOLD || tokensWeightThreshold > MAX_TOKENS_WEIGHT_THRESHOLD) { - throw new MapperParsingException( - "[" - + PRUNING_CONFIG_FIELD - + "] field [" - + TOKENS_WEIGHT_THRESHOLD.getPreferredName() - + "] field should be a number between 0.0 and 1.0" - ); - } - return tokensWeightThreshold; - } - - private static Float parseTokensFreqRatioThreshold(Object mappedTokensFreqRatioThreshold) { - if (mappedTokensFreqRatioThreshold == null) { - return DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD; - } - - Float tokensFreqRatioThreshold = parseFloatNumberFromObject( - mappedTokensFreqRatioThreshold, - TOKENS_FREQ_RATIO_THRESHOLD.getPreferredName(), - "field should be a number between 1 and 100" - ); - - if (tokensFreqRatioThreshold < MIN_TOKENS_FREQ_RATIO_THRESHOLD || tokensFreqRatioThreshold > 
MAX_TOKENS_FREQ_RATIO_THRESHOLD) { - throw new MapperParsingException( - "[" - + PRUNING_CONFIG_FIELD - + "] field [" - + TOKENS_FREQ_RATIO_THRESHOLD.getPreferredName() - + "] field should be a number between 1 and 100" - ); - } - - return tokensFreqRatioThreshold; - } - - private static boolean parseScorePrunedTokens(Object mappedScorePrunedTokens) { - if (mappedScorePrunedTokens == null) { - return false; - } - - if (mappedScorePrunedTokens instanceof Boolean boolValue) { - return boolValue; - } - - throw new MapperParsingException( - "[" + PRUNING_CONFIG_FIELD + "] field [" + ONLY_SCORE_PRUNED_TOKENS_FIELD.getPreferredName() + "] field should be true or false" - ); - } - */ } diff --git a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml index d9b03619ac021..cf30014216ffc 100644 --- a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml +++ b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml @@ -351,7 +351,7 @@ teardown: features: headers - do: - catch: /\[pruning_config\] field \[tokens_freq_ratio_threshold\] field should be a number between 1 and 100/ + catch: /\[tokens_freq_ratio_threshold\] must be between \[1\] and \[100\]/ headers: Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials Content-Type: application/json @@ -382,7 +382,7 @@ teardown: features: headers - do: - catch: /\[pruning_config\] field \[tokens_weight_threshold\] field should be a number between 0.0 and 1.0/ + catch: /\[tokens_weight_threshold\] must be between 0 and 1/ headers: Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials Content-Type: application/json diff --git 
a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml index 1f7e7f6dbabc3..b8fe638fdd0a7 100644 --- a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml +++ b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml @@ -350,7 +350,7 @@ teardown: features: headers - do: - catch: /\[pruning_config\] field \[tokens_freq_ratio_threshold\] field should be a number between 1 and 100/ + catch: /\[tokens_freq_ratio_threshold\] must be between \[1\] and \[100\]/ headers: Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials Content-Type: application/json @@ -381,7 +381,7 @@ teardown: features: headers - do: - catch: /\[pruning_config\] field \[tokens_weight_threshold\] field should be a number between 0.0 and 1.0/ + catch: /\[tokens_weight_threshold\] must be between 0 and 1/ headers: Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials Content-Type: application/json diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml index ce63fbd75869d..1afede3bf247f 100644 --- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml +++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml @@ -684,7 +684,7 @@ teardown: features: headers - do: - catch: /\[pruning_config\] field \[tokens_freq_ratio_threshold\] field should be a number between 1 and 100/ + catch: /\[tokens_freq_ratio_threshold\] must be between \[1\] and \[100\]/ headers: Authorization: "Basic 
eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser Content-Type: application/json @@ -717,7 +717,7 @@ teardown: features: headers - do: - catch: /\[pruning_config\] field \[tokens_weight_threshold\] field should be a number between 0.0 and 1.0/ + catch: /\[tokens_weight_threshold\] must be between 0 and 1/ headers: Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser Content-Type: application/json From dd4a218ed3d5b7f6c92d831181951d8d28f61fb9 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Wed, 21 May 2025 16:26:15 -0400 Subject: [PATCH 48/86] fix missing } --- .../index/mapper/vectors/TokenPruningConfig.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java index de9250b4b1d41..f05c31b5cc85f 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java @@ -212,8 +212,7 @@ public static TokenPruningConfig parseFromMap(Map pruningConfigM } try { - XContentParser parser = new MapXContentParser( - NamedXContentRegistry.EMPTY, + XContentParser parser = new MapXContentParser(NamedXContentRegistry.EMPTY, DeprecationHandler.IGNORE_DEPRECATIONS, pruningConfigMap, XContentType.JSON @@ -226,4 +225,5 @@ public static TokenPruningConfig parseFromMap(Map pruningConfigM } throw new ElasticsearchException(exc); } + } } From d60e2df92eb57f4b0b52ba2b04b142bd1b0b3818 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Wed, 21 May 2025 20:34:58 +0000 Subject: [PATCH 49/86] [CI] Auto commit changes from spotless --- .../elasticsearch/index/mapper/vectors/TokenPruningConfig.java | 3 ++- 1 file changed, 2 insertions(+), 1 
deletion(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java index f05c31b5cc85f..c797a0e86395d 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java @@ -212,7 +212,8 @@ public static TokenPruningConfig parseFromMap(Map pruningConfigM } try { - XContentParser parser = new MapXContentParser(NamedXContentRegistry.EMPTY, + XContentParser parser = new MapXContentParser( + NamedXContentRegistry.EMPTY, DeprecationHandler.IGNORE_DEPRECATIONS, pruningConfigMap, XContentType.JSON From a3bb90427f6b7e3c7a11fb107bd71c083a678416 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Wed, 21 May 2025 20:36:46 -0400 Subject: [PATCH 50/86] more cleanups; need to complete tests --- .../vectors/SparseVectorFieldMapper.java | 5 +- .../ml/search/SparseVectorQueryBuilder.java | 115 ++------------- .../search/SparseVectorQueryBuilderTests.java | 136 ++++++++++++------ 3 files changed, 109 insertions(+), 147 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index a083f1cd9c72e..b4578fa6a8cb1 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -76,8 +76,6 @@ public class SparseVectorFieldMapper extends FieldMapper { public static final IndexVersion SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION = IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT; - private final SparseVectorFieldMapper.IndexOptions indexOptions; - public static final NodeFeature SPARSE_VECTOR_INDEX_OPTIONS_FEATURE = new 
NodeFeature("sparse_vector.index_options_supported"); private static SparseVectorFieldMapper toType(FieldMapper in) { @@ -127,7 +125,7 @@ public SparseVectorFieldMapper build(MapperBuilderContext context) { } public IndexOptions getIndexOptions() { - return this.indexOptions; + return fieldType().getIndexOptions(); } private static SparseVectorFieldMapper.IndexOptions parseIndexOptions(MappingParserContext context, Object propNode) { @@ -223,7 +221,6 @@ private static String indexedValueForSearch(Object value) { private SparseVectorFieldMapper(String simpleName, MappedFieldType mappedFieldType, BuilderParams builderParams) { super(simpleName, mappedFieldType, builderParams); - this.indexOptions = ((SparseVectorFieldType) mappedFieldType).getIndexOptions(); } @Override diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java index 4854c35e44334..bf3c99dc09d96 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java @@ -238,20 +238,16 @@ protected Query doToQuery(SearchExecutionContext context) throws IOException { Boolean pruneTokensToUse = shouldPruneTokens; TokenPruningConfig pruningConfigToUse = tokenPruningConfig; - // if the query options for pruning are not set, - // we need to check the index options for this field - // and use those if set - however, only if the index - // was created after we added this support. 
- if (context.indexVersionCreated().onOrAfter(SparseVectorFieldMapper.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION)) { - SparseVectorFieldMapper sparseVectorFieldMapper = getSparseVectorFieldMapperForSearchExecution(fieldName, context); - TokenPruningSet pruningOptions = setPruningConfigFromIndexIfNeeded( - shouldPruneTokens, - tokenPruningConfig, - sparseVectorFieldMapper - ); - - pruneTokensToUse = pruningOptions.pruneTokens; - pruningConfigToUse = pruningOptions.pruningConfig; + // if the query options for pruning are not set, we need to check the index options for this field + // and use those if set - however, only if the index was created after we added this support. + if (ft.getClass().equals(SparseVectorFieldMapper.SparseVectorFieldType.class) && + context.indexVersionCreated().onOrAfter(SparseVectorFieldMapper.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION)) { + SparseVectorFieldMapper.SparseVectorFieldType asSVFieldType = (SparseVectorFieldMapper.SparseVectorFieldType) ft; + + if (asSVFieldType.getIndexOptions() != null) { + pruneTokensToUse = pruneTokensToUse == null ? asSVFieldType.getIndexOptions().getPrune() : pruneTokensToUse; + pruningConfigToUse = pruningConfigToUse == null ? asSVFieldType.getIndexOptions().getPruningConfig() : pruningConfigToUse; + } } return (pruneTokensToUse != null && pruneTokensToUse) @@ -271,23 +267,13 @@ protected QueryBuilder doRewrite(QueryRewriteContext queryRewriteContext) { return this; // No results yet } - // if the query options for pruning are not set, - // we need to check the index options for this field - // and use those if set. 
- SparseVectorFieldMapper sparseVectorFieldMapper = getSparseVectorFieldMapperForQueryRewrite(fieldName, queryRewriteContext); - TokenPruningSet pruningOptions = setPruningConfigFromIndexIfNeeded( - shouldPruneTokens, - tokenPruningConfig, - sparseVectorFieldMapper - ); - return new SparseVectorQueryBuilder( fieldName, textExpansionResults.getWeightedTokens(), null, null, - pruningOptions.pruneTokens, - pruningOptions.pruningConfig + shouldPruneTokens, + tokenPruningConfig ); } else if (inferenceId == null) { // Edge case, where inference_id was not specified in the request, @@ -422,81 +408,4 @@ public static SparseVectorQueryBuilder fromXContent(XContentParser parser) { throw new ParsingException(parser.getTokenLocation(), e.getMessage(), e); } } - - private record IndexFieldPruningSettings(@Nullable Boolean prune, @Nullable TokenPruningConfig pruningConfig) {} - - private IndexFieldPruningSettings getIndexFieldPruningSettings(SparseVectorFieldMapper sparseVectorFieldMapper) { - if (sparseVectorFieldMapper == null) { - return new IndexFieldPruningSettings(null, null); - } - - SparseVectorFieldMapper.IndexOptions indexOptions = sparseVectorFieldMapper.getIndexOptions(); - if (indexOptions == null) { - // return the default if not set in the index options - return new IndexFieldPruningSettings( - true, - new TokenPruningConfig( - TokenPruningConfig.DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD, - TokenPruningConfig.DEFAULT_TOKENS_WEIGHT_THRESHOLD, - false - ) - ); - } - - Boolean indexOptionsPrune = indexOptions.getPrune(); - - TokenPruningConfig indexPruningConfig = indexOptions.getPruningConfig(); - TokenPruningConfig indexTokenPruningConfig = indexPruningConfig != null - ? 
new TokenPruningConfig(indexPruningConfig.getTokensFreqRatioThreshold(), indexPruningConfig.getTokensWeightThreshold(), false) - : null; - - return new IndexFieldPruningSettings(indexOptionsPrune, indexTokenPruningConfig); - } - - private SparseVectorFieldMapper getSparseVectorFieldMapper(String fieldName, Mapping fieldMapping) { - RootObjectMapper rootMapping = fieldMapping.getRoot(); - Mapper thisMapper = rootMapping.getMapper(fieldName); - if (thisMapper instanceof SparseVectorFieldMapper) { - return (SparseVectorFieldMapper) thisMapper; - } - - return null; - } - - private SparseVectorFieldMapper getSparseVectorFieldMapperForSearchExecution(String fieldName, SearchExecutionContext context) { - return getSparseVectorFieldMapper(fieldName, context.getMappingLookup().getMapping()); - } - - private SparseVectorFieldMapper getSparseVectorFieldMapperForQueryRewrite(String fieldName, QueryRewriteContext context) { - return getSparseVectorFieldMapper(fieldName, context.getMappingLookup().getMapping()); - } - - private record TokenPruningSet(boolean pruneTokens, TokenPruningConfig pruningConfig) {} - - private TokenPruningSet setPruningConfigFromIndexIfNeeded( - Boolean queryPruneTokens, - TokenPruningConfig queryPruningConfig, - SparseVectorFieldMapper fieldMapper - ) { - boolean doPruneTokens = false; - TokenPruningConfig setTokenPruningConfig = queryPruningConfig; - if (queryPruneTokens == null || queryPruningConfig == null) { - if (queryPruneTokens != null) { - doPruneTokens = queryPruneTokens; - } - - IndexFieldPruningSettings indexPruningSettings = getIndexFieldPruningSettings(fieldMapper); - if (shouldPruneTokens == null && indexPruningSettings.prune != null && indexPruningSettings.prune) { - doPruneTokens = true; - } - - if (setTokenPruningConfig == null && indexPruningSettings.pruningConfig != null) { - setTokenPruningConfig = indexPruningSettings.pruningConfig; - } - } else { - doPruneTokens = queryPruneTokens; - } - - return new TokenPruningSet(doPruneTokens, 
setTokenPruningConfig); - } } diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java index e3058853cf139..978ba404b7b1c 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java @@ -43,6 +43,10 @@ import org.elasticsearch.xpack.core.ml.inference.results.TextExpansionResults; import java.io.IOException; +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; import java.lang.reflect.Method; import java.util.ArrayList; import java.util.Collection; @@ -60,7 +64,10 @@ public class SparseVectorQueryBuilderTests extends AbstractQueryTestCase WEIGHTED_TOKENS = List.of(new WeightedToken("foo", .42f)); private static final int NUM_TOKENS = WEIGHTED_TOKENS.size(); - private boolean testWithSparseVectorFieldIndexOptions = false; + @Retention(RetentionPolicy.RUNTIME) + @Target(ElementType.METHOD) + public @interface InjectSparseVectorIndexOptions { + } @Override protected SparseVectorQueryBuilder doCreateTestQueryBuilder() { @@ -148,7 +155,7 @@ protected Object simulateMethod(Method method, Object[] args) { @Override protected void initializeAdditionalMappings(MapperService mapperService) throws IOException { - if (testWithSparseVectorFieldIndexOptions) { + if (shouldInjectSparseVectorIndexOptions()) { addSparseVectorIndexOptionsMapping(mapperService); return; } @@ -160,6 +167,18 @@ protected void initializeAdditionalMappings(MapperService mapperService) throws ); } + private boolean shouldInjectSparseVectorIndexOptions() { + Class clazz = this.getClass(); + Class injectSparseVectorIndexOptions = 
InjectSparseVectorIndexOptions.class; + + try { + Method method = clazz.getMethod(this.getTestName()); + return method.isAnnotationPresent(injectSparseVectorIndexOptions); + } catch (NoSuchMethodException e) { + return false; + } + } + private void addSparseVectorIndexOptionsMapping(MapperService mapperService) throws IOException { String addIndexOptionsTemplate = "{\"properties\":{\"" + SPARSE_VECTOR_FIELD @@ -188,24 +207,34 @@ protected void doAssertLuceneQuery(SparseVectorQueryBuilder queryBuilder, Query } } - /** - * Overridden to ensure that {@link SearchExecutionContext} has a non-null {@link IndexReader} - */ - @Override - public void testCacheability() throws IOException { + private void withSearchIndex(Consumer consumer) throws IOException { try (Directory directory = newDirectory(); RandomIndexWriter iw = new RandomIndexWriter(random(), directory)) { Document document = new Document(); document.add(new FloatDocValuesField(SPARSE_VECTOR_FIELD, 1.0f)); iw.addDocument(document); try (IndexReader reader = iw.getReader()) { SearchExecutionContext context = createSearchExecutionContext(newSearcher(reader)); - SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(); - QueryBuilder rewriteQuery = rewriteQuery(queryBuilder, new SearchExecutionContext(context)); + consumer.accept(context); + } + } + } + /** + * Overridden to ensure that {@link SearchExecutionContext} has a non-null {@link IndexReader} + */ + @Override + public void testCacheability() throws IOException { + withSearchIndex((context) -> { + try { + SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(); + QueryBuilder rewriteQuery = null; + rewriteQuery = rewriteQuery(queryBuilder, new SearchExecutionContext(context)); assertNotNull(rewriteQuery.toQuery(context)); assertTrue("query should be cacheable: " + queryBuilder.toString(), context.isCacheable()); + } catch (IOException e) { + throw new RuntimeException(e); } - } + }); } /** @@ -213,16 +242,14 @@ public void 
testCacheability() throws IOException { */ @Override public void testMustRewrite() throws IOException { - try (Directory directory = newDirectory(); RandomIndexWriter iw = new RandomIndexWriter(random(), directory)) { - Document document = new Document(); - document.add(new FloatDocValuesField(SPARSE_VECTOR_FIELD, 1.0f)); - iw.addDocument(document); - try (IndexReader reader = iw.getReader()) { - SearchExecutionContext context = createSearchExecutionContext(newSearcher(reader)); + withSearchIndex((context) -> { + try { SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(); queryBuilder.toQuery(context); + } catch (IOException e) { + throw new RuntimeException(e); } - } + }); } /** @@ -230,12 +257,8 @@ public void testMustRewrite() throws IOException { */ @Override public void testToQuery() throws IOException { - try (Directory directory = newDirectory(); RandomIndexWriter iw = new RandomIndexWriter(random(), directory)) { - Document document = new Document(); - document.add(new FloatDocValuesField(SPARSE_VECTOR_FIELD, 1.0f)); - iw.addDocument(document); - try (IndexReader reader = iw.getReader()) { - SearchExecutionContext context = createSearchExecutionContext(newSearcher(reader)); + withSearchIndex((context) -> { + try { SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(); if (queryBuilder.getQueryVectors() == null) { QueryBuilder rewrittenQueryBuilder = rewriteAndFetch(queryBuilder, context); @@ -244,8 +267,10 @@ public void testToQuery() throws IOException { } else { testDoToQuery(queryBuilder, context); } + } catch (IOException e) { + throw new RuntimeException(e); } - } + }); } private void testDoToQuery(SparseVectorQueryBuilder queryBuilder, SearchExecutionContext context) throws IOException { @@ -357,26 +382,57 @@ public void testThatWeCorrectlyRewriteQueryIntoVectors() { assertNotNull(((SparseVectorQueryBuilder) rewrittenQueryBuilder).getQueryVectors()); } - public void testWeCorrectlyRewriteQueryIntoVectorsWithIndexOptions() { - 
wrapTestSparseVectorIndexOptions((c) -> { - SearchExecutionContext searchExecutionContext = createSearchExecutionContext(); + @InjectSparseVectorIndexOptions + public void testItUsesIndexOptionsDefaults() throws IOException { + withSearchIndex((context) -> { + try { + SparseVectorQueryBuilder builder = new SparseVectorQueryBuilder( + SPARSE_VECTOR_FIELD, + WEIGHTED_TOKENS, + null, + null, + null, + null + ); + Query query = builder.doToQuery(context); + + } catch (IOException ex) { + throw new RuntimeException(ex); + } + }); + } - TokenPruningConfig defaultTokenPruningConfig = new TokenPruningConfig(12, 0.6f, false); + @InjectSparseVectorIndexOptions + public void testItOverridesIndexOptionsDefaults() { - SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(null); - QueryBuilder rewrittenQueryBuilder = rewriteAndFetch(queryBuilder, searchExecutionContext); - assertTrue(rewrittenQueryBuilder instanceof SparseVectorQueryBuilder); - assertEquals(queryBuilder.shouldPruneTokens(), ((SparseVectorQueryBuilder) rewrittenQueryBuilder).shouldPruneTokens()); - assertNotNull(((SparseVectorQueryBuilder) rewrittenQueryBuilder).getQueryVectors()); + } + + @InjectSparseVectorIndexOptions + public void testToQueryWithIndexOptions() throws IOException { + withSearchIndex((context) -> { + SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(); + try { + if (queryBuilder.getQueryVectors() == null) { + QueryBuilder rewrittenQueryBuilder = rewriteAndFetch(queryBuilder, context); + assertTrue(rewrittenQueryBuilder instanceof SparseVectorQueryBuilder); + testDoToQuery((SparseVectorQueryBuilder) rewrittenQueryBuilder, context); + } else { + testDoToQuery(queryBuilder, context); + } + } catch (IOException e) { + throw new RuntimeException(e); + } }); } - private void wrapTestSparseVectorIndexOptions(Consumer testMethod) { - testWithSparseVectorFieldIndexOptions = true; - try { - testMethod.accept(true); - } finally { - testWithSparseVectorFieldIndexOptions = false; - } + 
@InjectSparseVectorIndexOptions + public void testWeCorrectlyRewriteQueryIntoVectorsWithIndexOptions() { + SearchExecutionContext searchExecutionContext = createSearchExecutionContext(); + + SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(null); + QueryBuilder rewrittenQueryBuilder = rewriteAndFetch(queryBuilder, searchExecutionContext); + assertTrue(rewrittenQueryBuilder instanceof SparseVectorQueryBuilder); + assertEquals(queryBuilder.shouldPruneTokens(), ((SparseVectorQueryBuilder) rewrittenQueryBuilder).shouldPruneTokens()); + assertNotNull(((SparseVectorQueryBuilder) rewrittenQueryBuilder).getQueryVectors()); } } From 4a23c9cb831c70ed902f03b1e72369a91c7988cc Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Thu, 22 May 2025 00:45:53 +0000 Subject: [PATCH 51/86] [CI] Auto commit changes from spotless --- .../xpack/core/ml/search/SparseVectorQueryBuilder.java | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java index bf3c99dc09d96..01800174b138c 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java @@ -18,9 +18,6 @@ import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.core.Nullable; import org.elasticsearch.index.mapper.MappedFieldType; -import org.elasticsearch.index.mapper.Mapper; -import org.elasticsearch.index.mapper.Mapping; -import org.elasticsearch.index.mapper.RootObjectMapper; import org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper; import org.elasticsearch.index.mapper.vectors.TokenPruningConfig; import org.elasticsearch.index.query.AbstractQueryBuilder; @@ -240,8 +237,8 @@ protected Query 
doToQuery(SearchExecutionContext context) throws IOException { // if the query options for pruning are not set, we need to check the index options for this field // and use those if set - however, only if the index was created after we added this support. - if (ft.getClass().equals(SparseVectorFieldMapper.SparseVectorFieldType.class) && - context.indexVersionCreated().onOrAfter(SparseVectorFieldMapper.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION)) { + if (ft.getClass().equals(SparseVectorFieldMapper.SparseVectorFieldType.class) + && context.indexVersionCreated().onOrAfter(SparseVectorFieldMapper.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION)) { SparseVectorFieldMapper.SparseVectorFieldType asSVFieldType = (SparseVectorFieldMapper.SparseVectorFieldType) ft; if (asSVFieldType.getIndexOptions() != null) { From e2e65db8e9922216519c52fcfe71967d8a48d468 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Thu, 22 May 2025 13:09:43 -0400 Subject: [PATCH 52/86] additional tests and refactoring --- .../ml/search/SparseVectorQueryBuilder.java | 9 ++- .../search/SparseVectorQueryBuilderTests.java | 73 ++++++++----------- 2 files changed, 38 insertions(+), 44 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java index 01800174b138c..3d1a0cee20ebc 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java @@ -42,6 +42,7 @@ import java.util.Map; import java.util.Objects; +import static org.elasticsearch.TransportVersions.SPARSE_VECTOR_FIELD_PRUNING_OPTIONS_8_19; import static org.elasticsearch.xcontent.ConstructingObjectParser.constructorArg; import static org.elasticsearch.xcontent.ConstructingObjectParser.optionalConstructorArg; import static 
org.elasticsearch.xpack.core.ClientHelper.ML_ORIGIN; @@ -124,7 +125,9 @@ public SparseVectorQueryBuilder( public SparseVectorQueryBuilder(StreamInput in) throws IOException { super(in); this.fieldName = in.readString(); - if (in.getTransportVersion().onOrAfter(TransportVersions.SPARSE_VECTOR_FIELD_PRUNING_OPTIONS)) { + if (in.getTransportVersion().isPatchFrom(SPARSE_VECTOR_FIELD_PRUNING_OPTIONS_8_19) || + in.getTransportVersion().onOrAfter(TransportVersions.SPARSE_VECTOR_FIELD_PRUNING_OPTIONS) + ) { this.shouldPruneTokens = in.readOptionalBoolean(); } else { this.shouldPruneTokens = in.readBoolean(); @@ -177,7 +180,9 @@ protected void doWriteTo(StreamOutput out) throws IOException { } out.writeString(fieldName); - if (out.getTransportVersion().onOrAfter(TransportVersions.SPARSE_VECTOR_FIELD_PRUNING_OPTIONS)) { + if (out.getTransportVersion().isPatchFrom(SPARSE_VECTOR_FIELD_PRUNING_OPTIONS_8_19) + || out.getTransportVersion().onOrAfter(TransportVersions.SPARSE_VECTOR_FIELD_PRUNING_OPTIONS) + ) { out.writeOptionalBoolean(shouldPruneTokens); } else { out.writeBoolean(shouldPruneTokens != null && shouldPruneTokens); diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java index 978ba404b7b1c..cc98e14821180 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java @@ -155,19 +155,26 @@ protected Object simulateMethod(Method method, Object[] args) { @Override protected void initializeAdditionalMappings(MapperService mapperService) throws IOException { - if (shouldInjectSparseVectorIndexOptions()) { - addSparseVectorIndexOptionsMapping(mapperService); - return; - } - mapperService.merge( "_doc", - new 
CompressedXContent(Strings.toString(PutMappingRequest.simpleMapping(SPARSE_VECTOR_FIELD, "type=sparse_vector"))), + new CompressedXContent(getTestSparseVectorIndexMapping()), MapperService.MergeReason.MAPPING_UPDATE ); } - private boolean shouldInjectSparseVectorIndexOptions() { + private String getTestSparseVectorIndexMapping() { + if (currentTestHasIndexOptions()) { + return "{\"properties\":{\"" + + SPARSE_VECTOR_FIELD + + "\":{\"type\":\"sparse_vector\",\"index_options\"" + + ":{\"prune\":true,\"pruning_config\":{\"tokens_freq_ratio_threshold\"" + + ":12,\"tokens_weight_threshold\":0.6}}}}}"; + } + + return Strings.toString(PutMappingRequest.simpleMapping(SPARSE_VECTOR_FIELD, "type=sparse_vector")); + } + + private boolean currentTestHasIndexOptions() { Class clazz = this.getClass(); Class injectSparseVectorIndexOptions = InjectSparseVectorIndexOptions.class; @@ -179,15 +186,6 @@ private boolean shouldInjectSparseVectorIndexOptions() { } } - private void addSparseVectorIndexOptionsMapping(MapperService mapperService) throws IOException { - String addIndexOptionsTemplate = "{\"properties\":{\"" - + SPARSE_VECTOR_FIELD - + "\":{\"type\":\"sparse_vector\",\"index_options\"" - + ":{\"prune\":true,\"pruning_config\":{\"tokens_freq_ratio_threshold\"" - + ":12,\"tokens_weight_threshold\":0.6}}}}}"; - mapperService.merge("_doc", new CompressedXContent(addIndexOptionsTemplate), MapperService.MergeReason.MAPPING_UPDATE); - } - @Override protected void doAssertLuceneQuery(SparseVectorQueryBuilder queryBuilder, Query query, SearchExecutionContext context) { assertThat(query, instanceOf(SparseVectorQueryWrapper.class)); @@ -283,7 +281,7 @@ private void testDoToQuery(SparseVectorQueryBuilder queryBuilder, SearchExecutio assertTrue(query instanceof SparseVectorQueryWrapper); var sparseQuery = (SparseVectorQueryWrapper) query; - if (queryBuilder.shouldPruneTokens()) { + if (queryBuilder.shouldPruneTokens() || currentTestHasIndexOptions()) { // It's possible that all documents 
were pruned for aggressive pruning configurations assertTrue(sparseQuery.getTermsQuery() instanceof BooleanQuery || sparseQuery.getTermsQuery() instanceof MatchNoDocsQuery); } else { @@ -386,16 +384,9 @@ public void testThatWeCorrectlyRewriteQueryIntoVectors() { public void testItUsesIndexOptionsDefaults() throws IOException { withSearchIndex((context) -> { try { - SparseVectorQueryBuilder builder = new SparseVectorQueryBuilder( - SPARSE_VECTOR_FIELD, - WEIGHTED_TOKENS, - null, - null, - null, - null - ); - Query query = builder.doToQuery(context); - + SparseVectorQueryBuilder builder = createTestQueryBuilder(null); + assertFalse(builder.shouldPruneTokens()); + testDoToQuery(builder, context); } catch (IOException ex) { throw new RuntimeException(ex); } @@ -403,14 +394,23 @@ public void testItUsesIndexOptionsDefaults() throws IOException { } @InjectSparseVectorIndexOptions - public void testItOverridesIndexOptionsDefaults() { - + public void testItOverridesIndexOptionsDefaults() throws IOException { + withSearchIndex((context) -> { + try { + TokenPruningConfig pruningConfig = new TokenPruningConfig(2, 0.3f, false); + SparseVectorQueryBuilder builder = createTestQueryBuilder(pruningConfig); + assertTrue(builder.shouldPruneTokens()); + testDoToQuery(builder, context); + } catch (IOException ex) { + throw new RuntimeException(ex); + } + }); } @InjectSparseVectorIndexOptions - public void testToQueryWithIndexOptions() throws IOException { + public void testToQueryRewriteWithIndexOptions() throws IOException { withSearchIndex((context) -> { - SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(); + SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(null); try { if (queryBuilder.getQueryVectors() == null) { QueryBuilder rewrittenQueryBuilder = rewriteAndFetch(queryBuilder, context); @@ -424,15 +424,4 @@ public void testToQueryWithIndexOptions() throws IOException { } }); } - - @InjectSparseVectorIndexOptions - public void 
testWeCorrectlyRewriteQueryIntoVectorsWithIndexOptions() { - SearchExecutionContext searchExecutionContext = createSearchExecutionContext(); - - SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(null); - QueryBuilder rewrittenQueryBuilder = rewriteAndFetch(queryBuilder, searchExecutionContext); - assertTrue(rewrittenQueryBuilder instanceof SparseVectorQueryBuilder); - assertEquals(queryBuilder.shouldPruneTokens(), ((SparseVectorQueryBuilder) rewrittenQueryBuilder).shouldPruneTokens()); - assertNotNull(((SparseVectorQueryBuilder) rewrittenQueryBuilder).getQueryVectors()); - } } From cf7a302fe306be973ceae762cc2ce5970319d546 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Thu, 22 May 2025 17:16:44 +0000 Subject: [PATCH 53/86] [CI] Auto commit changes from spotless --- .../xpack/core/ml/search/SparseVectorQueryBuilder.java | 8 +++----- .../core/ml/search/SparseVectorQueryBuilderTests.java | 6 +----- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java index 3d1a0cee20ebc..b0f6873efa462 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java @@ -125,9 +125,8 @@ public SparseVectorQueryBuilder( public SparseVectorQueryBuilder(StreamInput in) throws IOException { super(in); this.fieldName = in.readString(); - if (in.getTransportVersion().isPatchFrom(SPARSE_VECTOR_FIELD_PRUNING_OPTIONS_8_19) || - in.getTransportVersion().onOrAfter(TransportVersions.SPARSE_VECTOR_FIELD_PRUNING_OPTIONS) - ) { + if (in.getTransportVersion().isPatchFrom(SPARSE_VECTOR_FIELD_PRUNING_OPTIONS_8_19) + || in.getTransportVersion().onOrAfter(TransportVersions.SPARSE_VECTOR_FIELD_PRUNING_OPTIONS)) { 
this.shouldPruneTokens = in.readOptionalBoolean(); } else { this.shouldPruneTokens = in.readBoolean(); @@ -181,8 +180,7 @@ protected void doWriteTo(StreamOutput out) throws IOException { out.writeString(fieldName); if (out.getTransportVersion().isPatchFrom(SPARSE_VECTOR_FIELD_PRUNING_OPTIONS_8_19) - || out.getTransportVersion().onOrAfter(TransportVersions.SPARSE_VECTOR_FIELD_PRUNING_OPTIONS) - ) { + || out.getTransportVersion().onOrAfter(TransportVersions.SPARSE_VECTOR_FIELD_PRUNING_OPTIONS)) { out.writeOptionalBoolean(shouldPruneTokens); } else { out.writeBoolean(shouldPruneTokens != null && shouldPruneTokens); diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java index cc98e14821180..b3dae7114706c 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java @@ -155,11 +155,7 @@ protected Object simulateMethod(Method method, Object[] args) { @Override protected void initializeAdditionalMappings(MapperService mapperService) throws IOException { - mapperService.merge( - "_doc", - new CompressedXContent(getTestSparseVectorIndexMapping()), - MapperService.MergeReason.MAPPING_UPDATE - ); + mapperService.merge("_doc", new CompressedXContent(getTestSparseVectorIndexMapping()), MapperService.MergeReason.MAPPING_UPDATE); } private String getTestSparseVectorIndexMapping() { From 7a12676c944064387e6f5094a54fc4c820baf824 Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Thu, 22 May 2025 15:55:05 -0400 Subject: [PATCH 54/86] fix lint --- .../vectors/SparseVectorFieldMapperTests.java | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index a6d50b5ada95e..e23c9f1ed664e 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -153,7 +153,9 @@ public void testWithIndexOptionsPruningConfigOnly() throws Exception { public void testDotInFieldName() throws Exception { DocumentMapper mapper = createDocumentMapper(fieldMapping(this::minimalMapping)); - ParsedDocument parsedDocument = mapper.parse(source(b -> b.field("field", Map.of("foo.bar", 10, "foobar", 20)))); + ParsedDocument parsedDocument = mapper.parse( + source(b -> b.field("field", Map.of("foo.bar", 10, "foobar", 20))) + ); List fields = parsedDocument.rootDoc().getFields("field"); assertEquals(2, fields.size()); @@ -194,7 +196,8 @@ public void testHandlesMultiValuedFields() throws MapperParsingException, IOExce // then fail appropriately assertEquals( - "[sparse_vector] fields take hashes that map a feature to a strictly positive float, but got unexpected token " + "START_ARRAY", + "[sparse_vector] fields take hashes that map a feature to a strictly positive float, " + + "but got unexpected token " + "START_ARRAY", e.getCause().getMessage() ); @@ -299,7 +302,8 @@ public void testTokensFreqRatioCorrect() { assertThat( eTestInteger.getMessage(), containsString( - "Failed to parse mapping: org.elasticsearch.xcontent.XContentParseException: [0:0] [pruning_config] failed to parse field [tokens_freq_ratio_threshold]" + "Failed to parse mapping: org.elasticsearch.xcontent.XContentParseException: " + + 
"[0:0] [pruning_config] failed to parse field [tokens_freq_ratio_threshold]" ) ); @@ -315,7 +319,8 @@ public void testTokensFreqRatioCorrect() { assertThat( eTestRangeLower.getMessage(), containsString( - "Failed to parse mapping: java.lang.IllegalArgumentException: [tokens_freq_ratio_threshold] must be between [1] and [100], got -2.0" + "Failed to parse mapping: java.lang.IllegalArgumentException: " + + "[tokens_freq_ratio_threshold] must be between [1] and [100], got -2.0" ) ); @@ -331,7 +336,8 @@ public void testTokensFreqRatioCorrect() { assertThat( eTestRangeHigher.getMessage(), containsString( - "Failed to parse mapping: java.lang.IllegalArgumentException: [tokens_freq_ratio_threshold] must be between [1] and [100], got 101" + "Failed to parse mapping: java.lang.IllegalArgumentException: " + + "[tokens_freq_ratio_threshold] must be between [1] and [100], got 101" ) ); } @@ -349,7 +355,8 @@ public void testTokensWeightThresholdCorrect() { assertThat( eTestDouble.getMessage(), containsString( - "Failed to parse mapping: org.elasticsearch.xcontent.XContentParseException: [0:0] [pruning_config] failed to parse field [tokens_weight_threshold]" + "Failed to parse mapping: org.elasticsearch.xcontent.XContentParseException: " + + "[0:0] [pruning_config] failed to parse field [tokens_weight_threshold]" ) ); From 28b5b74ac6e112ca702f1847bfdcf1e06ef97be0 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Thu, 22 May 2025 20:03:51 +0000 Subject: [PATCH 55/86] [CI] Auto commit changes from spotless --- .../vectors/SparseVectorFieldMapperTests.java | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index e23c9f1ed664e..f5fbd1aa2c1f9 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ 
b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -153,9 +153,7 @@ public void testWithIndexOptionsPruningConfigOnly() throws Exception { public void testDotInFieldName() throws Exception { DocumentMapper mapper = createDocumentMapper(fieldMapping(this::minimalMapping)); - ParsedDocument parsedDocument = mapper.parse( - source(b -> b.field("field", Map.of("foo.bar", 10, "foobar", 20))) - ); + ParsedDocument parsedDocument = mapper.parse(source(b -> b.field("field", Map.of("foo.bar", 10, "foobar", 20)))); List fields = parsedDocument.rootDoc().getFields("field"); assertEquals(2, fields.size()); @@ -197,7 +195,8 @@ public void testHandlesMultiValuedFields() throws MapperParsingException, IOExce // then fail appropriately assertEquals( "[sparse_vector] fields take hashes that map a feature to a strictly positive float, " - + "but got unexpected token " + "START_ARRAY", + + "but got unexpected token " + + "START_ARRAY", e.getCause().getMessage() ); @@ -303,7 +302,7 @@ public void testTokensFreqRatioCorrect() { eTestInteger.getMessage(), containsString( "Failed to parse mapping: org.elasticsearch.xcontent.XContentParseException: " - + "[0:0] [pruning_config] failed to parse field [tokens_freq_ratio_threshold]" + + "[0:0] [pruning_config] failed to parse field [tokens_freq_ratio_threshold]" ) ); @@ -320,7 +319,7 @@ public void testTokensFreqRatioCorrect() { eTestRangeLower.getMessage(), containsString( "Failed to parse mapping: java.lang.IllegalArgumentException: " - + "[tokens_freq_ratio_threshold] must be between [1] and [100], got -2.0" + + "[tokens_freq_ratio_threshold] must be between [1] and [100], got -2.0" ) ); @@ -337,7 +336,7 @@ public void testTokensFreqRatioCorrect() { eTestRangeHigher.getMessage(), containsString( "Failed to parse mapping: java.lang.IllegalArgumentException: " - + "[tokens_freq_ratio_threshold] must be between [1] and [100], got 101" + + "[tokens_freq_ratio_threshold] must be between [1] and 
[100], got 101" ) ); } @@ -356,7 +355,7 @@ public void testTokensWeightThresholdCorrect() { eTestDouble.getMessage(), containsString( "Failed to parse mapping: org.elasticsearch.xcontent.XContentParseException: " - + "[0:0] [pruning_config] failed to parse field [tokens_weight_threshold]" + + "[0:0] [pruning_config] failed to parse field [tokens_weight_threshold]" ) ); From 515cf9e122ac82873aad0cb8f01759e2ced004a3 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Sat, 24 May 2025 17:05:59 -0400 Subject: [PATCH 56/86] additional cleanups and refactoring --- .../xcontent/AbstractObjectParser.java | 9 -- .../vectors/SparseVectorFieldMapper.java | 101 ++++++++++++------ .../mapper/vectors/TokenPruningConfig.java | 17 ++- .../vectors/SparseVectorFieldMapperTests.java | 66 ++++-------- .../ml/search/SparseVectorQueryBuilder.java | 2 +- 5 files changed, 97 insertions(+), 98 deletions(-) diff --git a/libs/x-content/src/main/java/org/elasticsearch/xcontent/AbstractObjectParser.java b/libs/x-content/src/main/java/org/elasticsearch/xcontent/AbstractObjectParser.java index 1a6bf51aea896..244e1270fe530 100644 --- a/libs/x-content/src/main/java/org/elasticsearch/xcontent/AbstractObjectParser.java +++ b/libs/x-content/src/main/java/org/elasticsearch/xcontent/AbstractObjectParser.java @@ -287,15 +287,6 @@ public void declareBoolean(BiConsumer consumer, ParseField field declareField(consumer, XContentParser::booleanValue, field, ValueType.BOOLEAN); } - public void declareBooleanOrNull(BiConsumer consumer, boolean nullValue, ParseField field) { - declareField( - consumer, - p -> p.currentToken() == XContentParser.Token.VALUE_NULL ? 
nullValue : p.booleanValue(), - field, - ValueType.BOOLEAN_OR_NULL - ); - } - public void declareObjectArray(BiConsumer> consumer, ContextParser objectParser, ParseField field) { declareFieldArray(consumer, objectParser, field, ValueType.OBJECT_ARRAY); } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index b4578fa6a8cb1..ade74f5dbbbf7 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -34,7 +34,6 @@ import org.elasticsearch.index.mapper.FieldMapper; import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.MapperBuilderContext; -import org.elasticsearch.index.mapper.MapperParsingException; import org.elasticsearch.index.mapper.MappingParserContext; import org.elasticsearch.index.mapper.SourceLoader; import org.elasticsearch.index.mapper.SourceValueFetcher; @@ -43,9 +42,16 @@ import org.elasticsearch.index.query.SearchExecutionContext; import org.elasticsearch.search.fetch.StoredFieldsSpec; import org.elasticsearch.search.lookup.Source; +import org.elasticsearch.xcontent.ConstructingObjectParser; +import org.elasticsearch.xcontent.DeprecationHandler; +import org.elasticsearch.xcontent.NamedXContentRegistry; +import org.elasticsearch.xcontent.ParseField; import org.elasticsearch.xcontent.ToXContent; import org.elasticsearch.xcontent.XContentBuilder; +import org.elasticsearch.xcontent.XContentParser; import org.elasticsearch.xcontent.XContentParser.Token; +import org.elasticsearch.xcontent.XContentType; +import org.elasticsearch.xcontent.support.MapXContentParser; import java.io.IOException; import java.io.UncheckedIOException; @@ -56,6 +62,7 @@ import java.util.stream.Stream; import static 
org.elasticsearch.index.query.AbstractQueryBuilder.DEFAULT_BOOST; +import static org.elasticsearch.xcontent.ConstructingObjectParser.optionalConstructorArg; /** * A {@link FieldMapper} that exposes Lucene's {@link FeatureField} as a sparse @@ -85,23 +92,22 @@ private static SparseVectorFieldMapper toType(FieldMapper in) { public static class Builder extends FieldMapper.Builder { private final Parameter stored = Parameter.storeParam(m -> toType(m).fieldType().isStored(), false); private final Parameter> meta = Parameter.metaParam(); - private final Parameter indexOptions; + private final Parameter indexOptions = new Parameter<>( + SPARSE_VECTOR_INDEX_OPTIONS, + true, + () -> null, + (n, c, o) -> parseIndexOptions(c, o), + m -> toType(m).fieldType().indexOptions, + (b, n, v) -> { + if (v != null) { + b.field(n, v); + } + }, + Objects::toString + );; public Builder(String name) { super(name); - this.indexOptions = new Parameter<>( - SPARSE_VECTOR_INDEX_OPTIONS, - true, - () -> null, - (n, c, o) -> parseIndexOptions(c, o), - m -> toType(m).fieldType().indexOptions, - (b, n, v) -> { - if (v != null) { - b.field(n, v); - } - }, - Objects::toString - ); } public Builder setStored(boolean value) { @@ -128,6 +134,16 @@ public IndexOptions getIndexOptions() { return fieldType().getIndexOptions(); } + private static final ConstructingObjectParser INDEX_OPTIONS_PARSER = new ConstructingObjectParser<>( + SPARSE_VECTOR_INDEX_OPTIONS, + args -> new IndexOptions((Boolean) args[0], (TokenPruningConfig) args[1]) + ); + + static { + INDEX_OPTIONS_PARSER.declareBoolean(optionalConstructorArg(), IndexOptions.PRUNE_FIELD_NAME); + INDEX_OPTIONS_PARSER.declareObject(optionalConstructorArg(), TokenPruningConfig.PARSER, IndexOptions.PRUNING_CONFIG_FIELD_NAME); + } + private static SparseVectorFieldMapper.IndexOptions parseIndexOptions(MappingParserContext context, Object propNode) { if (propNode == null) { return null; @@ -135,14 +151,18 @@ private static 
SparseVectorFieldMapper.IndexOptions parseIndexOptions(MappingPar Map indexOptionsMap = XContentMapValues.nodeMapValue(propNode, SPARSE_VECTOR_INDEX_OPTIONS); - Boolean prune = IndexOptions.parseIndexOptionsPruneValue(indexOptionsMap); - TokenPruningConfig pruningConfig = IndexOptions.parseIndexOptionsPruningConfig(prune, indexOptionsMap); + XContentParser parser = new MapXContentParser( + NamedXContentRegistry.EMPTY, + DeprecationHandler.IGNORE_DEPRECATIONS, + indexOptionsMap, + XContentType.JSON + ); - if (prune == null && pruningConfig == null) { - return null; + try { + return INDEX_OPTIONS_PARSER.parse(parser, null); + } catch (IOException e) { + throw new UncheckedIOException(e); } - - return new SparseVectorFieldMapper.IndexOptions(prune, pruningConfig); } public static final TypeParser PARSER = new TypeParser((n, c) -> { @@ -427,13 +447,25 @@ public void reset() { } public static class IndexOptions implements ToXContent { - public static final String PRUNE_FIELD_NAME = "prune"; - public static final String PRUNING_CONFIG_FIELD_NAME = "pruning_config"; + public static final ParseField PRUNE_FIELD_NAME = new ParseField("prune"); + public static final ParseField PRUNING_CONFIG_FIELD_NAME = new ParseField("pruning_config"); final Boolean prune; final TokenPruningConfig pruningConfig; IndexOptions(@Nullable Boolean prune, @Nullable TokenPruningConfig pruningConfig) { + if (pruningConfig != null && (prune == null || prune == false)) { + throw new IllegalArgumentException( + "[" + + SPARSE_VECTOR_INDEX_OPTIONS + + "] field [" + + PRUNING_CONFIG_FIELD_NAME.getPreferredName() + + "] should only be set if [" + + PRUNE_FIELD_NAME.getPreferredName() + + "] is set to true" + ); + } + this.prune = prune; this.pruningConfig = pruningConfig; } @@ -470,18 +502,19 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws builder.startObject(); if (prune != null) { - builder.field(PRUNE_FIELD_NAME, prune); + 
builder.field(PRUNE_FIELD_NAME.getPreferredName(), prune); } if (pruningConfig != null) { - builder.field(PRUNING_CONFIG_FIELD_NAME, pruningConfig); + builder.field(PRUNING_CONFIG_FIELD_NAME.getPreferredName(), pruningConfig); } builder.endObject(); return builder; } + /* public static Boolean parseIndexOptionsPruneValue(Map indexOptionsMap) { - Object shouldPrune = indexOptionsMap.get(IndexOptions.PRUNE_FIELD_NAME); + Object shouldPrune = indexOptionsMap.get(IndexOptions.PRUNE_FIELD_NAME.getPreferredName()); if (shouldPrune == null) { return null; } @@ -491,12 +524,14 @@ public static Boolean parseIndexOptionsPruneValue(Map indexOptio } throw new MapperParsingException( - "[" + SPARSE_VECTOR_INDEX_OPTIONS + "] field [" + PRUNE_FIELD_NAME + "] should be true or false" + "[" + SPARSE_VECTOR_INDEX_OPTIONS + "] field [" + PRUNE_FIELD_NAME.getPreferredName() + "] should be true or false" ); } + */ + /* public static TokenPruningConfig parseIndexOptionsPruningConfig(Boolean prune, Map indexOptionsMap) { - Object pruningConfiguration = indexOptionsMap.get(IndexOptions.PRUNING_CONFIG_FIELD_NAME); + Object pruningConfiguration = indexOptionsMap.get(IndexOptions.PRUNING_CONFIG_FIELD_NAME.getPreferredName()); if (pruningConfiguration == null) { return null; } @@ -506,16 +541,20 @@ public static TokenPruningConfig parseIndexOptionsPruningConfig(Boolean prune, M "[" + SPARSE_VECTOR_INDEX_OPTIONS + "] field [" - + PRUNING_CONFIG_FIELD_NAME + + PRUNING_CONFIG_FIELD_NAME.getPreferredName() + "] should only be set if [" - + PRUNE_FIELD_NAME + + PRUNE_FIELD_NAME.getPreferredName() + "] is set to true" ); } - Map pruningConfigurationMap = XContentMapValues.nodeMapValue(pruningConfiguration, PRUNING_CONFIG_FIELD_NAME); + Map pruningConfigurationMap = XContentMapValues.nodeMapValue( + pruningConfiguration, + PRUNING_CONFIG_FIELD_NAME.getPreferredName() + ); return TokenPruningConfig.parseFromMap(pruningConfigurationMap); } + */ } } diff --git 
a/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java index c797a0e86395d..58b32d9c08d7c 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/TokenPruningConfig.java @@ -9,7 +9,6 @@ package org.elasticsearch.index.mapper.vectors; -import org.elasticsearch.ElasticsearchException; import org.elasticsearch.common.ParsingException; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; @@ -26,6 +25,7 @@ import org.elasticsearch.xcontent.support.MapXContentParser; import java.io.IOException; +import java.io.UncheckedIOException; import java.util.Locale; import java.util.Map; import java.util.Objects; @@ -191,7 +191,7 @@ public static TokenPruningConfig fromXContent(XContentParser parser) throws IOEx return new TokenPruningConfig(ratioThreshold, weightThreshold, onlyScorePrunedTokens); } - private static final ConstructingObjectParser PARSER = new ConstructingObjectParser<>( + public static final ConstructingObjectParser PARSER = new ConstructingObjectParser<>( PRUNING_CONFIG_FIELD, args -> new TokenPruningConfig( args[0] == null ? 
DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD : (Float) args[0], @@ -201,9 +201,9 @@ public static TokenPruningConfig fromXContent(XContentParser parser) throws IOEx ); static { - PARSER.declareFloatOrNull(optionalConstructorArg(), DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD, TOKENS_FREQ_RATIO_THRESHOLD); - PARSER.declareFloatOrNull(optionalConstructorArg(), DEFAULT_TOKENS_WEIGHT_THRESHOLD, TOKENS_WEIGHT_THRESHOLD); - PARSER.declareBooleanOrNull(optionalConstructorArg(), false, ONLY_SCORE_PRUNED_TOKENS_FIELD); + PARSER.declareFloat(optionalConstructorArg(), TOKENS_FREQ_RATIO_THRESHOLD); + PARSER.declareFloat(optionalConstructorArg(), TOKENS_WEIGHT_THRESHOLD); + PARSER.declareBoolean(optionalConstructorArg(), ONLY_SCORE_PRUNED_TOKENS_FIELD); } public static TokenPruningConfig parseFromMap(Map pruningConfigMap) { @@ -220,11 +220,8 @@ public static TokenPruningConfig parseFromMap(Map pruningConfigM ); return PARSER.parse(parser, null); - } catch (Exception exc) { - if (exc.getCause() != null && exc.getCause().getClass().equals(IllegalArgumentException.class)) { - throw new ElasticsearchException(exc.getCause()); - } - throw new ElasticsearchException(exc); + } catch (IOException ioEx) { + throw new UncheckedIOException(ioEx); } } } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index f5fbd1aa2c1f9..2b04229701c4f 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -39,9 +39,12 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Random; +import static java.lang.Math.random; import static org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper.NEW_SPARSE_VECTOR_INDEX_VERSION; import static 
org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper.PREVIOUS_SPARSE_VECTOR_INDEX_VERSION; +import static org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION; import static org.elasticsearch.xcontent.XContentFactory.jsonBuilder; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; @@ -131,7 +134,12 @@ public void testDefaults() throws Exception { } public void testDefaultsPreIndexOptions() throws Exception { - DocumentMapper mapper = getDocumentMapperPreviousVersion(fieldMapping(this::minimalMapping)); + IndexVersion indexVersion = IndexVersionUtils.randomVersionBetween( + new Random(), + NEW_SPARSE_VECTOR_INDEX_VERSION, + IndexVersionUtils.getPreviousVersion(SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION) + ); + DocumentMapper mapper = createDocumentMapper(indexVersion, fieldMapping(this::minimalMapping)); assertEquals(Strings.toString(fieldMapping(this::minimalMapping)), mapper.mappingSource().toString()); checkParsedDocument(mapper); @@ -242,7 +250,7 @@ public void testPruneMustBeBoolean() { b.field("prune", "othervalue"); b.endObject(); }))); - assertThat(e.getMessage(), containsString("[index_options] field [prune] should be true or false")); + assertThat(e.getMessage(), containsString("[index_options] failed to parse field [prune]")); } public void testPruningConfigurationIsMap() { @@ -253,7 +261,7 @@ public void testPruningConfigurationIsMap() { b.field("pruning_config", "this_is_not_a_map"); b.endObject(); }))); - assertThat(e.getMessage(), containsString("pruning_config should be a map but was of type:")); + assertThat(e.getMessage(), containsString("[index_options] pruning_config doesn't support values of type:")); } public void testWithIndexOptionsPruningConfigPruneRequired() throws Exception { @@ -268,10 +276,7 @@ public void testWithIndexOptionsPruningConfigPruneRequired() throws Exception { b.endObject(); b.endObject(); }))); - assertThat( - 
eTestPruneIsFalse.getMessage(), - containsString("[index_options] field [pruning_config] should only be set if [prune] is set to true") - ); + assertThat(eTestPruneIsFalse.getMessage(), containsString("[index_options] failed to parse field [pruning_config]")); Exception eTestPruneIsMissing = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { b.field("type", "sparse_vector"); @@ -284,7 +289,7 @@ public void testWithIndexOptionsPruningConfigPruneRequired() throws Exception { }))); assertThat( eTestPruneIsMissing.getMessage(), - containsString("[index_options] field [pruning_config] should only be set if [prune] is set to true") + containsString("Failed to parse mapping: Failed to build [index_options] after last required field arrived") ); } @@ -300,10 +305,7 @@ public void testTokensFreqRatioCorrect() { }))); assertThat( eTestInteger.getMessage(), - containsString( - "Failed to parse mapping: org.elasticsearch.xcontent.XContentParseException: " - + "[0:0] [pruning_config] failed to parse field [tokens_freq_ratio_threshold]" - ) + containsString("Failed to parse mapping: [0:0] [index_options] failed to parse field [pruning_config]") ); Exception eTestRangeLower = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { @@ -315,13 +317,7 @@ public void testTokensFreqRatioCorrect() { b.endObject(); b.endObject(); }))); - assertThat( - eTestRangeLower.getMessage(), - containsString( - "Failed to parse mapping: java.lang.IllegalArgumentException: " - + "[tokens_freq_ratio_threshold] must be between [1] and [100], got -2.0" - ) - ); + assertThat(eTestRangeLower.getMessage(), containsString("[index_options] failed to parse field [pruning_config]")); Exception eTestRangeHigher = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { b.field("type", "sparse_vector"); @@ -332,13 +328,7 @@ public void testTokensFreqRatioCorrect() { b.endObject(); b.endObject(); }))); - 
assertThat( - eTestRangeHigher.getMessage(), - containsString( - "Failed to parse mapping: java.lang.IllegalArgumentException: " - + "[tokens_freq_ratio_threshold] must be between [1] and [100], got 101" - ) - ); + assertThat(eTestRangeHigher.getMessage(), containsString("[index_options] failed to parse field [pruning_config]")); } public void testTokensWeightThresholdCorrect() { @@ -351,13 +341,7 @@ public void testTokensWeightThresholdCorrect() { b.endObject(); b.endObject(); }))); - assertThat( - eTestDouble.getMessage(), - containsString( - "Failed to parse mapping: org.elasticsearch.xcontent.XContentParseException: " - + "[0:0] [pruning_config] failed to parse field [tokens_weight_threshold]" - ) - ); + assertThat(eTestDouble.getMessage(), containsString("[index_options] failed to parse field [pruning_config]")); Exception eTestRangeLower = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { b.field("type", "sparse_vector"); @@ -368,10 +352,7 @@ public void testTokensWeightThresholdCorrect() { b.endObject(); b.endObject(); }))); - assertThat( - eTestRangeLower.getMessage(), - containsString("Failed to parse mapping: java.lang.IllegalArgumentException: [tokens_weight_threshold] must be between 0 and 1") - ); + assertThat(eTestRangeLower.getMessage(), containsString("[index_options] failed to parse field [pruning_config]")); Exception eTestRangeHigher = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { b.field("type", "sparse_vector"); @@ -382,10 +363,7 @@ public void testTokensWeightThresholdCorrect() { b.endObject(); b.endObject(); }))); - assertThat( - eTestRangeHigher.getMessage(), - containsString("Failed to parse mapping: java.lang.IllegalArgumentException: [tokens_weight_threshold] must be between 0 and 1") - ); + assertThat(eTestRangeHigher.getMessage(), containsString("[index_options] failed to parse field [pruning_config]")); } public void testStoreIsNotUpdateable() throws 
IOException { @@ -542,10 +520,4 @@ private void checkParsedDocument(DocumentMapper mapper) throws IOException { int freq2 = getFrequency(featureField2.tokenStream(null, null)); assertTrue(freq1 < freq2); } - - private final IndexVersion PRE_SPARSE_VECTOR_INDEX_OPTIONS_VERSION = IndexVersions.DEFAULT_TO_ACORN_HNSW_FILTER_HEURISTIC; - - private DocumentMapper getDocumentMapperPreviousVersion(XContentBuilder mappings) throws IOException { - return createMapperService(PRE_SPARSE_VECTOR_INDEX_OPTIONS_VERSION, mappings).documentMapper(); - } } diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java index b0f6873efa462..e84d37d1ca73f 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java @@ -183,7 +183,7 @@ protected void doWriteTo(StreamOutput out) throws IOException { || out.getTransportVersion().onOrAfter(TransportVersions.SPARSE_VECTOR_FIELD_PRUNING_OPTIONS)) { out.writeOptionalBoolean(shouldPruneTokens); } else { - out.writeBoolean(shouldPruneTokens != null && shouldPruneTokens); + out.writeBoolean(shouldPruneTokens != null ? shouldPruneTokens : DEFAULT_PRUNE); } out.writeOptionalCollection(queryVectors); out.writeOptionalString(inferenceId); From 432031561d23c34b8868b1f6db6da86c034c5cd4 Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Sat, 24 May 2025 18:32:36 -0400 Subject: [PATCH 57/86] fix tests --- .../index/mapper/vectors/SparseVectorFieldMapperTests.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index 2b04229701c4f..707e5469f3d0b 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -17,6 +17,7 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.RandomIndexWriter; +import org.elasticsearch.common.Randomness; import org.elasticsearch.common.Strings; import org.elasticsearch.common.compress.CompressedXContent; import org.elasticsearch.index.IndexVersion; @@ -135,7 +136,7 @@ public void testDefaults() throws Exception { public void testDefaultsPreIndexOptions() throws Exception { IndexVersion indexVersion = IndexVersionUtils.randomVersionBetween( - new Random(), + Randomness.get(), NEW_SPARSE_VECTOR_INDEX_VERSION, IndexVersionUtils.getPreviousVersion(SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION) ); From 1cd6f5ee1b9640f60389df861ed4073c59a44b35 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Sat, 24 May 2025 22:40:28 +0000 Subject: [PATCH 58/86] [CI] Auto commit changes from spotless --- .../index/mapper/vectors/SparseVectorFieldMapperTests.java | 1 - 1 file changed, 1 deletion(-) diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index 707e5469f3d0b..5ed08cdb8f8b1 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ 
b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -40,7 +40,6 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; -import java.util.Random; import static java.lang.Math.random; import static org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper.NEW_SPARSE_VECTOR_INDEX_VERSION; From c5b9defd0458fae67f30b422d09f7dec5023d0e3 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Mon, 26 May 2025 16:55:22 -0400 Subject: [PATCH 59/86] refactor and cleanups --- .../vectors/SparseVectorFieldMapper.java | 12 ++-- .../ml/search/SparseVectorQueryBuilder.java | 66 ++++++++++++++----- 2 files changed, 54 insertions(+), 24 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index ade74f5dbbbf7..9ebd13c38018e 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -80,8 +80,8 @@ public class SparseVectorFieldMapper extends FieldMapper { static final IndexVersion NEW_SPARSE_VECTOR_INDEX_VERSION = IndexVersions.NEW_SPARSE_VECTOR; static final IndexVersion SPARSE_VECTOR_IN_FIELD_NAMES_INDEX_VERSION = IndexVersions.SPARSE_VECTOR_IN_FIELD_NAMES_SUPPORT; - public static final IndexVersion SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION = - IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT; + + static final IndexVersion SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION = IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT; public static final NodeFeature SPARSE_VECTOR_INDEX_OPTIONS_FEATURE = new NodeFeature("sparse_vector.index_options_supported"); @@ -98,13 +98,9 @@ public static class Builder extends FieldMapper.Builder { () -> null, (n, c, o) -> parseIndexOptions(c, o), m -> 
toType(m).fieldType().indexOptions, - (b, n, v) -> { - if (v != null) { - b.field(n, v); - } - }, + XContentBuilder::field, Objects::toString - );; + ).acceptsNull(); public Builder(String name) { super(name); diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java index e84d37d1ca73f..e5750fd5de355 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java @@ -17,6 +17,7 @@ import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.core.Nullable; +import org.elasticsearch.index.IndexVersions; import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper; import org.elasticsearch.index.mapper.vectors.TokenPruningConfig; @@ -235,23 +236,10 @@ protected Query doToQuery(SearchExecutionContext context) throws IOException { ); } - Boolean pruneTokensToUse = shouldPruneTokens; - TokenPruningConfig pruningConfigToUse = tokenPruningConfig; + TokenPruningConfig pruningConfig = getTokenPruningConfigForQuery(ft, context); - // if the query options for pruning are not set, we need to check the index options for this field - // and use those if set - however, only if the index was created after we added this support. 
- if (ft.getClass().equals(SparseVectorFieldMapper.SparseVectorFieldType.class) - && context.indexVersionCreated().onOrAfter(SparseVectorFieldMapper.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION)) { - SparseVectorFieldMapper.SparseVectorFieldType asSVFieldType = (SparseVectorFieldMapper.SparseVectorFieldType) ft; - - if (asSVFieldType.getIndexOptions() != null) { - pruneTokensToUse = pruneTokensToUse == null ? asSVFieldType.getIndexOptions().getPrune() : pruneTokensToUse; - pruningConfigToUse = pruningConfigToUse == null ? asSVFieldType.getIndexOptions().getPruningConfig() : pruningConfigToUse; - } - } - - return (pruneTokensToUse != null && pruneTokensToUse) - ? WeightedTokensUtils.queryBuilderWithPrunedTokens(fieldName, pruningConfigToUse, queryVectors, ft, context) + return pruningConfig != null + ? WeightedTokensUtils.queryBuilderWithPrunedTokens(fieldName, pruningConfig, queryVectors, ft, context) : WeightedTokensUtils.queryBuilderWithAllTokens(fieldName, queryVectors, ft, context); } @@ -362,6 +350,52 @@ public TransportVersion getMinimalSupportedVersion() { return TransportVersions.V_8_15_0; } + private TokenPruningConfig getTokenPruningConfigForQuery(MappedFieldType ft, SearchExecutionContext context) { + TokenPruningConfig queryPruningConfig = (shouldPruneTokens != null && shouldPruneTokens) ? 
tokenPruningConfig : null; + + // query values should always override any index options + if (shouldPruneTokens != null) { + return queryPruningConfig; + } + + // if we are not on a supported index version, do not prune by default + // nor do we check the index options + if (context.indexVersionCreated().onOrAfter(IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT) == false + && context.indexVersionCreated() + .between( + IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT_BACKPORT_8_X, + IndexVersions.UPGRADE_TO_LUCENE_10_0_0 + ) == false) { + return null; + } + + if (ft instanceof SparseVectorFieldMapper.SparseVectorFieldType asSVFieldType) { + Boolean pruneTokensToUse = shouldPruneTokens; + TokenPruningConfig pruningConfigToUse = tokenPruningConfig; + + if (asSVFieldType.getIndexOptions() != null) { + pruneTokensToUse = asSVFieldType.getIndexOptions().getPrune(); + pruningConfigToUse = pruningConfigToUse == null ? asSVFieldType.getIndexOptions().getPruningConfig() : pruningConfigToUse; + } + + // if we're still null, use defaults + pruneTokensToUse = pruneTokensToUse == null ? true : pruneTokensToUse; + pruningConfigToUse = pruningConfigToUse == null + ? new TokenPruningConfig( + TokenPruningConfig.DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD, + TokenPruningConfig.DEFAULT_TOKENS_WEIGHT_THRESHOLD, + false + ) + : pruningConfigToUse; + + return pruneTokensToUse ? pruningConfigToUse : null; + } + + // should never happen that we're not operating on a SparseVectorFieldType + // but just in case, return null and do not prune + return null; + } + private static final ConstructingObjectParser PARSER = new ConstructingObjectParser<>(NAME, a -> { String fieldName = (String) a[0]; @SuppressWarnings("unchecked") From fd07f1cf457fde5c0f97d019c56310fd3b23e77d Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Mon, 26 May 2025 18:09:06 -0400 Subject: [PATCH 60/86] additional cleanups for clarity --- .../ml/search/SparseVectorQueryBuilder.java | 50 ++++++++----------- 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java index e5750fd5de355..b9164f574eb05 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java @@ -351,13 +351,6 @@ public TransportVersion getMinimalSupportedVersion() { } private TokenPruningConfig getTokenPruningConfigForQuery(MappedFieldType ft, SearchExecutionContext context) { - TokenPruningConfig queryPruningConfig = (shouldPruneTokens != null && shouldPruneTokens) ? tokenPruningConfig : null; - - // query values should always override any index options - if (shouldPruneTokens != null) { - return queryPruningConfig; - } - // if we are not on a supported index version, do not prune by default // nor do we check the index options if (context.indexVersionCreated().onOrAfter(IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT) == false @@ -366,34 +359,33 @@ private TokenPruningConfig getTokenPruningConfigForQuery(MappedFieldType ft, Sea IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT_BACKPORT_8_X, IndexVersions.UPGRADE_TO_LUCENE_10_0_0 ) == false) { - return null; + return (shouldPruneTokens != null && shouldPruneTokens) ? 
tokenPruningConfig : null; } - if (ft instanceof SparseVectorFieldMapper.SparseVectorFieldType asSVFieldType) { - Boolean pruneTokensToUse = shouldPruneTokens; - TokenPruningConfig pruningConfigToUse = tokenPruningConfig; + Boolean shouldQueryPruneTokens = shouldPruneTokens; + TokenPruningConfig pruningConfigToUse = tokenPruningConfig; - if (asSVFieldType.getIndexOptions() != null) { - pruneTokensToUse = asSVFieldType.getIndexOptions().getPrune(); - pruningConfigToUse = pruningConfigToUse == null ? asSVFieldType.getIndexOptions().getPruningConfig() : pruningConfigToUse; - } + if (ft instanceof SparseVectorFieldMapper.SparseVectorFieldType asSVFieldType + && asSVFieldType.getIndexOptions() != null + ) { + shouldQueryPruneTokens = shouldQueryPruneTokens == null ? asSVFieldType.getIndexOptions().getPrune() : shouldQueryPruneTokens; + pruningConfigToUse = pruningConfigToUse == null ? asSVFieldType.getIndexOptions().getPruningConfig() : pruningConfigToUse; + } - // if we're still null, use defaults - pruneTokensToUse = pruneTokensToUse == null ? true : pruneTokensToUse; - pruningConfigToUse = pruningConfigToUse == null - ? new TokenPruningConfig( - TokenPruningConfig.DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD, - TokenPruningConfig.DEFAULT_TOKENS_WEIGHT_THRESHOLD, - false - ) - : pruningConfigToUse; - - return pruneTokensToUse ? pruningConfigToUse : null; + if (shouldQueryPruneTokens != null && shouldQueryPruneTokens == false) { + return null; } - // should never happen that we're not operating on a SparseVectorFieldType - // but just in case, return null and do not prune - return null; + // if we're here, we should prune if set or by default + // if we don't have a pruning config, use the default + pruningConfigToUse = pruningConfigToUse == null + ? 
new TokenPruningConfig( + TokenPruningConfig.DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD, + TokenPruningConfig.DEFAULT_TOKENS_WEIGHT_THRESHOLD, + false + ) : pruningConfigToUse; + + return pruningConfigToUse; } private static final ConstructingObjectParser PARSER = new ConstructingObjectParser<>(NAME, a -> { From 42bc77d4d2b0e25fa06990dc06d19620acbfb925 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Mon, 26 May 2025 22:16:03 +0000 Subject: [PATCH 61/86] [CI] Auto commit changes from spotless --- .../xpack/core/ml/search/SparseVectorQueryBuilder.java | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java index b9164f574eb05..0fe70896f2997 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java @@ -365,9 +365,7 @@ private TokenPruningConfig getTokenPruningConfigForQuery(MappedFieldType ft, Sea Boolean shouldQueryPruneTokens = shouldPruneTokens; TokenPruningConfig pruningConfigToUse = tokenPruningConfig; - if (ft instanceof SparseVectorFieldMapper.SparseVectorFieldType asSVFieldType - && asSVFieldType.getIndexOptions() != null - ) { + if (ft instanceof SparseVectorFieldMapper.SparseVectorFieldType asSVFieldType && asSVFieldType.getIndexOptions() != null) { shouldQueryPruneTokens = shouldQueryPruneTokens == null ? asSVFieldType.getIndexOptions().getPrune() : shouldQueryPruneTokens; pruningConfigToUse = pruningConfigToUse == null ? 
asSVFieldType.getIndexOptions().getPruningConfig() : pruningConfigToUse; } @@ -383,7 +381,8 @@ private TokenPruningConfig getTokenPruningConfigForQuery(MappedFieldType ft, Sea TokenPruningConfig.DEFAULT_TOKENS_FREQ_RATIO_THRESHOLD, TokenPruningConfig.DEFAULT_TOKENS_WEIGHT_THRESHOLD, false - ) : pruningConfigToUse; + ) + : pruningConfigToUse; return pruningConfigToUse; } From 8f6672ff4086ffe8871887d0a305c1e93d26b158 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Tue, 27 May 2025 08:27:29 -0400 Subject: [PATCH 62/86] YAML test cleanups --- .../test/multi_cluster/50_sparse_vector.yml | 5 ----- .../test/remote_cluster/50_sparse_vector.yml | 6 ------ .../rest-api-spec/test/ml/sparse_vector_search.yml | 13 ------------- 3 files changed, 24 deletions(-) diff --git a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml index cf30014216ffc..968c0896c0345 100644 --- a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml +++ b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml @@ -226,8 +226,6 @@ teardown: pruning_config: tokens_freq_ratio_threshold: 1.0 tokens_weight_threshold: 0.4 - embeddings: - type: sparse_vector - match: { acknowledged: true } @@ -241,7 +239,6 @@ teardown: - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.prune: true } - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_freq_ratio_threshold: 1.0 } - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_weight_threshold: 0.4 } - - not_exists: 
sparse_vector_pruning_test.mappings.properties.embeddings.index_options --- "Check sparse_vector token pruning index_options mappings defaults": @@ -265,8 +262,6 @@ teardown: type: text ml.tokens: type: sparse_vector - embeddings: - type: sparse_vector - match: { acknowledged: true } diff --git a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml index b8fe638fdd0a7..da47c2e291ac8 100644 --- a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml +++ b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml @@ -225,8 +225,6 @@ teardown: pruning_config: tokens_freq_ratio_threshold: 1.0 tokens_weight_threshold: 0.4 - embeddings: - type: sparse_vector - match: { acknowledged: true } @@ -240,7 +238,6 @@ teardown: - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.prune: true } - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_freq_ratio_threshold: 1.0 } - match: { sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options.pruning_config.tokens_weight_threshold: 0.4 } - - not_exists: sparse_vector_pruning_test.mappings.properties.embeddings.index_options --- "Check sparse_vector token pruning index_options mappings defaults": @@ -264,8 +261,6 @@ teardown: type: text ml.tokens: type: sparse_vector - embeddings: - type: sparse_vector - match: { acknowledged: true } @@ -277,7 +272,6 @@ teardown: index: sparse_vector_pruning_test - not_exists: sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options - - not_exists: 
sparse_vector_pruning_test.mappings.properties.embeddings.index_options --- "Check sparse_vector token pruning index_options prune missing do not allow config": diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml index 1afede3bf247f..365f3692312bf 100644 --- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml +++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml @@ -556,8 +556,6 @@ teardown: pruning_config: tokens_freq_ratio_threshold: 1.0 tokens_weight_threshold: 0.4 - embeddings: - type: sparse_vector - match: { acknowledged: true } @@ -594,8 +592,6 @@ teardown: type: text ml.tokens: type: sparse_vector - embeddings: - type: sparse_vector - match: { acknowledged: true } @@ -607,7 +603,6 @@ teardown: index: sparse_vector_pruning_test - not_exists: sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options - - not_exists: sparse_vector_pruning_test.mappings.properties.embeddings.index_options --- "Check sparse_vector token pruning index_options prune missing do not allow config": @@ -636,8 +631,6 @@ teardown: pruning_config: tokens_freq_ratio_threshold: 1.0 tokens_weight_threshold: 0.4 - embeddings: - type: sparse_vector - match: { status: 400 } @@ -669,8 +662,6 @@ teardown: pruning_config: tokens_freq_ratio_threshold: 1.0 tokens_weight_threshold: 0.4 - embeddings: - type: sparse_vector - match: { status: 400 } @@ -702,8 +693,6 @@ teardown: pruning_config: tokens_freq_ratio_threshold: 101.0 tokens_weight_threshold: 0.4 - embeddings: - type: sparse_vector - match: { status: 400 } @@ -735,8 +724,6 @@ teardown: pruning_config: tokens_freq_ratio_threshold: 5.0 tokens_weight_threshold: 3.5 - embeddings: - type: sparse_vector - match: { status: 400 } From f499269d58d372357099294bdb33ff34bd2071e3 Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Tue, 27 May 2025 10:31:07 -0400 Subject: [PATCH 63/86] fix semantic highlighter test --- .../xpack/inference/highlight/SemanticTextHighlighterTests.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java index b808dce4f9750..a6845a9756bf7 100644 --- a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java +++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java @@ -138,7 +138,7 @@ public void testSparseVector() throws Exception { tokens, null, null, - null, + false, null ); NestedQueryBuilder nestedQueryBuilder = new NestedQueryBuilder(fieldType.getChunksField().fullPath(), sparseQuery, ScoreMode.Max); From 5e9fee18f5955f3e3ba2954563cb3c12a65c9121 Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Tue, 27 May 2025 13:58:00 -0400 Subject: [PATCH 64/86] fix SemanticTextHighlighter tests --- .../xpack/inference/highlight/SemanticTextHighlighterTests.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java index a6845a9756bf7..28b6e81ffab1d 100644 --- a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java +++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java @@ -232,7 +232,7 @@ private void assertHighlightOneDoc( assertThat(topDocs.totalHits.value(), equalTo(1L)); int docID = topDocs.scoreDocs[0].doc; SemanticTextHighlighter highlighter = new SemanticTextHighlighter(); - var execContext = createSearchExecutionContext(mapperService); + var execContext = createSearchExecutionContext(mapperService, searcher); var luceneQuery = execContext.toQuery(request.source().query()).query(); FetchContext fetchContext = mock(FetchContext.class); Mockito.when(fetchContext.highlight()).thenReturn(new SearchHighlightContext(Collections.emptyList())); From 62099190c01e1c5c3209a472ff53a807c12aa47b Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Tue, 27 May 2025 19:34:53 -0400 Subject: [PATCH 65/86] (correctly) fix SemanticTextHighlighterTests tests --- .../highlight/SemanticTextHighlighterTests.java | 11 ++++++++--- .../xpack/inference/highlight/queries.json | 16 +++++++++++++++- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java index 28b6e81ffab1d..0b238ff990efe 100644 --- a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java +++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java @@ -133,19 +133,23 @@ public void testSparseVector() throws Exception { Map queryMap = (Map) queries.get("sparse_vector_1"); List tokens = readSparseVector(queryMap.get("embeddings")); var fieldType = (SemanticTextFieldMapper.SemanticTextFieldType) mapperService.mappingLookup().getFieldType(SEMANTIC_FIELD_ELSER); + + Boolean doNotPruneTokens = randomBoolean() ? false : null; + SparseVectorQueryBuilder sparseQuery = new SparseVectorQueryBuilder( fieldType.getEmbeddingsField().fullPath(), tokens, null, null, - false, + doNotPruneTokens, null ); NestedQueryBuilder nestedQueryBuilder = new NestedQueryBuilder(fieldType.getChunksField().fullPath(), sparseQuery, ScoreMode.Max); var shardRequest = createShardSearchRequest(nestedQueryBuilder); var sourceToParse = new SourceToParse("0", readSampleDoc(useLegacyFormat), XContentType.JSON); - String[] expectedScorePassages = ((List) queryMap.get("expected_by_score")).toArray(String[]::new); + String expectedScoringDocsKey = doNotPruneTokens == null ? 
"expected_by_score_with_pruning" : "expected_by_score"; + String[] expectedScorePassages = ((List) queryMap.get(expectedScoringDocsKey)).toArray(String[]::new); for (int i = 0; i < expectedScorePassages.length; i++) { assertHighlightOneDoc( mapperService, @@ -158,7 +162,8 @@ public void testSparseVector() throws Exception { ); } - String[] expectedOffsetPassages = ((List) queryMap.get("expected_by_offset")).toArray(String[]::new); + String expectedOffsetDocsKey = doNotPruneTokens == null ? "expected_by_offset_with_pruning" : "expected_by_offset"; + String[] expectedOffsetPassages = ((List) queryMap.get(expectedOffsetDocsKey)).toArray(String[]::new); assertHighlightOneDoc( mapperService, shardRequest, diff --git a/x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/queries.json b/x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/queries.json index 6227f3f498854..6aa2788a864c3 100644 --- a/x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/queries.json +++ b/x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/queries.json @@ -462,6 +462,20 @@ "After the marshland between the river Seine and its slower 'dead arm' to its north was filled in from around the 10th century, Paris's cultural centre began to move to the Right Bank. In 1137, a new city marketplace (today's Les Halles) replaced the two smaller ones on the Île de la Cité and Place de Grève (Place de l'Hôtel de Ville). 
The latter location housed the headquarters of Paris's river trade corporation, an organisation that later became, unofficially (although formally in later years), Paris's first municipal government.\n\n\nIn the late 12th century, Philip Augustus extended the Louvre fortress to defend the city against river invasions from the west, gave the city its first walls between 1190 and 1215, rebuilt its bridges to either side of its central island, and paved its main thoroughfares. In 1190, he transformed Paris's former cathedral school into a student-teacher corporation that would become the University of Paris and would draw students from all of Europe.\n\n\nWith 200,000 inhabitants in 1328, Paris, then already the capital of France, was the most populous city of Europe. By comparison, London in 1300 had 80,000 inhabitants. By the early fourteenth century, so much filth had collected inside urban Europe that French and Italian cities were naming streets after human waste. In medieval Paris, several street names were inspired by merde, the French word for \"shit\".\n\n\n", "\nDuring the Hundred Years' War, Paris was occupied by England-friendly Burgundian forces from 1418, before being occupied outright by the English when Henry V of England entered the French capital in 1420; in spite of a 1429 effort by Joan of Arc to liberate the city, it would remain under English occupation until 1436.\n\n\nIn the late 16th-century French Wars of Religion, Paris was a stronghold of the Catholic League, the organisers of 24 August 1572 St. Bartholomew's Day massacre in which thousands of French Protestants were killed. The conflicts ended when pretender to the throne Henry IV, after converting to Catholicism to gain entry to the capital, entered the city in 1594 to claim the crown of France. 
This king made several improvements to the capital during his reign: he completed the construction of Paris's first uncovered, sidewalk-lined bridge, the Pont Neuf, built a Louvre extension connecting it to the Tuileries Palace, and created the first Paris residential square, the Place Royale, now Place des Vosges. In spite of Henry IV's efforts to improve city circulation, the narrowness of Paris's streets was a contributing factor in his assassination near Les Halles marketplace in 1610.\n\n\nDuring the 17th century, Cardinal Richelieu, chief minister of Louis XIII, was determined to make Paris the most beautiful city in Europe. He built five new bridges, a new chapel for the College of Sorbonne, and a palace for himself, the Palais-Cardinal. ", "Diderot and D'Alembert published their Encyclopédie in 1751, before the Montgolfier Brothers launched the first manned flight in a hot air balloon on 21 November 1783. Paris was the financial capital of continental Europe, as well the primary European centre for book publishing, fashion and the manufacture of fine furniture and luxury goods. On 22 October 1797, Paris was also the site of the first parachute jump in history, by Garnerin.\n\n\nIn the summer of 1789, Paris became the centre stage of the French Revolution. On 14 July, a mob seized the arsenal at the Invalides, acquiring thousands of guns, with which it stormed the Bastille, a principal symbol of royal authority. The first independent Paris Commune, or city council, met in the Hôtel de Ville and elected a Mayor, the astronomer Jean Sylvain Bailly, on 15 July.\n\n\nLouis XVI and the royal family were brought to Paris and incarcerated in the Tuileries Palace. In 1793, as the revolution turned increasingly radical, the king, queen and mayor were beheaded by guillotine in the Reign of Terror, along with more than 16,000 others throughout France. 
The property of the aristocracy and the church was nationalised, and the city's churches were closed, sold or demolished. A succession of revolutionary factions ruled Paris until 9 November 1799 (coup d'état du 18 brumaire), when Napoleon Bonaparte seized power as First Consul.\n\n\n" + ], + "expected_by_score_with_pruning": [ + "\nThe Parisii, a sub-tribe of the Celtic Senones, inhabited the Paris area from around the middle of the 3rd century BC. One of the area's major north–south trade routes crossed the Seine on the Île de la Cité, which gradually became an important trading centre. The Parisii traded with many river towns (some as far away as the Iberian Peninsula) and minted their own coins.\n\n\nThe Romans conquered the Paris Basin in 52 BC and began their settlement on Paris's Left Bank. The Roman town was originally called Lutetia (more fully, Lutetia Parisiorum, \"Lutetia of the Parisii\", modern French Lutèce). It became a prosperous city with a forum, baths, temples, theatres, and an amphitheatre.\n\n\nBy the end of the Western Roman Empire, the town was known as Parisius, a Latin name that would later become Paris in French. Christianity was introduced in the middle of the 3rd century AD by Saint Denis, the first Bishop of Paris: according to legend, when he refused to renounce his faith before the Roman occupiers, he was beheaded on the hill which became known as Mons Martyrum (Latin \"Hill of Martyrs\"), later \"Montmartre\", from where he walked headless to the north of the city; the place where he fell and was buried became an important religious shrine, the Basilica of Saint-Denis, and many French kings are buried there.\n\n\nClovis the Frank, the first king of the Merovingian dynasty, made the city his capital from 508. ", + "Clovis the Frank, the first king of the Merovingian dynasty, made the city his capital from 508. 
As the Frankish domination of Gaul began, there was a gradual immigration by the Franks to Paris and the Parisian Francien dialects were born. Fortification of the Île de la Cité failed to avert sacking by Vikings in 845, but Paris's strategic importance—with its bridges preventing ships from passing—was established by successful defence in the Siege of Paris (885–886), for which the then Count of Paris (comte de Paris), Odo of France, was elected king of West Francia. From the Capetian dynasty that began with the 987 election of Hugh Capet, Count of Paris and Duke of the Franks (duc des Francs), as king of a unified West Francia, Paris gradually became the largest and most prosperous city in France.\n\n\nHigh and Late Middle Ages to Louis XIV\n\nBy the end of the 12th century, Paris had become the political, economic, religious, and cultural capital of France. The Palais de la Cité, the royal residence, was located at the western end of the Île de la Cité. In 1163, during the reign of Louis VII, Maurice de Sully, bishop of Paris, undertook the construction of the Notre Dame Cathedral at its eastern extremity.\n\n\nAfter the marshland between the river Seine and its slower 'dead arm' to its north was filled in from around the 10th century, Paris's cultural centre began to move to the Right Bank. ", + "Open from 1804, these were the cemeteries of Père Lachaise, Montmartre, Montparnasse, and later Passy. New suburban cemeteries were created in the early 20th century: The largest of these are the Cimetière parisien de Saint-Ouen, the Cimetière parisien de Pantin, also known as Cimetière parisien de Pantin-Bobigny, the Cimetière parisien d'Ivry, and the Cimetière parisien de Bagneux. Famous people buried in Parisian cemeteries include Oscar Wilde, Frédéric Chopin, Jim Morrison, Édith Piaf and Serge Gainsbourg.\n\n\nEducation\n\nParis is the département with the highest proportion of highly educated people. 
In 2009, around 40 percent of Parisians held a licence-level diploma or higher, the highest proportion in France. 13 percent have no diploma, the third-lowest percentage in France. Education in Paris and the Île-de-France region employs approximately 330,000 people, 170,000 of whom are teachers and professors, teaching approximately 2.9 million students in around 9,000 primary, secondary, and higher education schools and institutions.\n\n\nThe University of Paris, founded in the 12th century, is often called the Sorbonne after one of its original medieval colleges. In 1970, it was broken up into thirteen autonomous universities, following the student demonstrations in 1968. Most of the campuses today are in the Latin Quarter where the old university was located, while others are scattered around the city and the suburbs.\n\n\nThe Paris region hosts France's highest concentration of the grandes écoles – 55 specialised centres of higher-education outside or inside the public university structure. ", + "\nThe population of Paris had dropped by 100,000 during the Revolution, but after 1799 it surged with 160,000 new residents, reaching 660,000 by 1815. Napoleon replaced the elected government of Paris with a prefect that reported directly to him. He began erecting monuments to military glory, including the Arc de Triomphe, and improved the neglected infrastructure of the city with new fountains, the Canal de l'Ourcq, Père Lachaise Cemetery and the city's first metal bridge, the Pont des Arts.\n\n\nDuring the Restoration, the bridges and squares of Paris were returned to their pre-Revolution names; the July Revolution in 1830 (commemorated by the July Column on the Place de la Bastille) brought to power a constitutional monarch, Louis Philippe I. The first railway line to Paris opened in 1837, beginning a new period of massive migration from the provinces to the city. In 1848, Louis-Philippe was overthrown by a popular uprising in the streets of Paris. 
His successor, Napoleon III, alongside the newly appointed prefect of the Seine, Georges-Eugène Haussmann, launched a huge public works project to build wide new boulevards, a new opera house, a central market, new aqueducts, sewers and parks, including the Bois de Boulogne and Bois de Vincennes. In 1860, Napoleon III annexed the surrounding towns and created eight new arrondissements, expanding Paris to its current limits.\n\n\nDuring the Franco-Prussian War (1870–1871), Paris was besieged by the Prussian Army. ", + "One of the most notable hospitals is the Hôtel-Dieu, founded in 651, the oldest hospital in Paris and the oldest worldwide still operating, although the current building is the product of a reconstruction of 1877. Other hospitals include Pitié-Salpêtrière Hospital, one of the largest in Europe, Hôpital Cochin, Bichat–Claude Bernard Hospital, Hôpital Européen Georges-Pompidou, Bicêtre Hospital, Beaujon Hospital, the Curie Institute, Lariboisière Hospital, Necker–Enfants Malades Hospital, Hôpital Saint-Louis, Hôpital de la Charité and the American Hospital of Paris.\n\n\nInternational relations\n\nInternational organisations\n\nThe United Nations Educational, Scientific and Cultural Organization (UNESCO) has had its headquarters in Paris since November 1958. Paris is also the home of the Organisation for Economic Co-operation and Development (OECD). 
Paris hosts the headquarters of the European Space Agency, the International Energy Agency, European Securities and Markets Authority and the European Banking Authority.\n\n\nTwin towns – sister cities\n\nSince April 1956, Paris is exclusively and reciprocally twinned with:\n\n\nOther relationships\n\nParis has agreements of friendship and co-operation with:\n\n\nSee also\n\nNotes\n\nReferences\n\nCitations\n\nSources\n\nFurther reading\n\nExternal links\n\n" + ], + "expected_by_offset_with_pruning": [ + "\nThe Parisii, a sub-tribe of the Celtic Senones, inhabited the Paris area from around the middle of the 3rd century BC. One of the area's major north–south trade routes crossed the Seine on the Île de la Cité, which gradually became an important trading centre. The Parisii traded with many river towns (some as far away as the Iberian Peninsula) and minted their own coins.\n\n\nThe Romans conquered the Paris Basin in 52 BC and began their settlement on Paris's Left Bank. The Roman town was originally called Lutetia (more fully, Lutetia Parisiorum, \"Lutetia of the Parisii\", modern French Lutèce). It became a prosperous city with a forum, baths, temples, theatres, and an amphitheatre.\n\n\nBy the end of the Western Roman Empire, the town was known as Parisius, a Latin name that would later become Paris in French. Christianity was introduced in the middle of the 3rd century AD by Saint Denis, the first Bishop of Paris: according to legend, when he refused to renounce his faith before the Roman occupiers, he was beheaded on the hill which became known as Mons Martyrum (Latin \"Hill of Martyrs\"), later \"Montmartre\", from where he walked headless to the north of the city; the place where he fell and was buried became an important religious shrine, the Basilica of Saint-Denis, and many French kings are buried there.\n\n\nClovis the Frank, the first king of the Merovingian dynasty, made the city his capital from 508. 
", + "Clovis the Frank, the first king of the Merovingian dynasty, made the city his capital from 508. As the Frankish domination of Gaul began, there was a gradual immigration by the Franks to Paris and the Parisian Francien dialects were born. Fortification of the Île de la Cité failed to avert sacking by Vikings in 845, but Paris's strategic importance—with its bridges preventing ships from passing—was established by successful defence in the Siege of Paris (885–886), for which the then Count of Paris (comte de Paris), Odo of France, was elected king of West Francia. From the Capetian dynasty that began with the 987 election of Hugh Capet, Count of Paris and Duke of the Franks (duc des Francs), as king of a unified West Francia, Paris gradually became the largest and most prosperous city in France.\n\n\nHigh and Late Middle Ages to Louis XIV\n\nBy the end of the 12th century, Paris had become the political, economic, religious, and cultural capital of France. The Palais de la Cité, the royal residence, was located at the western end of the Île de la Cité. In 1163, during the reign of Louis VII, Maurice de Sully, bishop of Paris, undertook the construction of the Notre Dame Cathedral at its eastern extremity.\n\n\nAfter the marshland between the river Seine and its slower 'dead arm' to its north was filled in from around the 10th century, Paris's cultural centre began to move to the Right Bank. ", + "\nThe population of Paris had dropped by 100,000 during the Revolution, but after 1799 it surged with 160,000 new residents, reaching 660,000 by 1815. Napoleon replaced the elected government of Paris with a prefect that reported directly to him. 
He began erecting monuments to military glory, including the Arc de Triomphe, and improved the neglected infrastructure of the city with new fountains, the Canal de l'Ourcq, Père Lachaise Cemetery and the city's first metal bridge, the Pont des Arts.\n\n\nDuring the Restoration, the bridges and squares of Paris were returned to their pre-Revolution names; the July Revolution in 1830 (commemorated by the July Column on the Place de la Bastille) brought to power a constitutional monarch, Louis Philippe I. The first railway line to Paris opened in 1837, beginning a new period of massive migration from the provinces to the city. In 1848, Louis-Philippe was overthrown by a popular uprising in the streets of Paris. His successor, Napoleon III, alongside the newly appointed prefect of the Seine, Georges-Eugène Haussmann, launched a huge public works project to build wide new boulevards, a new opera house, a central market, new aqueducts, sewers and parks, including the Bois de Boulogne and Bois de Vincennes. In 1860, Napoleon III annexed the surrounding towns and created eight new arrondissements, expanding Paris to its current limits.\n\n\nDuring the Franco-Prussian War (1870–1871), Paris was besieged by the Prussian Army. ", + "Open from 1804, these were the cemeteries of Père Lachaise, Montmartre, Montparnasse, and later Passy. New suburban cemeteries were created in the early 20th century: The largest of these are the Cimetière parisien de Saint-Ouen, the Cimetière parisien de Pantin, also known as Cimetière parisien de Pantin-Bobigny, the Cimetière parisien d'Ivry, and the Cimetière parisien de Bagneux. Famous people buried in Parisian cemeteries include Oscar Wilde, Frédéric Chopin, Jim Morrison, Édith Piaf and Serge Gainsbourg.\n\n\nEducation\n\nParis is the département with the highest proportion of highly educated people. In 2009, around 40 percent of Parisians held a licence-level diploma or higher, the highest proportion in France. 
13 percent have no diploma, the third-lowest percentage in France. Education in Paris and the Île-de-France region employs approximately 330,000 people, 170,000 of whom are teachers and professors, teaching approximately 2.9 million students in around 9,000 primary, secondary, and higher education schools and institutions.\n\n\nThe University of Paris, founded in the 12th century, is often called the Sorbonne after one of its original medieval colleges. In 1970, it was broken up into thirteen autonomous universities, following the student demonstrations in 1968. Most of the campuses today are in the Latin Quarter where the old university was located, while others are scattered around the city and the suburbs.\n\n\nThe Paris region hosts France's highest concentration of the grandes écoles – 55 specialised centres of higher-education outside or inside the public university structure. ", + "One of the most notable hospitals is the Hôtel-Dieu, founded in 651, the oldest hospital in Paris and the oldest worldwide still operating, although the current building is the product of a reconstruction of 1877. Other hospitals include Pitié-Salpêtrière Hospital, one of the largest in Europe, Hôpital Cochin, Bichat–Claude Bernard Hospital, Hôpital Européen Georges-Pompidou, Bicêtre Hospital, Beaujon Hospital, the Curie Institute, Lariboisière Hospital, Necker–Enfants Malades Hospital, Hôpital Saint-Louis, Hôpital de la Charité and the American Hospital of Paris.\n\n\nInternational relations\n\nInternational organisations\n\nThe United Nations Educational, Scientific and Cultural Organization (UNESCO) has had its headquarters in Paris since November 1958. Paris is also the home of the Organisation for Economic Co-operation and Development (OECD). 
Paris hosts the headquarters of the European Space Agency, the International Energy Agency, European Securities and Markets Authority and the European Banking Authority.\n\n\nTwin towns – sister cities\n\nSince April 1956, Paris is exclusively and reciprocally twinned with:\n\n\nOther relationships\n\nParis has agreements of friendship and co-operation with:\n\n\nSee also\n\nNotes\n\nReferences\n\nCitations\n\nSources\n\nFurther reading\n\nExternal links\n\n" ] } -} \ No newline at end of file +} From 11922717d407d7514d51c43d3d4300915a3857bc Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Wed, 28 May 2025 11:44:33 -0400 Subject: [PATCH 66/86] no pruning if we do not have an indexreader --- .../xpack/core/ml/search/SparseVectorQueryBuilder.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java index 0fe70896f2997..b768075e87033 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java @@ -351,6 +351,11 @@ public TransportVersion getMinimalSupportedVersion() { } private TokenPruningConfig getTokenPruningConfigForQuery(MappedFieldType ft, SearchExecutionContext context) { + // if we do not have an index reader, there can be no token pruning + if (context.getIndexReader() == null) { + return null; + } + // if we are not on a supported index version, do not prune by default // nor do we check the index options if (context.indexVersionCreated().onOrAfter(IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT) == false From d7f13511e29fd99cdbf2984400996eca8e8e6ac5 Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Wed, 28 May 2025 11:47:31 -0400 Subject: [PATCH 67/86] check for searcher existence instead of IndexReader --- .../xpack/core/ml/search/SparseVectorQueryBuilder.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java index b768075e87033..1c349e09ff800 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java @@ -351,8 +351,8 @@ public TransportVersion getMinimalSupportedVersion() { } private TokenPruningConfig getTokenPruningConfigForQuery(MappedFieldType ft, SearchExecutionContext context) { - // if we do not have an index reader, there can be no token pruning - if (context.getIndexReader() == null) { + // if we do not have searcher, there can be no token pruning because there will be no IndexReader + if (context.searcher() == null) { return null; } From 94e21dfd48499d2083a40be2d4a66b733366a265 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Wed, 28 May 2025 16:06:23 -0400 Subject: [PATCH 68/86] update docs with applies_to --- .../mapping-reference/sparse-vector.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/reference/elasticsearch/mapping-reference/sparse-vector.md b/docs/reference/elasticsearch/mapping-reference/sparse-vector.md index e831a27fac868..dc9af7e642b13 100644 --- a/docs/reference/elasticsearch/mapping-reference/sparse-vector.md +++ b/docs/reference/elasticsearch/mapping-reference/sparse-vector.md @@ -63,19 +63,19 @@ index_options Parameters for `index_options` are: -`prune` -: (Optional, boolean) [preview] Whether to perform pruning, omitting the non-significant tokens from the query to improve query performance. 
If `prune` is true but the `pruning_config` is not specified, pruning will occur but default values will be used. Default: true. +`prune` {applies_to}`stack: preview 9.1` +: (Optional, boolean) Whether to perform pruning, omitting the non-significant tokens from the query to improve query performance. If `prune` is true but the `pruning_config` is not specified, pruning will occur but default values will be used. Default: true. -`pruning_config` -: (Optional, object) [preview] Optional pruning configuration. If enabled, this will omit non-significant tokens from the query in order to improve query performance. This is only used if `prune` is set to `true`. If `prune` is set to `true` but `pruning_config` is not specified, default values will be used. If `prune` is set to false, an exception will occur. +`pruning_config` {applies_to}`stack: preview 9.1` +: (Optional, object) Optional pruning configuration. If enabled, this will omit non-significant tokens from the query in order to improve query performance. This is only used if `prune` is set to `true`. If `prune` is set to `true` but `pruning_config` is not specified, default values will be used. If `prune` is set to `false` but `pruning_config` is specified, an exception will occur. Parameters for `pruning_config` include: - `tokens_freq_ratio_threshold` - : (Optional, integer) [preview] Tokens whose frequency is more than `tokens_freq_ratio_threshold` times the average frequency of all tokens in the specified field are considered outliers and pruned. This value must between 1 and 100. Default: `5`. + `tokens_freq_ratio_threshold` {applies_to}`stack: preview 9.1` + : (Optional, integer) Tokens whose frequency is more than `tokens_freq_ratio_threshold` times the average frequency of all tokens in the specified field are considered outliers and pruned. This value must be between 1 and 100. Default: `5`. 
- `tokens_weight_threshold` - : (Optional, float) [preview] Tokens whose weight is less than `tokens_weight_threshold` are considered insignificant and pruned. This value must be between 0 and 1. Default: `0.4`. + `tokens_weight_threshold` {applies_to}`stack: preview 9.1` + : (Optional, float) Tokens whose weight is less than `tokens_weight_threshold` are considered insignificant and pruned. This value must be between 0 and 1. Default: `0.4`. ::::{note} The default values for `tokens_freq_ratio_threshold` and `tokens_weight_threshold` were chosen based on tests using ELSERv2 that provided the most optimal results. From 212c850330d783e146e5781b87a93e9dc1189884 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Fri, 30 May 2025 08:52:28 -0400 Subject: [PATCH 69/86] failing simple test --- .../SparseVectorIndexOptionsUpgradeIT.java | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 x-pack/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/SparseVectorIndexOptionsUpgradeIT.java diff --git a/x-pack/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/SparseVectorIndexOptionsUpgradeIT.java b/x-pack/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/SparseVectorIndexOptionsUpgradeIT.java new file mode 100644 index 0000000000000..6a54657271890 --- /dev/null +++ b/x-pack/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/SparseVectorIndexOptionsUpgradeIT.java @@ -0,0 +1,60 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.upgrades; + +import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; + +import org.elasticsearch.test.rest.ESRestTestCase; +import org.junit.Assert; + +import java.util.ArrayList; +import java.util.List; + +public class SparseVectorIndexOptionsUpgradeIT extends AbstractUpgradeTestCase { + + private final boolean testHasIndexOptions; + private final boolean testIndexShouldPrune; + private final boolean testQueryShouldNotPrune; + private final boolean usePreviousIndexVersion; + + public SparseVectorIndexOptionsUpgradeIT( + boolean setIndexOptions, + boolean setIndexShouldPrune, + boolean setQueryShouldNotPrune, + boolean usePreviousIndexVersion + ) { + this.testHasIndexOptions = setIndexOptions; + this.testIndexShouldPrune = setIndexShouldPrune; + this.testQueryShouldNotPrune = setQueryShouldNotPrune; + this.usePreviousIndexVersion = usePreviousIndexVersion; + } + + @ParametersFactory + public static Iterable parameters() throws Exception { + List params = new ArrayList<>(); + // create a matrix of all combinations + // of our first three parameters + for (int i=0; i < 8; i++) { + params.add(new Object[] { + (i & 1) == 0, (i & 2) == 0, (i & 4) == 0, false + }); + } + // and add in overrides for the previous index versions + params.add(new Object[] { false, false, false, true }); + params.add(new Object[] { false, false, true, true }); + return params; + } + + public void testItPrunesTokensIfIndexOptions() { + Assert.assertFalse(true); + } + + public void testBehavioralAnalyticsDataRetention() throws Exception { + Assert.assertFalse(true); + } +} From c8fcd94faef04c5b6a03a4ea29848b8346626a09 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Fri, 30 May 2025 13:03:18 +0000 Subject: [PATCH 70/86] [CI] Auto commit changes from spotless --- .../upgrades/SparseVectorIndexOptionsUpgradeIT.java | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git 
a/x-pack/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/SparseVectorIndexOptionsUpgradeIT.java b/x-pack/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/SparseVectorIndexOptionsUpgradeIT.java index 6a54657271890..3b3e6c3d06b70 100644 --- a/x-pack/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/SparseVectorIndexOptionsUpgradeIT.java +++ b/x-pack/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/SparseVectorIndexOptionsUpgradeIT.java @@ -9,7 +9,6 @@ import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; -import org.elasticsearch.test.rest.ESRestTestCase; import org.junit.Assert; import java.util.ArrayList; @@ -39,10 +38,8 @@ public static Iterable parameters() throws Exception { List params = new ArrayList<>(); // create a matrix of all combinations // of our first three parameters - for (int i=0; i < 8; i++) { - params.add(new Object[] { - (i & 1) == 0, (i & 2) == 0, (i & 4) == 0, false - }); + for (int i = 0; i < 8; i++) { + params.add(new Object[] { (i & 1) == 0, (i & 2) == 0, (i & 4) == 0, false }); } // and add in overrides for the previous index versions params.add(new Object[] { false, false, false, true }); From 7d2f361737e950d6e61f5c595b5eafe150ad759c Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Fri, 30 May 2025 13:31:48 -0400 Subject: [PATCH 71/86] move test to core package --- .../core/ml/search/SparseVectorIndexOptionsIT.java} | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) rename x-pack/{qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/SparseVectorIndexOptionsUpgradeIT.java => plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java} (85%) diff --git a/x-pack/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/SparseVectorIndexOptionsUpgradeIT.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java similarity index 85% rename from x-pack/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/SparseVectorIndexOptionsUpgradeIT.java rename to x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java index 3b3e6c3d06b70..9b1ac02a03723 100644 --- a/x-pack/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/SparseVectorIndexOptionsUpgradeIT.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java @@ -5,23 +5,24 @@ * 2.0. 
*/ -package org.elasticsearch.upgrades; +package org.elasticsearch.xpack.core.ml.search; import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; +import org.elasticsearch.test.ESIntegTestCase; import org.junit.Assert; import java.util.ArrayList; import java.util.List; -public class SparseVectorIndexOptionsUpgradeIT extends AbstractUpgradeTestCase { +public class SparseVectorIndexOptionsIT extends ESIntegTestCase { private final boolean testHasIndexOptions; private final boolean testIndexShouldPrune; private final boolean testQueryShouldNotPrune; private final boolean usePreviousIndexVersion; - public SparseVectorIndexOptionsUpgradeIT( + public SparseVectorIndexOptionsIT( boolean setIndexOptions, boolean setIndexShouldPrune, boolean setQueryShouldNotPrune, @@ -50,8 +51,4 @@ public static Iterable parameters() throws Exception { public void testItPrunesTokensIfIndexOptions() { Assert.assertFalse(true); } - - public void testBehavioralAnalyticsDataRetention() throws Exception { - Assert.assertFalse(true); - } } From 0f9bd65fb79eb07359ed4368f707bad7d65972d6 Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Fri, 30 May 2025 13:40:11 -0400 Subject: [PATCH 72/86] move test to _actual_ proper place --- .../xpack/core/ml/search/SparseVectorIndexOptionsIT.java | 1 - 1 file changed, 1 deletion(-) rename x-pack/plugin/core/src/{test => internalClusterTest}/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java (98%) diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java b/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java similarity index 98% rename from x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java rename to x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java index 9b1ac02a03723..0430ff7a9e8d4 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java +++ b/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java @@ -49,6 +49,5 @@ public static Iterable parameters() throws Exception { } public void testItPrunesTokensIfIndexOptions() { - Assert.assertFalse(true); } } From 2afeb2b32fab10a258aceaa16d499f9062cfe3c9 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Fri, 30 May 2025 17:51:00 +0000 Subject: [PATCH 73/86] [CI] Auto commit changes from spotless --- .../xpack/core/ml/search/SparseVectorIndexOptionsIT.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java b/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java index 0430ff7a9e8d4..b0107624499d4 100644 --- 
a/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java +++ b/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java @@ -10,7 +10,6 @@ import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; import org.elasticsearch.test.ESIntegTestCase; -import org.junit.Assert; import java.util.ArrayList; import java.util.List; @@ -48,6 +47,5 @@ public static Iterable parameters() throws Exception { return params; } - public void testItPrunesTokensIfIndexOptions() { - } + public void testItPrunesTokensIfIndexOptions() {} } From 60a6b3e80184820ac31ab34329f214fbd74782af Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Mon, 2 Jun 2025 11:51:41 -0400 Subject: [PATCH 74/86] add integration tests for index_options / defaults --- .../ml/search/SparseVectorIndexOptionsIT.java | 217 +++++++++++++++++- .../ml/search/SparseVectorQueryBuilder.java | 1 + 2 files changed, 209 insertions(+), 9 deletions(-) diff --git a/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java b/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java index b0107624499d4..95c0a165ae5eb 100644 --- a/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java +++ b/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java @@ -9,28 +9,47 @@ import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; +import org.apache.http.HttpStatus; +import org.elasticsearch.client.Request; +import org.elasticsearch.client.Response; +import org.elasticsearch.cluster.metadata.IndexMetadata; +import org.elasticsearch.common.Strings; +import org.elasticsearch.common.settings.Settings; +import 
org.elasticsearch.common.xcontent.XContentHelper; +import org.elasticsearch.plugins.Plugin; import org.elasticsearch.test.ESIntegTestCase; +import org.elasticsearch.xcontent.XContentType; +import org.elasticsearch.xpack.core.XPackClientPlugin; +import org.hamcrest.Matchers; +import org.junit.Before; + +import java.io.IOException; import java.util.ArrayList; +import java.util.Collection; import java.util.List; +import java.util.Map; + +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; public class SparseVectorIndexOptionsIT extends ESIntegTestCase { + private static final String TEST_INDEX_NAME = "index_with_sparse_vector"; + private static final String SPARSE_VECTOR_FIELD = "sparse_vector_field"; + private static final int TEST_PRUNING_TOKENS_FREQ_THRESHOLD = 1; + private static final float TEST_PRUNING_TOKENS_WEIGHT_THRESHOLD = 1.0f; private final boolean testHasIndexOptions; private final boolean testIndexShouldPrune; private final boolean testQueryShouldNotPrune; - private final boolean usePreviousIndexVersion; public SparseVectorIndexOptionsIT( boolean setIndexOptions, boolean setIndexShouldPrune, - boolean setQueryShouldNotPrune, - boolean usePreviousIndexVersion + boolean setQueryShouldNotPrune ) { this.testHasIndexOptions = setIndexOptions; this.testIndexShouldPrune = setIndexShouldPrune; this.testQueryShouldNotPrune = setQueryShouldNotPrune; - this.usePreviousIndexVersion = usePreviousIndexVersion; } @ParametersFactory @@ -39,13 +58,193 @@ public static Iterable parameters() throws Exception { // create a matrix of all combinations // of our first three parameters for (int i = 0; i < 8; i++) { - params.add(new Object[] { (i & 1) == 0, (i & 2) == 0, (i & 4) == 0, false }); + params.add(new Object[] { (i & 1) == 0, (i & 2) == 0, (i & 4) == 0 }); } - // and add in overrides for the previous index versions - params.add(new Object[] { false, false, false, true }); - params.add(new Object[] { false, false, true, true }); return 
params; } - public void testItPrunesTokensIfIndexOptions() {} + @Override + protected boolean addMockHttpTransport() { + return false; // enable http + } + + @Override + protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) { + Settings.Builder settings = Settings.builder().put(super.nodeSettings(nodeOrdinal, otherSettings)); + return settings.build(); + } + + @Override + protected Collection> nodePlugins() { + return List.of(XPackClientPlugin.class); + } + + static final int INTERNAL_UNMANAGED_FLAG_VALUE = 2; + static final String FlAG_SETTING_KEY = IndexMetadata.INDEX_PRIORITY_SETTING.getKey(); + + @Before + public void setup() { + assertAcked(prepareCreate(TEST_INDEX_NAME).setMapping(getTestIndexMapping())); + ensureGreen(TEST_INDEX_NAME); + + for (Map.Entry doc : TEST_DOCUMENTS.entrySet()) { + index(TEST_INDEX_NAME, doc.getKey(), doc.getValue()); + } + flushAndRefresh(TEST_INDEX_NAME); + } + + public void testSparseVectorTokenPruning() throws IOException { + Response response = performSearch(getBuilderForSearch()); + assertThat(response.getStatusLine().getStatusCode(), Matchers.equalTo(HttpStatus.SC_OK)); + final Map responseMap = XContentHelper.convertToMap( + XContentType.JSON.xContent(), + response.getEntity().getContent(), + true + ); + assertCorrectResponse(responseMap); + } + + @SuppressWarnings("unchecked") + private void assertCorrectResponse(Map responseMap) { + List expectedIds = getTestExpectedDocIds(); + + Map mapHits = (Map) responseMap.get("hits"); + Map mapHitsTotal = (Map) mapHits.get("total"); + int actualTotalHits = (int) mapHitsTotal.get("value"); + int numHitsExpected = expectedIds.size(); + + // assertEquals(getAssertMessage("Search result total hits count mismatch"), numHitsExpected, actualTotalHits); + + List> hits = (List>) mapHits.get("hits"); + List actualDocIds = new ArrayList<>(); + for (Map doc : hits) { + actualDocIds.add((String)doc.get("_id")); + } + + assertEquals(getAssertMessage("Result document ids 
mismatch"), expectedIds, actualDocIds); + } + + private String getTestIndexMapping() { + if (isRunningAgainstOldCluster()) { + return "{\"properties\":{\"" + SPARSE_VECTOR_FIELD + "\":{\"type\":\"sparse_vector\"}}}"; + } + + String testPruningConfigMapping = "\"pruning_config\":{\"tokens_freq_ratio_threshold\":" + + TEST_PRUNING_TOKENS_FREQ_THRESHOLD + + ",\"tokens_weight_threshold\":" + + TEST_PRUNING_TOKENS_WEIGHT_THRESHOLD + + "}"; + + String pruningMappingString = testIndexShouldPrune + ? "\"prune\":true," + testPruningConfigMapping + : "\"prune\":false"; + String indexOptionsString = testHasIndexOptions + ? ",\"index_options\":{" + pruningMappingString + "}" + : ""; + + return "{\"properties\":{\"" + + SPARSE_VECTOR_FIELD + + "\":{\"type\":\"sparse_vector\"" + + indexOptionsString + + "}}}"; + } + + private boolean isRunningAgainstOldCluster() { + return false; + } + + private List getTestExpectedDocIds() { + if (testQueryShouldNotPrune) { + // query overrides prune = false in all cases + return EXPECTED_DOC_IDS_WITHOUT_PRUNING; + } + + if (testHasIndexOptions) { + // index has set index options in the mapping + return testIndexShouldPrune + ? EXPECTED_DOC_IDS_WITH_PRUNING + : EXPECTED_DOC_IDS_WITHOUT_PRUNING; + } + + // default pruning should be true with default configuration + return EXPECTED_DOC_IDS_WITH_DEFAULT_PRUNING; + } + + private Response performSearch(String source) throws IOException { + Request request = new Request("GET", TEST_INDEX_NAME + "/_search"); + request.setJsonEntity(source); + return getRestClient().performRequest(request); + } + + private String getBuilderForSearch() { + boolean shouldUseDefaultTokens = (testQueryShouldNotPrune == false && testHasIndexOptions == false); + SparseVectorQueryBuilder queryBuilder = new SparseVectorQueryBuilder( + SPARSE_VECTOR_FIELD, + shouldUseDefaultTokens ? SEARCH_WEIGHTED_TOKENS_WITH_DEFAULTS : SEARCH_WEIGHTED_TOKENS, + null, + null, + testQueryShouldNotPrune ? 
false : null, + null + ); + + return "{\"query\":" + Strings.toString(queryBuilder) + "}"; + } + + private String getAssertMessage(String message) { + return message + + " (Params: hasIndexOptions=" + + testHasIndexOptions + + ", indexShouldPrune=" + + testIndexShouldPrune + + ", queryShouldNotPrune=" + + testQueryShouldNotPrune + + "): " + + getDescriptiveTestType(); + } + + private String getDescriptiveTestType() { + String testDescription = ""; + if (testQueryShouldNotPrune) { + testDescription = "query override prune=false:"; + } + + if (testHasIndexOptions) { + testDescription += " pruning index_options explicitly set:"; + } else { + testDescription = " no index options set, tokens should be pruned by default:"; + } + + if (testIndexShouldPrune == false) { + testDescription += " index options has pruning set to false"; + } + + return testDescription; + } + + private static final Map TEST_DOCUMENTS = Map.of( + "1", "{\"sparse_vector_field\":{\"cheese\": 2.671405,\"is\": 0.11809908,\"comet\": 0.26088917}}", + "2", "{\"sparse_vector_field\":{\"planet\": 2.3438394,\"is\": 0.54600334,\"astronomy\": 0.36015007,\"moon\": 0.20022368}}", + "3", "{\"sparse_vector_field\":{\"is\": 0.6891394,\"globe\": 0.484035,\"ocean\": 0.080102935,\"underground\": 0.053516876}}" + ); + + private static final List SEARCH_WEIGHTED_TOKENS = List.of( + new WeightedToken("cheese", 0.5f), + new WeightedToken("comet", 0.5f), + new WeightedToken("globe", 0.484035f), + new WeightedToken("ocean", 0.080102935f), + new WeightedToken("underground", 0.053516876f), + new WeightedToken("is", 0.54600334f) + ); + + private static final List SEARCH_WEIGHTED_TOKENS_WITH_DEFAULTS = List.of( + new WeightedToken("planet", 0.2f) + ); + + private static final List EXPECTED_DOC_IDS_WITHOUT_PRUNING = List.of( + "1", "3", "2" + ); + + private static final List EXPECTED_DOC_IDS_WITH_PRUNING = List.of("1"); + + private static final List EXPECTED_DOC_IDS_WITH_DEFAULT_PRUNING = List.of("2"); } diff --git 
a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java index 1c349e09ff800..6c50b7296d901 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java @@ -375,6 +375,7 @@ private TokenPruningConfig getTokenPruningConfigForQuery(MappedFieldType ft, Sea pruningConfigToUse = pruningConfigToUse == null ? asSVFieldType.getIndexOptions().getPruningConfig() : pruningConfigToUse; } + // do not prune if shouldQueryPruneTokens is explicitly set to false if (shouldQueryPruneTokens != null && shouldQueryPruneTokens == false) { return null; } From 9fd5e728ebdc2e42d46bdd3bcbd4789faaf252c8 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Mon, 2 Jun 2025 11:53:37 -0400 Subject: [PATCH 75/86] cleanup IT --- .../xpack/core/ml/search/SparseVectorIndexOptionsIT.java | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java b/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java index 95c0a165ae5eb..533fe4580e27c 100644 --- a/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java +++ b/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java @@ -12,7 +12,6 @@ import org.apache.http.HttpStatus; import org.elasticsearch.client.Request; import org.elasticsearch.client.Response; -import org.elasticsearch.cluster.metadata.IndexMetadata; import org.elasticsearch.common.Strings; import org.elasticsearch.common.settings.Settings; import 
org.elasticsearch.common.xcontent.XContentHelper; @@ -79,9 +78,6 @@ protected Collection> nodePlugins() { return List.of(XPackClientPlugin.class); } - static final int INTERNAL_UNMANAGED_FLAG_VALUE = 2; - static final String FlAG_SETTING_KEY = IndexMetadata.INDEX_PRIORITY_SETTING.getKey(); - @Before public void setup() { assertAcked(prepareCreate(TEST_INDEX_NAME).setMapping(getTestIndexMapping())); @@ -113,7 +109,7 @@ private void assertCorrectResponse(Map responseMap) { int actualTotalHits = (int) mapHitsTotal.get("value"); int numHitsExpected = expectedIds.size(); - // assertEquals(getAssertMessage("Search result total hits count mismatch"), numHitsExpected, actualTotalHits); + assertEquals(getAssertMessage("Search result total hits count mismatch"), numHitsExpected, actualTotalHits); List> hits = (List>) mapHits.get("hits"); List actualDocIds = new ArrayList<>(); From fe2e267f8df8b764c2056b57fa065c69210588ca Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Mon, 2 Jun 2025 16:01:30 +0000 Subject: [PATCH 76/86] [CI] Auto commit changes from spotless --- .../ml/search/SparseVectorIndexOptionsIT.java | 48 +++++++------------ 1 file changed, 16 insertions(+), 32 deletions(-) diff --git a/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java b/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java index 533fe4580e27c..8d32af8eac07f 100644 --- a/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java +++ b/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java @@ -17,7 +17,6 @@ import org.elasticsearch.common.xcontent.XContentHelper; import org.elasticsearch.plugins.Plugin; import org.elasticsearch.test.ESIntegTestCase; - import org.elasticsearch.xcontent.XContentType; import 
org.elasticsearch.xpack.core.XPackClientPlugin; import org.hamcrest.Matchers; @@ -41,11 +40,7 @@ public class SparseVectorIndexOptionsIT extends ESIntegTestCase { private final boolean testIndexShouldPrune; private final boolean testQueryShouldNotPrune; - public SparseVectorIndexOptionsIT( - boolean setIndexOptions, - boolean setIndexShouldPrune, - boolean setQueryShouldNotPrune - ) { + public SparseVectorIndexOptionsIT(boolean setIndexOptions, boolean setIndexShouldPrune, boolean setQueryShouldNotPrune) { this.testHasIndexOptions = setIndexOptions; this.testIndexShouldPrune = setIndexShouldPrune; this.testQueryShouldNotPrune = setQueryShouldNotPrune; @@ -114,7 +109,7 @@ private void assertCorrectResponse(Map responseMap) { List> hits = (List>) mapHits.get("hits"); List actualDocIds = new ArrayList<>(); for (Map doc : hits) { - actualDocIds.add((String)doc.get("_id")); + actualDocIds.add((String) doc.get("_id")); } assertEquals(getAssertMessage("Result document ids mismatch"), expectedIds, actualDocIds); @@ -131,18 +126,10 @@ private String getTestIndexMapping() { + TEST_PRUNING_TOKENS_WEIGHT_THRESHOLD + "}"; - String pruningMappingString = testIndexShouldPrune - ? "\"prune\":true," + testPruningConfigMapping - : "\"prune\":false"; - String indexOptionsString = testHasIndexOptions - ? ",\"index_options\":{" + pruningMappingString + "}" - : ""; - - return "{\"properties\":{\"" - + SPARSE_VECTOR_FIELD - + "\":{\"type\":\"sparse_vector\"" - + indexOptionsString - + "}}}"; + String pruningMappingString = testIndexShouldPrune ? "\"prune\":true," + testPruningConfigMapping : "\"prune\":false"; + String indexOptionsString = testHasIndexOptions ? 
",\"index_options\":{" + pruningMappingString + "}" : ""; + + return "{\"properties\":{\"" + SPARSE_VECTOR_FIELD + "\":{\"type\":\"sparse_vector\"" + indexOptionsString + "}}}"; } private boolean isRunningAgainstOldCluster() { @@ -157,9 +144,7 @@ private List getTestExpectedDocIds() { if (testHasIndexOptions) { // index has set index options in the mapping - return testIndexShouldPrune - ? EXPECTED_DOC_IDS_WITH_PRUNING - : EXPECTED_DOC_IDS_WITHOUT_PRUNING; + return testIndexShouldPrune ? EXPECTED_DOC_IDS_WITH_PRUNING : EXPECTED_DOC_IDS_WITHOUT_PRUNING; } // default pruning should be true with default configuration @@ -174,7 +159,7 @@ private Response performSearch(String source) throws IOException { private String getBuilderForSearch() { boolean shouldUseDefaultTokens = (testQueryShouldNotPrune == false && testHasIndexOptions == false); - SparseVectorQueryBuilder queryBuilder = new SparseVectorQueryBuilder( + SparseVectorQueryBuilder queryBuilder = new SparseVectorQueryBuilder( SPARSE_VECTOR_FIELD, shouldUseDefaultTokens ? 
SEARCH_WEIGHTED_TOKENS_WITH_DEFAULTS : SEARCH_WEIGHTED_TOKENS, null, @@ -218,9 +203,12 @@ private String getDescriptiveTestType() { } private static final Map TEST_DOCUMENTS = Map.of( - "1", "{\"sparse_vector_field\":{\"cheese\": 2.671405,\"is\": 0.11809908,\"comet\": 0.26088917}}", - "2", "{\"sparse_vector_field\":{\"planet\": 2.3438394,\"is\": 0.54600334,\"astronomy\": 0.36015007,\"moon\": 0.20022368}}", - "3", "{\"sparse_vector_field\":{\"is\": 0.6891394,\"globe\": 0.484035,\"ocean\": 0.080102935,\"underground\": 0.053516876}}" + "1", + "{\"sparse_vector_field\":{\"cheese\": 2.671405,\"is\": 0.11809908,\"comet\": 0.26088917}}", + "2", + "{\"sparse_vector_field\":{\"planet\": 2.3438394,\"is\": 0.54600334,\"astronomy\": 0.36015007,\"moon\": 0.20022368}}", + "3", + "{\"sparse_vector_field\":{\"is\": 0.6891394,\"globe\": 0.484035,\"ocean\": 0.080102935,\"underground\": 0.053516876}}" ); private static final List SEARCH_WEIGHTED_TOKENS = List.of( @@ -232,13 +220,9 @@ private String getDescriptiveTestType() { new WeightedToken("is", 0.54600334f) ); - private static final List SEARCH_WEIGHTED_TOKENS_WITH_DEFAULTS = List.of( - new WeightedToken("planet", 0.2f) - ); + private static final List SEARCH_WEIGHTED_TOKENS_WITH_DEFAULTS = List.of(new WeightedToken("planet", 0.2f)); - private static final List EXPECTED_DOC_IDS_WITHOUT_PRUNING = List.of( - "1", "3", "2" - ); + private static final List EXPECTED_DOC_IDS_WITHOUT_PRUNING = List.of("1", "3", "2"); private static final List EXPECTED_DOC_IDS_WITH_PRUNING = List.of("1"); From 9953513cd88755ecb7f2e21284a570d19aac8af4 Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Mon, 2 Jun 2025 17:12:03 -0400 Subject: [PATCH 77/86] cleanups; add query pruning override random test --- .../ml/search/SparseVectorIndexOptionsIT.java | 34 +++++++++++-------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java b/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java index 8d32af8eac07f..af9998bc872b8 100644 --- a/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java +++ b/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java @@ -15,6 +15,7 @@ import org.elasticsearch.common.Strings; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.xcontent.XContentHelper; +import org.elasticsearch.index.mapper.vectors.TokenPruningConfig; import org.elasticsearch.plugins.Plugin; import org.elasticsearch.test.ESIntegTestCase; import org.elasticsearch.xcontent.XContentType; @@ -39,18 +40,20 @@ public class SparseVectorIndexOptionsIT extends ESIntegTestCase { private final boolean testHasIndexOptions; private final boolean testIndexShouldPrune; private final boolean testQueryShouldNotPrune; + private final boolean overrideQueryPruningConfig; public SparseVectorIndexOptionsIT(boolean setIndexOptions, boolean setIndexShouldPrune, boolean setQueryShouldNotPrune) { this.testHasIndexOptions = setIndexOptions; this.testIndexShouldPrune = setIndexShouldPrune; this.testQueryShouldNotPrune = setQueryShouldNotPrune; + this.overrideQueryPruningConfig = (testHasIndexOptions && testIndexShouldPrune) && randomBoolean(); } @ParametersFactory public static Iterable parameters() throws Exception { List params = new ArrayList<>(); // create a matrix of all combinations - // of our first three parameters + // of our three 
parameters for (int i = 0; i < 8; i++) { params.add(new Object[] { (i & 1) == 0, (i & 2) == 0, (i & 4) == 0 }); } @@ -64,8 +67,7 @@ protected boolean addMockHttpTransport() { @Override protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) { - Settings.Builder settings = Settings.builder().put(super.nodeSettings(nodeOrdinal, otherSettings)); - return settings.build(); + return Settings.builder().put(super.nodeSettings(nodeOrdinal, otherSettings)).build(); } @Override @@ -116,10 +118,6 @@ private void assertCorrectResponse(Map responseMap) { } private String getTestIndexMapping() { - if (isRunningAgainstOldCluster()) { - return "{\"properties\":{\"" + SPARSE_VECTOR_FIELD + "\":{\"type\":\"sparse_vector\"}}}"; - } - String testPruningConfigMapping = "\"pruning_config\":{\"tokens_freq_ratio_threshold\":" + TEST_PRUNING_TOKENS_FREQ_THRESHOLD + ",\"tokens_weight_threshold\":" @@ -132,11 +130,11 @@ private String getTestIndexMapping() { return "{\"properties\":{\"" + SPARSE_VECTOR_FIELD + "\":{\"type\":\"sparse_vector\"" + indexOptionsString + "}}}"; } - private boolean isRunningAgainstOldCluster() { - return false; - } - private List getTestExpectedDocIds() { + if (overrideQueryPruningConfig) { + return EXPECTED_DOC_IDS_WITH_QUERY_OVERRIDE; + } + if (testQueryShouldNotPrune) { // query overrides prune = false in all cases return EXPECTED_DOC_IDS_WITHOUT_PRUNING; @@ -159,13 +157,17 @@ private Response performSearch(String source) throws IOException { private String getBuilderForSearch() { boolean shouldUseDefaultTokens = (testQueryShouldNotPrune == false && testHasIndexOptions == false); - SparseVectorQueryBuilder queryBuilder = new SparseVectorQueryBuilder( + TokenPruningConfig queryPruningConfig = overrideQueryPruningConfig + ? new TokenPruningConfig(3f, 0.5f, true) + : null; + + SparseVectorQueryBuilder queryBuilder = new SparseVectorQueryBuilder( SPARSE_VECTOR_FIELD, shouldUseDefaultTokens ? 
SEARCH_WEIGHTED_TOKENS_WITH_DEFAULTS : SEARCH_WEIGHTED_TOKENS, null, null, - testQueryShouldNotPrune ? false : null, - null + overrideQueryPruningConfig ? Boolean.TRUE : (testQueryShouldNotPrune ? false : null), + queryPruningConfig ); return "{\"query\":" + Strings.toString(queryBuilder) + "}"; @@ -224,7 +226,9 @@ private String getDescriptiveTestType() { private static final List EXPECTED_DOC_IDS_WITHOUT_PRUNING = List.of("1", "3", "2"); - private static final List EXPECTED_DOC_IDS_WITH_PRUNING = List.of("1"); + private static final List EXPECTED_DOC_IDS_WITH_PRUNING = List.of(); private static final List EXPECTED_DOC_IDS_WITH_DEFAULT_PRUNING = List.of("2"); + + private static final List EXPECTED_DOC_IDS_WITH_QUERY_OVERRIDE = List.of(); } From 925173c93035641540c7319806b48163eebc23ae Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Mon, 2 Jun 2025 21:18:55 +0000 Subject: [PATCH 78/86] [CI] Auto commit changes from spotless --- .../xpack/core/ml/search/SparseVectorIndexOptionsIT.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java b/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java index af9998bc872b8..e891619095766 100644 --- a/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java +++ b/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java @@ -157,11 +157,9 @@ private Response performSearch(String source) throws IOException { private String getBuilderForSearch() { boolean shouldUseDefaultTokens = (testQueryShouldNotPrune == false && testHasIndexOptions == false); - TokenPruningConfig queryPruningConfig = overrideQueryPruningConfig - ? 
new TokenPruningConfig(3f, 0.5f, true) - : null; + TokenPruningConfig queryPruningConfig = overrideQueryPruningConfig ? new TokenPruningConfig(3f, 0.5f, true) : null; - SparseVectorQueryBuilder queryBuilder = new SparseVectorQueryBuilder( + SparseVectorQueryBuilder queryBuilder = new SparseVectorQueryBuilder( SPARSE_VECTOR_FIELD, shouldUseDefaultTokens ? SEARCH_WEIGHTED_TOKENS_WITH_DEFAULTS : SEARCH_WEIGHTED_TOKENS, null, From 04a597ed124d0d813c59a5fe9b1e266b31be6d56 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Mon, 2 Jun 2025 19:05:12 -0400 Subject: [PATCH 79/86] check for supported index version index_options --- .../search/SparseVectorQueryBuilderTests.java | 34 +++++++++++++++---- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java index b3dae7114706c..c49969d38925e 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java @@ -63,12 +63,21 @@ public class SparseVectorQueryBuilderTests extends AbstractQueryTestCase WEIGHTED_TOKENS = List.of(new WeightedToken("foo", .42f)); private static final int NUM_TOKENS = WEIGHTED_TOKENS.size(); + private final IndexVersion indexVersionToTest; @Retention(RetentionPolicy.RUNTIME) @Target(ElementType.METHOD) public @interface InjectSparseVectorIndexOptions { } + public SparseVectorQueryBuilderTests() { + // The sparse_vector field is not supported on versions 8.0 to 8.10. Because of this we'll only allow + // index versions after its reintroduction. + indexVersionToTest = randomBoolean() + ? 
IndexVersion.current() + : IndexVersionUtils.randomVersionBetween(random(), IndexVersions.NEW_SPARSE_VECTOR, IndexVersion.current()); + } + @Override protected SparseVectorQueryBuilder doCreateTestQueryBuilder() { TokenPruningConfig tokenPruningConfig = randomBoolean() @@ -115,12 +124,7 @@ protected Collection> getPlugins() { @Override protected Settings createTestIndexSettings() { - // The sparse_vector field is not supported on versions 8.0 to 8.10. Because of this we'll only allow - // index versions after its reintroduction. - final IndexVersion indexVersionCreated = randomBoolean() - ? IndexVersion.current() - : IndexVersionUtils.randomVersionBetween(random(), IndexVersions.NEW_SPARSE_VECTOR, IndexVersion.current()); - return Settings.builder().put(IndexMetadata.SETTING_VERSION_CREATED, indexVersionCreated).build(); + return Settings.builder().put(IndexMetadata.SETTING_VERSION_CREATED, indexVersionToTest).build(); } @Override @@ -171,6 +175,10 @@ private String getTestSparseVectorIndexMapping() { } private boolean currentTestHasIndexOptions() { + if (indexVersionSupportsIndexOptions() == false) { + return false; + } + Class clazz = this.getClass(); Class injectSparseVectorIndexOptions = InjectSparseVectorIndexOptions.class; @@ -182,6 +190,18 @@ private boolean currentTestHasIndexOptions() { } } + private boolean indexVersionSupportsIndexOptions() { + if (indexVersionToTest.onOrAfter(IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT)) { + return true; + } + + if (indexVersionToTest.between(IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT_BACKPORT_8_X, IndexVersions.UPGRADE_TO_LUCENE_10_0_0)) { + return true; + } + + return false; + } + @Override protected void doAssertLuceneQuery(SparseVectorQueryBuilder queryBuilder, Query query, SearchExecutionContext context) { assertThat(query, instanceOf(SparseVectorQueryWrapper.class)); @@ -277,6 +297,8 @@ private void testDoToQuery(SparseVectorQueryBuilder queryBuilder, SearchExecutio assertTrue(query 
instanceof SparseVectorQueryWrapper); var sparseQuery = (SparseVectorQueryWrapper) query; + + // check if we have explicit pruning, or pruning via the index_options if (queryBuilder.shouldPruneTokens() || currentTestHasIndexOptions()) { // It's possible that all documents were pruned for aggressive pruning configurations assertTrue(sparseQuery.getTermsQuery() instanceof BooleanQuery || sparseQuery.getTermsQuery() instanceof MatchNoDocsQuery); From bbcd3093f89af81283f3ceb830688ee1164e9da1 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Mon, 2 Jun 2025 23:12:50 +0000 Subject: [PATCH 80/86] [CI] Auto commit changes from spotless --- .../core/ml/search/SparseVectorQueryBuilderTests.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java index c49969d38925e..e1b2f86d83d4c 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java @@ -74,8 +74,8 @@ public SparseVectorQueryBuilderTests() { // The sparse_vector field is not supported on versions 8.0 to 8.10. Because of this we'll only allow // index versions after its reintroduction. indexVersionToTest = randomBoolean() - ? IndexVersion.current() - : IndexVersionUtils.randomVersionBetween(random(), IndexVersions.NEW_SPARSE_VECTOR, IndexVersion.current()); + ? 
IndexVersion.current() + : IndexVersionUtils.randomVersionBetween(random(), IndexVersions.NEW_SPARSE_VECTOR, IndexVersion.current()); } @Override @@ -195,7 +195,10 @@ private boolean indexVersionSupportsIndexOptions() { return true; } - if (indexVersionToTest.between(IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT_BACKPORT_8_X, IndexVersions.UPGRADE_TO_LUCENE_10_0_0)) { + if (indexVersionToTest.between( + IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT_BACKPORT_8_X, + IndexVersions.UPGRADE_TO_LUCENE_10_0_0 + )) { return true; } From c4c65b9ecf8c57b198b6cb550a2289bc1fc1e439 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Wed, 4 Jun 2025 09:07:52 -0400 Subject: [PATCH 81/86] cleanups and refactoring --- .../vectors/SparseVectorFieldMapper.java | 45 ------- .../vectors/SparseVectorFieldMapperTests.java | 47 +++++++ .../ml/search/SparseVectorIndexOptionsIT.java | 122 ++++++++++-------- .../ml/search/SparseVectorQueryBuilder.java | 2 +- .../search/SparseVectorQueryBuilderTests.java | 3 +- .../SemanticTextHighlighterTests.java | 10 +- .../xpack/inference/highlight/queries.json | 14 -- 7 files changed, 124 insertions(+), 119 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index 9ebd13c38018e..40ae0170c9ca0 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -507,50 +507,5 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws builder.endObject(); return builder; } - - /* - public static Boolean parseIndexOptionsPruneValue(Map indexOptionsMap) { - Object shouldPrune = indexOptionsMap.get(IndexOptions.PRUNE_FIELD_NAME.getPreferredName()); - if (shouldPrune == null) { - return null; - } - - if (shouldPrune instanceof 
Boolean boolValue) { - return boolValue; - } - - throw new MapperParsingException( - "[" + SPARSE_VECTOR_INDEX_OPTIONS + "] field [" + PRUNE_FIELD_NAME.getPreferredName() + "] should be true or false" - ); - } - */ - - /* - public static TokenPruningConfig parseIndexOptionsPruningConfig(Boolean prune, Map indexOptionsMap) { - Object pruningConfiguration = indexOptionsMap.get(IndexOptions.PRUNING_CONFIG_FIELD_NAME.getPreferredName()); - if (pruningConfiguration == null) { - return null; - } - - if (prune == null || prune == false) { - throw new MapperParsingException( - "[" - + SPARSE_VECTOR_INDEX_OPTIONS - + "] field [" - + PRUNING_CONFIG_FIELD_NAME.getPreferredName() - + "] should only be set if [" - + PRUNE_FIELD_NAME.getPreferredName() - + "] is set to true" - ); - } - - Map pruningConfigurationMap = XContentMapValues.nodeMapValue( - pruningConfiguration, - PRUNING_CONFIG_FIELD_NAME.getPreferredName() - ); - - return TokenPruningConfig.parseFromMap(pruningConfigurationMap); - } - */ } } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index 5ed08cdb8f8b1..a8986d5f89054 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -32,6 +32,7 @@ import org.elasticsearch.search.lookup.Source; import org.elasticsearch.test.index.IndexVersionUtils; import org.elasticsearch.xcontent.XContentBuilder; +import org.elasticsearch.xcontent.XContentParseException; import org.hamcrest.Matchers; import org.junit.AssumptionViolatedException; @@ -251,6 +252,8 @@ public void testPruneMustBeBoolean() { b.endObject(); }))); assertThat(e.getMessage(), containsString("[index_options] failed to parse field [prune]")); + assertThat(e.getCause().getCause(), 
instanceOf(UnsupportedOperationException.class)); + assertThat(e.getCause().getCause().getMessage(), containsString("use text() instead")); } public void testPruningConfigurationIsMap() { @@ -262,6 +265,11 @@ public void testPruningConfigurationIsMap() { b.endObject(); }))); assertThat(e.getMessage(), containsString("[index_options] pruning_config doesn't support values of type:")); + assertThat(e.getCause(), instanceOf(XContentParseException.class)); + assertThat( + e.getCause().getMessage(), + containsString("[index_options] pruning_config doesn't support values of type: VALUE_STRING") + ); } public void testWithIndexOptionsPruningConfigPruneRequired() throws Exception { @@ -277,6 +285,11 @@ public void testWithIndexOptionsPruningConfigPruneRequired() throws Exception { b.endObject(); }))); assertThat(eTestPruneIsFalse.getMessage(), containsString("[index_options] failed to parse field [pruning_config]")); + assertThat(eTestPruneIsFalse.getCause().getCause().getCause(), instanceOf(IllegalArgumentException.class)); + assertThat( + eTestPruneIsFalse.getCause().getCause().getCause().getMessage(), + containsString("[index_options] field [pruning_config] should only be set if [prune] is set to true") + ); Exception eTestPruneIsMissing = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { b.field("type", "sparse_vector"); @@ -291,6 +304,11 @@ public void testWithIndexOptionsPruningConfigPruneRequired() throws Exception { eTestPruneIsMissing.getMessage(), containsString("Failed to parse mapping: Failed to build [index_options] after last required field arrived") ); + assertThat(eTestPruneIsMissing.getCause().getCause(), instanceOf(IllegalArgumentException.class)); + assertThat( + eTestPruneIsMissing.getCause().getCause().getMessage(), + containsString("[index_options] field [pruning_config] should only be set if [prune] is set to true") + ); } public void testTokensFreqRatioCorrect() { @@ -307,6 +325,13 @@ public void 
testTokensFreqRatioCorrect() { eTestInteger.getMessage(), containsString("Failed to parse mapping: [0:0] [index_options] failed to parse field [pruning_config]") ); + assertThat(eTestInteger.getCause().getCause(), instanceOf(XContentParseException.class)); + assertThat( + eTestInteger.getCause().getCause().getMessage(), + containsString("[pruning_config] failed to parse field [tokens_freq_ratio_threshold]") + ); + assertThat(eTestInteger.getCause().getCause().getCause(), instanceOf(NumberFormatException.class)); + assertThat(eTestInteger.getCause().getCause().getCause().getMessage(), containsString("For input string: \"notaninteger\"")); Exception eTestRangeLower = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { b.field("type", "sparse_vector"); @@ -318,6 +343,11 @@ public void testTokensFreqRatioCorrect() { b.endObject(); }))); assertThat(eTestRangeLower.getMessage(), containsString("[index_options] failed to parse field [pruning_config]")); + assertThat(eTestRangeLower.getCause().getCause().getCause(), instanceOf(IllegalArgumentException.class)); + assertThat( + eTestRangeLower.getCause().getCause().getCause().getMessage(), + containsString("[tokens_freq_ratio_threshold] must be between [1] and [100], got -2.0") + ); Exception eTestRangeHigher = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { b.field("type", "sparse_vector"); @@ -329,6 +359,11 @@ public void testTokensFreqRatioCorrect() { b.endObject(); }))); assertThat(eTestRangeHigher.getMessage(), containsString("[index_options] failed to parse field [pruning_config]")); + assertThat(eTestRangeHigher.getCause().getCause().getCause(), instanceOf(IllegalArgumentException.class)); + assertThat( + eTestRangeHigher.getCause().getCause().getCause().getMessage(), + containsString("[tokens_freq_ratio_threshold] must be between [1] and [100], got 101.0") + ); } public void testTokensWeightThresholdCorrect() { @@ -342,6 +377,8 @@ public 
void testTokensWeightThresholdCorrect() { b.endObject(); }))); assertThat(eTestDouble.getMessage(), containsString("[index_options] failed to parse field [pruning_config]")); + assertThat(eTestDouble.getCause().getCause().getCause(), instanceOf(NumberFormatException.class)); + assertThat(eTestDouble.getCause().getCause().getCause().getMessage(), containsString("For input string: \"notadouble\"")); Exception eTestRangeLower = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { b.field("type", "sparse_vector"); @@ -353,6 +390,11 @@ public void testTokensWeightThresholdCorrect() { b.endObject(); }))); assertThat(eTestRangeLower.getMessage(), containsString("[index_options] failed to parse field [pruning_config]")); + assertThat(eTestRangeLower.getCause().getCause().getCause(), instanceOf(IllegalArgumentException.class)); + assertThat( + eTestRangeLower.getCause().getCause().getCause().getMessage(), + containsString("[tokens_weight_threshold] must be between 0 and 1") + ); Exception eTestRangeHigher = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> { b.field("type", "sparse_vector"); @@ -364,6 +406,11 @@ public void testTokensWeightThresholdCorrect() { b.endObject(); }))); assertThat(eTestRangeHigher.getMessage(), containsString("[index_options] failed to parse field [pruning_config]")); + assertThat(eTestRangeHigher.getCause().getCause().getCause(), instanceOf(IllegalArgumentException.class)); + assertThat( + eTestRangeHigher.getCause().getCause().getCause().getMessage(), + containsString("[tokens_weight_threshold] must be between 0 and 1") + ); } public void testStoreIsNotUpdateable() throws IOException { diff --git a/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java b/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java index e891619095766..dcbb5d12af1e2 
100644 --- a/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java +++ b/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java @@ -9,16 +9,17 @@ import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; -import org.apache.http.HttpStatus; -import org.elasticsearch.client.Request; -import org.elasticsearch.client.Response; +import org.elasticsearch.action.search.SearchRequest; +import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.common.Strings; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.common.xcontent.XContentHelper; import org.elasticsearch.index.mapper.vectors.TokenPruningConfig; import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.rest.RestStatus; +import org.elasticsearch.search.SearchHit; +import org.elasticsearch.search.builder.SearchSourceBuilder; import org.elasticsearch.test.ESIntegTestCase; -import org.elasticsearch.xcontent.XContentType; +import org.elasticsearch.xcontent.XContentBuilder; +import org.elasticsearch.xcontent.XContentFactory; import org.elasticsearch.xpack.core.XPackClientPlugin; import org.hamcrest.Matchers; import org.junit.Before; @@ -60,16 +61,6 @@ public static Iterable parameters() throws Exception { return params; } - @Override - protected boolean addMockHttpTransport() { - return false; // enable http - } - - @Override - protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) { - return Settings.builder().put(super.nodeSettings(nodeOrdinal, otherSettings)).build(); - } - @Override protected Collection> nodePlugins() { return List.of(XPackClientPlugin.class); @@ -86,48 +77,76 @@ public void setup() { flushAndRefresh(TEST_INDEX_NAME); } - public void testSparseVectorTokenPruning() throws IOException { - Response response = performSearch(getBuilderForSearch()); - 
assertThat(response.getStatusLine().getStatusCode(), Matchers.equalTo(HttpStatus.SC_OK)); - final Map responseMap = XContentHelper.convertToMap( - XContentType.JSON.xContent(), - response.getEntity().getContent(), - true - ); - assertCorrectResponse(responseMap); + public void testSparseVectorTokenPruning() throws Exception { + assertBusy(() -> { + SearchResponse response = performSearch(getBuilderForSearch()); + try { + assertThat(response.status(), Matchers.equalTo(RestStatus.OK)); + assertCorrectResponse(response); + } finally { + response.decRef(); + } + }); } - @SuppressWarnings("unchecked") - private void assertCorrectResponse(Map responseMap) { + private void assertCorrectResponse(SearchResponse response) { List expectedIds = getTestExpectedDocIds(); - Map mapHits = (Map) responseMap.get("hits"); - Map mapHitsTotal = (Map) mapHits.get("total"); - int actualTotalHits = (int) mapHitsTotal.get("value"); - int numHitsExpected = expectedIds.size(); - - assertEquals(getAssertMessage("Search result total hits count mismatch"), numHitsExpected, actualTotalHits); + assertEquals( + getAssertMessage("Search result total hits count mismatch"), + expectedIds.size(), + response.getHits().getTotalHits().value() + ); - List> hits = (List>) mapHits.get("hits"); List actualDocIds = new ArrayList<>(); - for (Map doc : hits) { - actualDocIds.add((String) doc.get("_id")); + for (SearchHit doc : response.getHits().getHits()) { + actualDocIds.add(doc.getId()); } assertEquals(getAssertMessage("Result document ids mismatch"), expectedIds, actualDocIds); } private String getTestIndexMapping() { - String testPruningConfigMapping = "\"pruning_config\":{\"tokens_freq_ratio_threshold\":" - + TEST_PRUNING_TOKENS_FREQ_THRESHOLD - + ",\"tokens_weight_threshold\":" - + TEST_PRUNING_TOKENS_WEIGHT_THRESHOLD - + "}"; + try { + XContentBuilder docBuilder = XContentFactory.jsonBuilder(); + docBuilder.startObject(); + { + docBuilder.startObject("properties"); + { + 
docBuilder.startObject(SPARSE_VECTOR_FIELD); + { + docBuilder.field("type", "sparse_vector"); + addIndexFieldIndexOptions(docBuilder); + } + docBuilder.endObject(); + } + docBuilder.endObject(); + } + docBuilder.endObject(); + return Strings.toString(docBuilder); + } catch (IOException e) { + throw new AssertionError(e); + } + } - String pruningMappingString = testIndexShouldPrune ? "\"prune\":true," + testPruningConfigMapping : "\"prune\":false"; - String indexOptionsString = testHasIndexOptions ? ",\"index_options\":{" + pruningMappingString + "}" : ""; + private void addIndexFieldIndexOptions(XContentBuilder docBuilder) throws IOException { + if (testHasIndexOptions == false) { + return; + } - return "{\"properties\":{\"" + SPARSE_VECTOR_FIELD + "\":{\"type\":\"sparse_vector\"" + indexOptionsString + "}}}"; + docBuilder.startObject("index_options"); + if (testIndexShouldPrune) { + docBuilder.field("prune", true); + docBuilder.startObject("pruning_config"); + { + docBuilder.field("tokens_freq_ratio_threshold", TEST_PRUNING_TOKENS_FREQ_THRESHOLD); + docBuilder.field("tokens_weight_threshold", TEST_PRUNING_TOKENS_WEIGHT_THRESHOLD); + } + docBuilder.endObject(); + } else { + docBuilder.field("prune", false); + } + docBuilder.endObject(); } private List getTestExpectedDocIds() { @@ -149,26 +168,27 @@ private List getTestExpectedDocIds() { return EXPECTED_DOC_IDS_WITH_DEFAULT_PRUNING; } - private Response performSearch(String source) throws IOException { - Request request = new Request("GET", TEST_INDEX_NAME + "/_search"); - request.setJsonEntity(source); - return getRestClient().performRequest(request); + private SearchResponse performSearch(SearchSourceBuilder source) { + SearchRequest searchRequest = new SearchRequest(TEST_INDEX_NAME); + searchRequest.source(source); + return client().search(searchRequest).actionGet(); } - private String getBuilderForSearch() { + private SearchSourceBuilder getBuilderForSearch() throws IOException { boolean shouldUseDefaultTokens = 
(testQueryShouldNotPrune == false && testHasIndexOptions == false); TokenPruningConfig queryPruningConfig = overrideQueryPruningConfig ? new TokenPruningConfig(3f, 0.5f, true) : null; + Boolean shouldPrune = overrideQueryPruningConfig ? Boolean.TRUE : (testQueryShouldNotPrune ? Boolean.FALSE : null); SparseVectorQueryBuilder queryBuilder = new SparseVectorQueryBuilder( SPARSE_VECTOR_FIELD, shouldUseDefaultTokens ? SEARCH_WEIGHTED_TOKENS_WITH_DEFAULTS : SEARCH_WEIGHTED_TOKENS, null, null, - overrideQueryPruningConfig ? Boolean.TRUE : (testQueryShouldNotPrune ? false : null), + shouldPrune, queryPruningConfig ); - return "{\"query\":" + Strings.toString(queryBuilder) + "}"; + return new SearchSourceBuilder().query(queryBuilder); } private String getAssertMessage(String message) { diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java index 6c50b7296d901..e51928096fb73 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilder.java @@ -357,7 +357,7 @@ private TokenPruningConfig getTokenPruningConfigForQuery(MappedFieldType ft, Sea } // if we are not on a supported index version, do not prune by default - // nor do we check the index options + // nor do we check the index options, so, we'll return a pruning config only if the query specifies it. 
if (context.indexVersionCreated().onOrAfter(IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT) == false && context.indexVersionCreated() .between( diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java index e1b2f86d83d4c..ee52bfc92b1f3 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java @@ -26,6 +26,7 @@ import org.elasticsearch.common.Strings; import org.elasticsearch.common.compress.CompressedXContent; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.core.CheckedConsumer; import org.elasticsearch.index.IndexVersion; import org.elasticsearch.index.IndexVersions; import org.elasticsearch.index.mapper.MapperService; @@ -224,7 +225,7 @@ protected void doAssertLuceneQuery(SparseVectorQueryBuilder queryBuilder, Query } } - private void withSearchIndex(Consumer consumer) throws IOException { + private void withSearchIndex(CheckedConsumer consumer) throws IOException { try (Directory directory = newDirectory(); RandomIndexWriter iw = new RandomIndexWriter(random(), directory)) { Document document = new Document(); document.add(new FloatDocValuesField(SPARSE_VECTOR_FIELD, 1.0f)); diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java index 0b238ff990efe..ff57282921952 100644 --- a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java +++ 
b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java @@ -134,22 +134,19 @@ public void testSparseVector() throws Exception { List tokens = readSparseVector(queryMap.get("embeddings")); var fieldType = (SemanticTextFieldMapper.SemanticTextFieldType) mapperService.mappingLookup().getFieldType(SEMANTIC_FIELD_ELSER); - Boolean doNotPruneTokens = randomBoolean() ? false : null; - SparseVectorQueryBuilder sparseQuery = new SparseVectorQueryBuilder( fieldType.getEmbeddingsField().fullPath(), tokens, null, null, - doNotPruneTokens, + false, null ); NestedQueryBuilder nestedQueryBuilder = new NestedQueryBuilder(fieldType.getChunksField().fullPath(), sparseQuery, ScoreMode.Max); var shardRequest = createShardSearchRequest(nestedQueryBuilder); var sourceToParse = new SourceToParse("0", readSampleDoc(useLegacyFormat), XContentType.JSON); - String expectedScoringDocsKey = doNotPruneTokens == null ? "expected_by_score_with_pruning" : "expected_by_score"; - String[] expectedScorePassages = ((List) queryMap.get(expectedScoringDocsKey)).toArray(String[]::new); + String[] expectedScorePassages = ((List) queryMap.get("expected_by_score")).toArray(String[]::new); for (int i = 0; i < expectedScorePassages.length; i++) { assertHighlightOneDoc( mapperService, @@ -162,8 +159,7 @@ public void testSparseVector() throws Exception { ); } - String expectedOffsetDocsKey = doNotPruneTokens == null ? 
"expected_by_offset_with_pruning" : "expected_by_offset"; - String[] expectedOffsetPassages = ((List) queryMap.get(expectedOffsetDocsKey)).toArray(String[]::new); + String[] expectedOffsetPassages = ((List) queryMap.get("expected_by_offset")).toArray(String[]::new); assertHighlightOneDoc( mapperService, shardRequest, diff --git a/x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/queries.json b/x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/queries.json index 6aa2788a864c3..a41d5fafaec41 100644 --- a/x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/queries.json +++ b/x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/queries.json @@ -462,20 +462,6 @@ "After the marshland between the river Seine and its slower 'dead arm' to its north was filled in from around the 10th century, Paris's cultural centre began to move to the Right Bank. In 1137, a new city marketplace (today's Les Halles) replaced the two smaller ones on the Île de la Cité and Place de Grève (Place de l'Hôtel de Ville). The latter location housed the headquarters of Paris's river trade corporation, an organisation that later became, unofficially (although formally in later years), Paris's first municipal government.\n\n\nIn the late 12th century, Philip Augustus extended the Louvre fortress to defend the city against river invasions from the west, gave the city its first walls between 1190 and 1215, rebuilt its bridges to either side of its central island, and paved its main thoroughfares. In 1190, he transformed Paris's former cathedral school into a student-teacher corporation that would become the University of Paris and would draw students from all of Europe.\n\n\nWith 200,000 inhabitants in 1328, Paris, then already the capital of France, was the most populous city of Europe. By comparison, London in 1300 had 80,000 inhabitants. 
By the early fourteenth century, so much filth had collected inside urban Europe that French and Italian cities were naming streets after human waste. In medieval Paris, several street names were inspired by merde, the French word for \"shit\".\n\n\n", "\nDuring the Hundred Years' War, Paris was occupied by England-friendly Burgundian forces from 1418, before being occupied outright by the English when Henry V of England entered the French capital in 1420; in spite of a 1429 effort by Joan of Arc to liberate the city, it would remain under English occupation until 1436.\n\n\nIn the late 16th-century French Wars of Religion, Paris was a stronghold of the Catholic League, the organisers of 24 August 1572 St. Bartholomew's Day massacre in which thousands of French Protestants were killed. The conflicts ended when pretender to the throne Henry IV, after converting to Catholicism to gain entry to the capital, entered the city in 1594 to claim the crown of France. This king made several improvements to the capital during his reign: he completed the construction of Paris's first uncovered, sidewalk-lined bridge, the Pont Neuf, built a Louvre extension connecting it to the Tuileries Palace, and created the first Paris residential square, the Place Royale, now Place des Vosges. In spite of Henry IV's efforts to improve city circulation, the narrowness of Paris's streets was a contributing factor in his assassination near Les Halles marketplace in 1610.\n\n\nDuring the 17th century, Cardinal Richelieu, chief minister of Louis XIII, was determined to make Paris the most beautiful city in Europe. He built five new bridges, a new chapel for the College of Sorbonne, and a palace for himself, the Palais-Cardinal. ", "Diderot and D'Alembert published their Encyclopédie in 1751, before the Montgolfier Brothers launched the first manned flight in a hot air balloon on 21 November 1783. 
Paris was the financial capital of continental Europe, as well the primary European centre for book publishing, fashion and the manufacture of fine furniture and luxury goods. On 22 October 1797, Paris was also the site of the first parachute jump in history, by Garnerin.\n\n\nIn the summer of 1789, Paris became the centre stage of the French Revolution. On 14 July, a mob seized the arsenal at the Invalides, acquiring thousands of guns, with which it stormed the Bastille, a principal symbol of royal authority. The first independent Paris Commune, or city council, met in the Hôtel de Ville and elected a Mayor, the astronomer Jean Sylvain Bailly, on 15 July.\n\n\nLouis XVI and the royal family were brought to Paris and incarcerated in the Tuileries Palace. In 1793, as the revolution turned increasingly radical, the king, queen and mayor were beheaded by guillotine in the Reign of Terror, along with more than 16,000 others throughout France. The property of the aristocracy and the church was nationalised, and the city's churches were closed, sold or demolished. A succession of revolutionary factions ruled Paris until 9 November 1799 (coup d'état du 18 brumaire), when Napoleon Bonaparte seized power as First Consul.\n\n\n" - ], - "expected_by_score_with_pruning": [ - "\nThe Parisii, a sub-tribe of the Celtic Senones, inhabited the Paris area from around the middle of the 3rd century BC. One of the area's major north–south trade routes crossed the Seine on the Île de la Cité, which gradually became an important trading centre. The Parisii traded with many river towns (some as far away as the Iberian Peninsula) and minted their own coins.\n\n\nThe Romans conquered the Paris Basin in 52 BC and began their settlement on Paris's Left Bank. The Roman town was originally called Lutetia (more fully, Lutetia Parisiorum, \"Lutetia of the Parisii\", modern French Lutèce). 
It became a prosperous city with a forum, baths, temples, theatres, and an amphitheatre.\n\n\nBy the end of the Western Roman Empire, the town was known as Parisius, a Latin name that would later become Paris in French. Christianity was introduced in the middle of the 3rd century AD by Saint Denis, the first Bishop of Paris: according to legend, when he refused to renounce his faith before the Roman occupiers, he was beheaded on the hill which became known as Mons Martyrum (Latin \"Hill of Martyrs\"), later \"Montmartre\", from where he walked headless to the north of the city; the place where he fell and was buried became an important religious shrine, the Basilica of Saint-Denis, and many French kings are buried there.\n\n\nClovis the Frank, the first king of the Merovingian dynasty, made the city his capital from 508. ", - "Clovis the Frank, the first king of the Merovingian dynasty, made the city his capital from 508. As the Frankish domination of Gaul began, there was a gradual immigration by the Franks to Paris and the Parisian Francien dialects were born. Fortification of the Île de la Cité failed to avert sacking by Vikings in 845, but Paris's strategic importance—with its bridges preventing ships from passing—was established by successful defence in the Siege of Paris (885–886), for which the then Count of Paris (comte de Paris), Odo of France, was elected king of West Francia. From the Capetian dynasty that began with the 987 election of Hugh Capet, Count of Paris and Duke of the Franks (duc des Francs), as king of a unified West Francia, Paris gradually became the largest and most prosperous city in France.\n\n\nHigh and Late Middle Ages to Louis XIV\n\nBy the end of the 12th century, Paris had become the political, economic, religious, and cultural capital of France. The Palais de la Cité, the royal residence, was located at the western end of the Île de la Cité. 
In 1163, during the reign of Louis VII, Maurice de Sully, bishop of Paris, undertook the construction of the Notre Dame Cathedral at its eastern extremity.\n\n\nAfter the marshland between the river Seine and its slower 'dead arm' to its north was filled in from around the 10th century, Paris's cultural centre began to move to the Right Bank. ", - "Open from 1804, these were the cemeteries of Père Lachaise, Montmartre, Montparnasse, and later Passy. New suburban cemeteries were created in the early 20th century: The largest of these are the Cimetière parisien de Saint-Ouen, the Cimetière parisien de Pantin, also known as Cimetière parisien de Pantin-Bobigny, the Cimetière parisien d'Ivry, and the Cimetière parisien de Bagneux. Famous people buried in Parisian cemeteries include Oscar Wilde, Frédéric Chopin, Jim Morrison, Édith Piaf and Serge Gainsbourg.\n\n\nEducation\n\nParis is the département with the highest proportion of highly educated people. In 2009, around 40 percent of Parisians held a licence-level diploma or higher, the highest proportion in France. 13 percent have no diploma, the third-lowest percentage in France. Education in Paris and the Île-de-France region employs approximately 330,000 people, 170,000 of whom are teachers and professors, teaching approximately 2.9 million students in around 9,000 primary, secondary, and higher education schools and institutions.\n\n\nThe University of Paris, founded in the 12th century, is often called the Sorbonne after one of its original medieval colleges. In 1970, it was broken up into thirteen autonomous universities, following the student demonstrations in 1968. Most of the campuses today are in the Latin Quarter where the old university was located, while others are scattered around the city and the suburbs.\n\n\nThe Paris region hosts France's highest concentration of the grandes écoles – 55 specialised centres of higher-education outside or inside the public university structure. 
", - "\nThe population of Paris had dropped by 100,000 during the Revolution, but after 1799 it surged with 160,000 new residents, reaching 660,000 by 1815. Napoleon replaced the elected government of Paris with a prefect that reported directly to him. He began erecting monuments to military glory, including the Arc de Triomphe, and improved the neglected infrastructure of the city with new fountains, the Canal de l'Ourcq, Père Lachaise Cemetery and the city's first metal bridge, the Pont des Arts.\n\n\nDuring the Restoration, the bridges and squares of Paris were returned to their pre-Revolution names; the July Revolution in 1830 (commemorated by the July Column on the Place de la Bastille) brought to power a constitutional monarch, Louis Philippe I. The first railway line to Paris opened in 1837, beginning a new period of massive migration from the provinces to the city. In 1848, Louis-Philippe was overthrown by a popular uprising in the streets of Paris. His successor, Napoleon III, alongside the newly appointed prefect of the Seine, Georges-Eugène Haussmann, launched a huge public works project to build wide new boulevards, a new opera house, a central market, new aqueducts, sewers and parks, including the Bois de Boulogne and Bois de Vincennes. In 1860, Napoleon III annexed the surrounding towns and created eight new arrondissements, expanding Paris to its current limits.\n\n\nDuring the Franco-Prussian War (1870–1871), Paris was besieged by the Prussian Army. ", - "One of the most notable hospitals is the Hôtel-Dieu, founded in 651, the oldest hospital in Paris and the oldest worldwide still operating, although the current building is the product of a reconstruction of 1877. 
Other hospitals include Pitié-Salpêtrière Hospital, one of the largest in Europe, Hôpital Cochin, Bichat–Claude Bernard Hospital, Hôpital Européen Georges-Pompidou, Bicêtre Hospital, Beaujon Hospital, the Curie Institute, Lariboisière Hospital, Necker–Enfants Malades Hospital, Hôpital Saint-Louis, Hôpital de la Charité and the American Hospital of Paris.\n\n\nInternational relations\n\nInternational organisations\n\nThe United Nations Educational, Scientific and Cultural Organization (UNESCO) has had its headquarters in Paris since November 1958. Paris is also the home of the Organisation for Economic Co-operation and Development (OECD). Paris hosts the headquarters of the European Space Agency, the International Energy Agency, European Securities and Markets Authority and the European Banking Authority.\n\n\nTwin towns – sister cities\n\nSince April 1956, Paris is exclusively and reciprocally twinned with:\n\n\nOther relationships\n\nParis has agreements of friendship and co-operation with:\n\n\nSee also\n\nNotes\n\nReferences\n\nCitations\n\nSources\n\nFurther reading\n\nExternal links\n\n" - ], - "expected_by_offset_with_pruning": [ - "\nThe Parisii, a sub-tribe of the Celtic Senones, inhabited the Paris area from around the middle of the 3rd century BC. One of the area's major north–south trade routes crossed the Seine on the Île de la Cité, which gradually became an important trading centre. The Parisii traded with many river towns (some as far away as the Iberian Peninsula) and minted their own coins.\n\n\nThe Romans conquered the Paris Basin in 52 BC and began their settlement on Paris's Left Bank. The Roman town was originally called Lutetia (more fully, Lutetia Parisiorum, \"Lutetia of the Parisii\", modern French Lutèce). It became a prosperous city with a forum, baths, temples, theatres, and an amphitheatre.\n\n\nBy the end of the Western Roman Empire, the town was known as Parisius, a Latin name that would later become Paris in French. 
Christianity was introduced in the middle of the 3rd century AD by Saint Denis, the first Bishop of Paris: according to legend, when he refused to renounce his faith before the Roman occupiers, he was beheaded on the hill which became known as Mons Martyrum (Latin \"Hill of Martyrs\"), later \"Montmartre\", from where he walked headless to the north of the city; the place where he fell and was buried became an important religious shrine, the Basilica of Saint-Denis, and many French kings are buried there.\n\n\nClovis the Frank, the first king of the Merovingian dynasty, made the city his capital from 508. ", - "Clovis the Frank, the first king of the Merovingian dynasty, made the city his capital from 508. As the Frankish domination of Gaul began, there was a gradual immigration by the Franks to Paris and the Parisian Francien dialects were born. Fortification of the Île de la Cité failed to avert sacking by Vikings in 845, but Paris's strategic importance—with its bridges preventing ships from passing—was established by successful defence in the Siege of Paris (885–886), for which the then Count of Paris (comte de Paris), Odo of France, was elected king of West Francia. From the Capetian dynasty that began with the 987 election of Hugh Capet, Count of Paris and Duke of the Franks (duc des Francs), as king of a unified West Francia, Paris gradually became the largest and most prosperous city in France.\n\n\nHigh and Late Middle Ages to Louis XIV\n\nBy the end of the 12th century, Paris had become the political, economic, religious, and cultural capital of France. The Palais de la Cité, the royal residence, was located at the western end of the Île de la Cité. 
In 1163, during the reign of Louis VII, Maurice de Sully, bishop of Paris, undertook the construction of the Notre Dame Cathedral at its eastern extremity.\n\n\nAfter the marshland between the river Seine and its slower 'dead arm' to its north was filled in from around the 10th century, Paris's cultural centre began to move to the Right Bank. ", - "\nThe population of Paris had dropped by 100,000 during the Revolution, but after 1799 it surged with 160,000 new residents, reaching 660,000 by 1815. Napoleon replaced the elected government of Paris with a prefect that reported directly to him. He began erecting monuments to military glory, including the Arc de Triomphe, and improved the neglected infrastructure of the city with new fountains, the Canal de l'Ourcq, Père Lachaise Cemetery and the city's first metal bridge, the Pont des Arts.\n\n\nDuring the Restoration, the bridges and squares of Paris were returned to their pre-Revolution names; the July Revolution in 1830 (commemorated by the July Column on the Place de la Bastille) brought to power a constitutional monarch, Louis Philippe I. The first railway line to Paris opened in 1837, beginning a new period of massive migration from the provinces to the city. In 1848, Louis-Philippe was overthrown by a popular uprising in the streets of Paris. His successor, Napoleon III, alongside the newly appointed prefect of the Seine, Georges-Eugène Haussmann, launched a huge public works project to build wide new boulevards, a new opera house, a central market, new aqueducts, sewers and parks, including the Bois de Boulogne and Bois de Vincennes. In 1860, Napoleon III annexed the surrounding towns and created eight new arrondissements, expanding Paris to its current limits.\n\n\nDuring the Franco-Prussian War (1870–1871), Paris was besieged by the Prussian Army. ", - "Open from 1804, these were the cemeteries of Père Lachaise, Montmartre, Montparnasse, and later Passy. 
New suburban cemeteries were created in the early 20th century: The largest of these are the Cimetière parisien de Saint-Ouen, the Cimetière parisien de Pantin, also known as Cimetière parisien de Pantin-Bobigny, the Cimetière parisien d'Ivry, and the Cimetière parisien de Bagneux. Famous people buried in Parisian cemeteries include Oscar Wilde, Frédéric Chopin, Jim Morrison, Édith Piaf and Serge Gainsbourg.\n\n\nEducation\n\nParis is the département with the highest proportion of highly educated people. In 2009, around 40 percent of Parisians held a licence-level diploma or higher, the highest proportion in France. 13 percent have no diploma, the third-lowest percentage in France. Education in Paris and the Île-de-France region employs approximately 330,000 people, 170,000 of whom are teachers and professors, teaching approximately 2.9 million students in around 9,000 primary, secondary, and higher education schools and institutions.\n\n\nThe University of Paris, founded in the 12th century, is often called the Sorbonne after one of its original medieval colleges. In 1970, it was broken up into thirteen autonomous universities, following the student demonstrations in 1968. Most of the campuses today are in the Latin Quarter where the old university was located, while others are scattered around the city and the suburbs.\n\n\nThe Paris region hosts France's highest concentration of the grandes écoles – 55 specialised centres of higher-education outside or inside the public university structure. ", - "One of the most notable hospitals is the Hôtel-Dieu, founded in 651, the oldest hospital in Paris and the oldest worldwide still operating, although the current building is the product of a reconstruction of 1877. 
Other hospitals include Pitié-Salpêtrière Hospital, one of the largest in Europe, Hôpital Cochin, Bichat–Claude Bernard Hospital, Hôpital Européen Georges-Pompidou, Bicêtre Hospital, Beaujon Hospital, the Curie Institute, Lariboisière Hospital, Necker–Enfants Malades Hospital, Hôpital Saint-Louis, Hôpital de la Charité and the American Hospital of Paris.\n\n\nInternational relations\n\nInternational organisations\n\nThe United Nations Educational, Scientific and Cultural Organization (UNESCO) has had its headquarters in Paris since November 1958. Paris is also the home of the Organisation for Economic Co-operation and Development (OECD). Paris hosts the headquarters of the European Space Agency, the International Energy Agency, European Securities and Markets Authority and the European Banking Authority.\n\n\nTwin towns – sister cities\n\nSince April 1956, Paris is exclusively and reciprocally twinned with:\n\n\nOther relationships\n\nParis has agreements of friendship and co-operation with:\n\n\nSee also\n\nNotes\n\nReferences\n\nCitations\n\nSources\n\nFurther reading\n\nExternal links\n\n" ] } } From 88fc1f4a30698f444947f678e51c5c897fac4547 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Wed, 4 Jun 2025 13:18:09 +0000 Subject: [PATCH 82/86] [CI] Auto commit changes from spotless --- .../xpack/core/ml/search/SparseVectorQueryBuilderTests.java | 1 - 1 file changed, 1 deletion(-) diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java index ee52bfc92b1f3..3b7c77b7e7916 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java @@ -52,7 +52,6 @@ import java.util.ArrayList; import java.util.Collection; import 
java.util.List; -import java.util.function.Consumer; import static org.elasticsearch.xpack.core.ml.search.SparseVectorQueryBuilder.QUERY_VECTOR_FIELD; import static org.hamcrest.CoreMatchers.instanceOf; From c0704dfe2f4f13be9f0761d64a943ad53c5f11e2 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Wed, 4 Jun 2025 09:36:08 -0400 Subject: [PATCH 83/86] clean SparseVectorQueryBuilderTests --- .../search/SparseVectorQueryBuilderTests.java | 163 +++++------------- .../test/multi_cluster/50_sparse_vector.yml | 58 ++++++- 2 files changed, 96 insertions(+), 125 deletions(-) diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java index 3b7c77b7e7916..ba6af5c96c91d 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/search/SparseVectorQueryBuilderTests.java @@ -44,10 +44,6 @@ import org.elasticsearch.xpack.core.ml.inference.results.TextExpansionResults; import java.io.IOException; -import java.lang.annotation.ElementType; -import java.lang.annotation.Retention; -import java.lang.annotation.RetentionPolicy; -import java.lang.annotation.Target; import java.lang.reflect.Method; import java.util.ArrayList; import java.util.Collection; @@ -65,17 +61,20 @@ public class SparseVectorQueryBuilderTests extends AbstractQueryTestCase clazz = this.getClass(); - Class injectSparseVectorIndexOptions = InjectSparseVectorIndexOptions.class; - - try { - Method method = clazz.getMethod(this.getTestName()); - return method.isAnnotationPresent(injectSparseVectorIndexOptions); - } catch (NoSuchMethodException e) { - return false; - } - } - - private boolean indexVersionSupportsIndexOptions() { - if (indexVersionToTest.onOrAfter(IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT)) { 
- return true; - } - - if (indexVersionToTest.between( - IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT_BACKPORT_8_X, - IndexVersions.UPGRADE_TO_LUCENE_10_0_0 - )) { - return true; - } - - return false; + String mapping = Strings.toString(PutMappingRequest.simpleMapping(SPARSE_VECTOR_FIELD, "type=sparse_vector")); + mapperService.merge("_doc", new CompressedXContent(mapping), MapperService.MergeReason.MAPPING_UPDATE); } @Override @@ -242,15 +199,11 @@ private void withSearchIndex(CheckedConsumer { - try { - SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(); - QueryBuilder rewriteQuery = null; - rewriteQuery = rewriteQuery(queryBuilder, new SearchExecutionContext(context)); - assertNotNull(rewriteQuery.toQuery(context)); - assertTrue("query should be cacheable: " + queryBuilder.toString(), context.isCacheable()); - } catch (IOException e) { - throw new RuntimeException(e); - } + SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(); + QueryBuilder rewriteQuery = null; + rewriteQuery = rewriteQuery(queryBuilder, new SearchExecutionContext(context)); + assertNotNull(rewriteQuery.toQuery(context)); + assertTrue("query should be cacheable: " + queryBuilder.toString(), context.isCacheable()); }); } @@ -260,12 +213,8 @@ public void testCacheability() throws IOException { @Override public void testMustRewrite() throws IOException { withSearchIndex((context) -> { - try { - SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(); - queryBuilder.toQuery(context); - } catch (IOException e) { - throw new RuntimeException(e); - } + SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(); + queryBuilder.toQuery(context); }); } @@ -275,17 +224,13 @@ public void testMustRewrite() throws IOException { @Override public void testToQuery() throws IOException { withSearchIndex((context) -> { - try { - SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(); - if (queryBuilder.getQueryVectors() == null) { - QueryBuilder 
rewrittenQueryBuilder = rewriteAndFetch(queryBuilder, context); - assertTrue(rewrittenQueryBuilder instanceof SparseVectorQueryBuilder); - testDoToQuery((SparseVectorQueryBuilder) rewrittenQueryBuilder, context); - } else { - testDoToQuery(queryBuilder, context); - } - } catch (IOException e) { - throw new RuntimeException(e); + SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(); + if (queryBuilder.getQueryVectors() == null) { + QueryBuilder rewrittenQueryBuilder = rewriteAndFetch(queryBuilder, context); + assertTrue(rewrittenQueryBuilder instanceof SparseVectorQueryBuilder); + testDoToQuery((SparseVectorQueryBuilder) rewrittenQueryBuilder, context); + } else { + testDoToQuery(queryBuilder, context); } }); } @@ -301,8 +246,8 @@ private void testDoToQuery(SparseVectorQueryBuilder queryBuilder, SearchExecutio assertTrue(query instanceof SparseVectorQueryWrapper); var sparseQuery = (SparseVectorQueryWrapper) query; - // check if we have explicit pruning, or pruning via the index_options - if (queryBuilder.shouldPruneTokens() || currentTestHasIndexOptions()) { + // check if we have explicit pruning or implicit pruning=true + if (queryBuilder.shouldPruneTokens() || indexVersionSupportsIndexOptions()) { // It's possible that all documents were pruned for aggressive pruning configurations assertTrue(sparseQuery.getTermsQuery() instanceof BooleanQuery || sparseQuery.getTermsQuery() instanceof MatchNoDocsQuery); } else { @@ -401,48 +346,18 @@ public void testThatWeCorrectlyRewriteQueryIntoVectors() { assertNotNull(((SparseVectorQueryBuilder) rewrittenQueryBuilder).getQueryVectors()); } - @InjectSparseVectorIndexOptions - public void testItUsesIndexOptionsDefaults() throws IOException { - withSearchIndex((context) -> { - try { - SparseVectorQueryBuilder builder = createTestQueryBuilder(null); - assertFalse(builder.shouldPruneTokens()); - testDoToQuery(builder, context); - } catch (IOException ex) { - throw new RuntimeException(ex); - } - }); - } + private 
boolean indexVersionSupportsIndexOptions() { + if (indexVersionToTest.onOrAfter(IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT)) { + return true; + } - @InjectSparseVectorIndexOptions - public void testItOverridesIndexOptionsDefaults() throws IOException { - withSearchIndex((context) -> { - try { - TokenPruningConfig pruningConfig = new TokenPruningConfig(2, 0.3f, false); - SparseVectorQueryBuilder builder = createTestQueryBuilder(pruningConfig); - assertTrue(builder.shouldPruneTokens()); - testDoToQuery(builder, context); - } catch (IOException ex) { - throw new RuntimeException(ex); - } - }); - } + if (indexVersionToTest.between( + IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT_BACKPORT_8_X, + IndexVersions.UPGRADE_TO_LUCENE_10_0_0 + )) { + return true; + } - @InjectSparseVectorIndexOptions - public void testToQueryRewriteWithIndexOptions() throws IOException { - withSearchIndex((context) -> { - SparseVectorQueryBuilder queryBuilder = createTestQueryBuilder(null); - try { - if (queryBuilder.getQueryVectors() == null) { - QueryBuilder rewrittenQueryBuilder = rewriteAndFetch(queryBuilder, context); - assertTrue(rewrittenQueryBuilder instanceof SparseVectorQueryBuilder); - testDoToQuery((SparseVectorQueryBuilder) rewrittenQueryBuilder, context); - } else { - testDoToQuery(queryBuilder, context); - } - } catch (IOException e) { - throw new RuntimeException(e); - } - }); + return false; } } diff --git a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml index 968c0896c0345..2c7750d9407ad 100644 --- a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml +++ 
b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml @@ -273,7 +273,6 @@ teardown: index: sparse_vector_pruning_test - not_exists: sparse_vector_pruning_test.mappings.properties.ml.properties.tokens.index_options - - not_exists: sparse_vector_pruning_test.mappings.properties.embeddings.index_options --- "Check sparse_vector token pruning index_options prune missing do not allow config": @@ -554,3 +553,60 @@ teardown: - match: { hits.hits.0._id: "1" } - match: { hits.hits.1._id: "3" } - match: { hits.hits.2._id: "2" } + +--- +"Check sparse_vector should prune by default": + + - requires: + cluster_features: 'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: test-sparse-vector-pruning-default + body: + mappings: + properties: + content_embedding: + type: sparse_vector + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + bulk: + index: test-sparse-vector-pruning-default + refresh: true + body: | + {"index": { "_id": "1" }} + {"content_embedding":{"cheese": 2.671405,"is": 0.11809908,"comet": 0.26088917}} + {"index": { "_id": "2" }} + {"content_embedding":{"planet": 2.3438394,"is": 0.54600334,"astronomy": 0.36015007,"moon": 0.20022368}} + {"index": { "_id": "3" }} + {"content_embedding":{"is": 0.6891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + + - do: + search: + index: test-sparse-vector-pruning-default + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 
0.54600334 + + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } From a43d2e3cf446bfa1e2f64a551a0751666c47d779 Mon Sep 17 00:00:00 2001 From: "Mark J. Hoy" Date: Wed, 4 Jun 2025 13:17:36 -0400 Subject: [PATCH 84/86] fix failing test --- .../xpack/core/ml/search/SparseVectorIndexOptionsIT.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java b/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java index dcbb5d12af1e2..84eea246bf232 100644 --- a/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java +++ b/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java @@ -244,7 +244,7 @@ private String getDescriptiveTestType() { private static final List EXPECTED_DOC_IDS_WITHOUT_PRUNING = List.of("1", "3", "2"); - private static final List EXPECTED_DOC_IDS_WITH_PRUNING = List.of(); + private static final List EXPECTED_DOC_IDS_WITH_PRUNING = List.of("3"); private static final List EXPECTED_DOC_IDS_WITH_DEFAULT_PRUNING = List.of("2"); From 7a41e0ff5960411f1aafb89f24fe1f4e5111b32c Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Wed, 4 Jun 2025 19:32:22 -0400 Subject: [PATCH 85/86] clean fix integration / rest tests --- .../ml/search/SparseVectorIndexOptionsIT.java | 25 ++++---- .../test/remote_cluster/50_sparse_vector.yml | 57 +++++++++++++++++++ .../test/ml/sparse_vector_search.yml | 57 +++++++++++++++++++ 3 files changed, 129 insertions(+), 10 deletions(-) diff --git a/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java b/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java index 84eea246bf232..4160b1d5432a7 100644 --- a/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java +++ b/x-pack/plugin/core/src/internalClusterTest/java/org/elasticsearch/xpack/core/ml/search/SparseVectorIndexOptionsIT.java @@ -176,8 +176,12 @@ private SearchResponse performSearch(SearchSourceBuilder source) { private SearchSourceBuilder getBuilderForSearch() throws IOException { boolean shouldUseDefaultTokens = (testQueryShouldNotPrune == false && testHasIndexOptions == false); - TokenPruningConfig queryPruningConfig = overrideQueryPruningConfig ? new TokenPruningConfig(3f, 0.5f, true) : null; + + // if we're overriding the index pruning config in the query, always prune + // if not, and the query should _not_ prune, set prune=false, + // else, set to `null` to let the index options propagate Boolean shouldPrune = overrideQueryPruningConfig ? Boolean.TRUE : (testQueryShouldNotPrune ? Boolean.FALSE : null); + TokenPruningConfig queryPruningConfig = overrideQueryPruningConfig ? 
new TokenPruningConfig(3f, 0.5f, true) : null; SparseVectorQueryBuilder queryBuilder = new SparseVectorQueryBuilder( SPARSE_VECTOR_FIELD, @@ -206,7 +210,11 @@ private String getAssertMessage(String message) { private String getDescriptiveTestType() { String testDescription = ""; if (testQueryShouldNotPrune) { - testDescription = "query override prune=false:"; + testDescription += " query override prune=false:"; + } + + if (overrideQueryPruningConfig) { + testDescription += " query override pruningConfig=true:"; } if (testHasIndexOptions) { @@ -232,19 +240,16 @@ private String getDescriptiveTestType() { ); private static final List SEARCH_WEIGHTED_TOKENS = List.of( - new WeightedToken("cheese", 0.5f), - new WeightedToken("comet", 0.5f), - new WeightedToken("globe", 0.484035f), - new WeightedToken("ocean", 0.080102935f), - new WeightedToken("underground", 0.053516876f), - new WeightedToken("is", 0.54600334f) + new WeightedToken("pugs", 0.5f), + new WeightedToken("cats", 0.5f), + new WeightedToken("is", 0.14600334f) ); private static final List SEARCH_WEIGHTED_TOKENS_WITH_DEFAULTS = List.of(new WeightedToken("planet", 0.2f)); - private static final List EXPECTED_DOC_IDS_WITHOUT_PRUNING = List.of("1", "3", "2"); + private static final List EXPECTED_DOC_IDS_WITHOUT_PRUNING = List.of("3", "2", "1"); - private static final List EXPECTED_DOC_IDS_WITH_PRUNING = List.of("3"); + private static final List EXPECTED_DOC_IDS_WITH_PRUNING = List.of(); private static final List EXPECTED_DOC_IDS_WITH_DEFAULT_PRUNING = List.of("2"); diff --git a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml index da47c2e291ac8..e0ceb3f62bc9d 100644 --- a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml +++ 
b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml @@ -552,3 +552,60 @@ teardown: - match: { hits.hits.0._id: "1" } - match: { hits.hits.1._id: "3" } - match: { hits.hits.2._id: "2" } + +--- +"Check sparse_vector should prune by default": + + - requires: + cluster_features: 'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: test-sparse-vector-pruning-default + body: + mappings: + properties: + content_embedding: + type: sparse_vector + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + bulk: + index: test-sparse-vector-pruning-default + refresh: true + body: | + {"index": { "_id": "1" }} + {"content_embedding":{"cheese": 2.671405,"is": 0.11809908,"comet": 0.26088917}} + {"index": { "_id": "2" }} + {"content_embedding":{"planet": 2.3438394,"is": 0.54600334,"astronomy": 0.36015007,"moon": 0.20022368}} + {"index": { "_id": "3" }} + {"content_embedding":{"is": 0.6891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + + - do: + search: + index: test-sparse-vector-pruning-default + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml index 
365f3692312bf..767d0d7878698 100644 --- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml +++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml @@ -887,3 +887,60 @@ teardown: - match: { hits.hits.0._id: "1" } - match: { hits.hits.1._id: "3" } - match: { hits.hits.2._id: "2" } + +--- +"Check sparse_vector should prune by default": + + - requires: + cluster_features: 'sparse_vector.index_options_supported' + reason: "sparse_vector token pruning index options added support in 8.19" + - skip: + features: headers + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + indices.create: + index: test-sparse-vector-pruning-default + body: + mappings: + properties: + content_embedding: + type: sparse_vector + + - match: { acknowledged: true } + + - do: + headers: + Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Content-Type: application/json + bulk: + index: test-sparse-vector-pruning-default + refresh: true + body: | + {"index": { "_id": "1" }} + {"content_embedding":{"cheese": 2.671405,"is": 0.11809908,"comet": 0.26088917}} + {"index": { "_id": "2" }} + {"content_embedding":{"planet": 2.3438394,"is": 0.54600334,"astronomy": 0.36015007,"moon": 0.20022368}} + {"index": { "_id": "3" }} + {"content_embedding":{"is": 0.6891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + + - do: + search: + index: test-sparse-vector-pruning-default + body: + query: + sparse_vector: + field: content_embedding + query_vector: + cheese: 0.5 + comet: 0.5 + globe: 0.484035 + ocean: 0.080102935 + underground: 0.053516876 + is: 0.54600334 + + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "1" } + - match: { hits.hits.1._id: "3" } From c02a647701bc7a95e7b87de49db4476b7333b87e Mon Sep 17 00:00:00 2001 From: "Mark J. 
Hoy" Date: Thu, 5 Jun 2025 09:14:58 -0400 Subject: [PATCH 86/86] fix yaml default pruning tests --- .../test/multi_cluster/50_sparse_vector.yml | 35 +++++++++++------ .../test/remote_cluster/50_sparse_vector.yml | 35 +++++++++++------ .../test/ml/sparse_vector_search.yml | 39 ++++++++++++------- 3 files changed, 74 insertions(+), 35 deletions(-) diff --git a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml index 2c7750d9407ad..52c98ce12dfbe 100644 --- a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml +++ b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml @@ -588,9 +588,27 @@ teardown: {"index": { "_id": "1" }} {"content_embedding":{"cheese": 2.671405,"is": 0.11809908,"comet": 0.26088917}} {"index": { "_id": "2" }} - {"content_embedding":{"planet": 2.3438394,"is": 0.54600334,"astronomy": 0.36015007,"moon": 0.20022368}} + {"content_embedding":{"planet": 2.3438394,"is": 0.14600334,"astronomy": 0.36015007,"moon": 0.20022368}} {"index": { "_id": "3" }} - {"content_embedding":{"is": 0.6891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + {"content_embedding":{"is": 0.1891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + {"index": { "_id": "4" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "5" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "6" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "7" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "8" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "9" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "10" }} + 
{"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "11" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "12" }} + {"content_embedding":{"is": 0.1891394}} - do: search: @@ -600,13 +618,8 @@ teardown: sparse_vector: field: content_embedding query_vector: - cheese: 0.5 - comet: 0.5 - globe: 0.484035 - ocean: 0.080102935 - underground: 0.053516876 - is: 0.54600334 + pugs: 0.5 + cats: 0.5 + is: 0.04600334 - - match: { hits.total.value: 2 } - - match: { hits.hits.0._id: "1" } - - match: { hits.hits.1._id: "3" } + - match: { hits.total.value: 0 } diff --git a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml index e0ceb3f62bc9d..248e9d2af0d70 100644 --- a/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml +++ b/x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml @@ -587,9 +587,27 @@ teardown: {"index": { "_id": "1" }} {"content_embedding":{"cheese": 2.671405,"is": 0.11809908,"comet": 0.26088917}} {"index": { "_id": "2" }} - {"content_embedding":{"planet": 2.3438394,"is": 0.54600334,"astronomy": 0.36015007,"moon": 0.20022368}} + {"content_embedding":{"planet": 2.3438394,"is": 0.14600334,"astronomy": 0.36015007,"moon": 0.20022368}} {"index": { "_id": "3" }} - {"content_embedding":{"is": 0.6891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + {"content_embedding":{"is": 0.1891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + {"index": { "_id": "4" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "5" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "6" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { 
"_id": "7" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "8" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "9" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "10" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "11" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "12" }} + {"content_embedding":{"is": 0.1891394}} - do: search: @@ -599,13 +617,8 @@ teardown: sparse_vector: field: content_embedding query_vector: - cheese: 0.5 - comet: 0.5 - globe: 0.484035 - ocean: 0.080102935 - underground: 0.053516876 - is: 0.54600334 + pugs: 0.5 + cats: 0.5 + is: 0.04600334 - - match: { hits.total.value: 2 } - - match: { hits.hits.0._id: "1" } - - match: { hits.hits.1._id: "3" } + - match: { hits.total.value: 0 } diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml index 767d0d7878698..0b7cd44d500a8 100644 --- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml +++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml @@ -899,7 +899,7 @@ teardown: - do: headers: - Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser Content-Type: application/json indices.create: index: test-sparse-vector-pruning-default @@ -913,7 +913,7 @@ teardown: - do: headers: - Authorization: "Basic dGVzdF91c2VyOngtcGFjay10ZXN0LXBhc3N3b3Jk" #test_user credentials + Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. 
the test setup superuser Content-Type: application/json bulk: index: test-sparse-vector-pruning-default @@ -922,9 +922,27 @@ teardown: {"index": { "_id": "1" }} {"content_embedding":{"cheese": 2.671405,"is": 0.11809908,"comet": 0.26088917}} {"index": { "_id": "2" }} - {"content_embedding":{"planet": 2.3438394,"is": 0.54600334,"astronomy": 0.36015007,"moon": 0.20022368}} + {"content_embedding":{"planet": 2.3438394,"is": 0.14600334,"astronomy": 0.36015007,"moon": 0.20022368}} {"index": { "_id": "3" }} - {"content_embedding":{"is": 0.6891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + {"content_embedding":{"is": 0.1891394,"globe": 0.484035,"ocean": 0.080102935,"underground": 0.053516876}} + {"index": { "_id": "4" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "5" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "6" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "7" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "8" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "9" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "10" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "11" }} + {"content_embedding":{"is": 0.1891394}} + {"index": { "_id": "12" }} + {"content_embedding":{"is": 0.1891394}} - do: search: @@ -934,13 +952,8 @@ teardown: sparse_vector: field: content_embedding query_vector: - cheese: 0.5 - comet: 0.5 - globe: 0.484035 - ocean: 0.080102935 - underground: 0.053516876 - is: 0.54600334 + pugs: 0.5 + cats: 0.5 + is: 0.04600334 - - match: { hits.total.value: 2 } - - match: { hits.hits.0._id: "1" } - - match: { hits.hits.1._id: "3" } + - match: { hits.total.value: 0 }