Prepare serverless patch including #128784 #128740 (#128807)

breskeby · dpifke-elastic · jimczi · web-flow · commit 159c57f0de3b · 2025-06-03T11:56:27.000+02:00
* Change default for vector.rescoring.directio to false (#128784)

  On serverless (and potentially elsewhere), direct IO is not available, which can cause BBQ shards to fail to read with org.apache.lucene.CorruptIndexException when this setting is true.

* Optimize sparse vector stats collection (#128740)

  This change improves the performance of sparse vector statistics gathering by reading the document count directly from each sparse vector field's terms (Terms#getDocCount), rather than looking each field up in the field-names field (FieldNamesFieldMapper.NAME) to compute stats. By avoiding per-term disk/network reads and instead leveraging statistics already loaded into leaf readers at index opening, we expect to significantly reduce overhead.

  Relates to #128583

---------

Co-authored-by: Dave Pifke <dave.pifke@elastic.co>
Co-authored-by: Jim Ferenczi <jim.ferenczi@elastic.co>
diff --git a/docs/changelog/128740.yaml b/docs/changelog/128740.yaml
@@ -0,0 +1,5 @@
+pr: 128740
+summary: Optimize sparse vector stats collection
+area: Stats
+type: enhancement
+issues: []
diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/DirectIOLucene99FlatVectorsReader.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/DirectIOLucene99FlatVectorsReader.java
@@ -57,7 +57,7 @@
 @SuppressForbidden(reason = "Copied from lucene")
 public class DirectIOLucene99FlatVectorsReader extends FlatVectorsReader implements OffHeapStats {
 
-    private static final boolean USE_DIRECT_IO = Boolean.parseBoolean(System.getProperty("vector.rescoring.directio", "true"));
+    private static final boolean USE_DIRECT_IO = Boolean.parseBoolean(System.getProperty("vector.rescoring.directio", "false"));
 
     private static final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(DirectIOLucene99FlatVectorsReader.class);
 
diff --git a/server/src/main/java/org/elasticsearch/index/engine/Engine.java b/server/src/main/java/org/elasticsearch/index/engine/Engine.java
@@ -26,7 +26,6 @@
 import org.apache.lucene.index.SegmentInfos;
 import org.apache.lucene.index.SegmentReader;
 import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.QueryCache;
 import org.apache.lucene.search.QueryCachingPolicy;
@@ -65,7 +64,6 @@
 import org.elasticsearch.index.codec.FieldInfosWithUsages;
 import org.elasticsearch.index.codec.vectors.reflect.OffHeapByteSizeUtils;
 import org.elasticsearch.index.mapper.DocumentParser;
-import org.elasticsearch.index.mapper.FieldNamesFieldMapper;
 import org.elasticsearch.index.mapper.LuceneDocument;
 import org.elasticsearch.index.mapper.Mapper;
 import org.elasticsearch.index.mapper.Mapping;
@@ -385,14 +383,15 @@ protected final SparseVectorStats sparseVectorStats(IndexReader indexReader, Lis
 
     private long getSparseVectorValueCount(final LeafReader atomicReader, List<BytesRef> fields) throws IOException {
         long count = 0;
-        Terms terms = atomicReader.terms(FieldNamesFieldMapper.NAME);
-        if (terms == null) {
-            return count;
-        }
-        TermsEnum termsEnum = terms.iterator();
-        for (var fieldName : fields) {
-            if (termsEnum.seekExact(fieldName)) {
-                count += termsEnum.docFreq();
+        for (var fieldNameBR : fields) {
+            var fieldName = fieldNameBR.utf8ToString();
+            var fi = atomicReader.getFieldInfos().fieldInfo(fieldName);
+            if (fi == null) {
+                continue;
+            }
+            Terms terms = atomicReader.terms(fieldName);
+            if (terms != null) {
+                count += terms.getDocCount();
             }
         }
         return count;