Skip to content

Commit 159c57f

Browse files
breskeby, dpifke-elastic, jimczi
authored
Prepare serverless patch including #128784 #128740 (#128807)
* Change default for vector.rescoring.directio to false (#128784)

  On serverless (and potentially elsewhere), direct IO is not available, which can cause BBQ shards to fail to read with org.apache.lucene.CorruptIndexException when this setting is true.

* Optimize sparse vector stats collection (#128740)

  This change improves the performance of sparse vector statistics gathering by using the document count of each sparse vector field's terms directly, rather than relying on the field-names field (`FieldNamesFieldMapper.NAME`) to compute stats. By avoiding per-term disk/network reads and instead leveraging statistics already loaded into leaf readers at index opening, we expect to significantly reduce overhead.

  Relates to #128583

---------

Co-authored-by: Dave Pifke <dave.pifke@elastic.co>
Co-authored-by: Jim Ferenczi <jim.ferenczi@elastic.co>
1 parent b286748 commit 159c57f

File tree

3 files changed

+15
-11
lines changed

3 files changed

+15
-11
lines changed

docs/changelog/128740.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 128740
2+
summary: Optimize sparse vector stats collection
3+
area: Stats
4+
type: enhancement
5+
issues: []

server/src/main/java/org/elasticsearch/index/codec/vectors/es818/DirectIOLucene99FlatVectorsReader.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@
5757
@SuppressForbidden(reason = "Copied from lucene")
5858
public class DirectIOLucene99FlatVectorsReader extends FlatVectorsReader implements OffHeapStats {
5959

60-
private static final boolean USE_DIRECT_IO = Boolean.parseBoolean(System.getProperty("vector.rescoring.directio", "true"));
60+
private static final boolean USE_DIRECT_IO = Boolean.parseBoolean(System.getProperty("vector.rescoring.directio", "false"));
6161

6262
private static final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(DirectIOLucene99FlatVectorsReader.class);
6363

server/src/main/java/org/elasticsearch/index/engine/Engine.java

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
import org.apache.lucene.index.SegmentInfos;
2727
import org.apache.lucene.index.SegmentReader;
2828
import org.apache.lucene.index.Terms;
29-
import org.apache.lucene.index.TermsEnum;
3029
import org.apache.lucene.search.IndexSearcher;
3130
import org.apache.lucene.search.QueryCache;
3231
import org.apache.lucene.search.QueryCachingPolicy;
@@ -65,7 +64,6 @@
6564
import org.elasticsearch.index.codec.FieldInfosWithUsages;
6665
import org.elasticsearch.index.codec.vectors.reflect.OffHeapByteSizeUtils;
6766
import org.elasticsearch.index.mapper.DocumentParser;
68-
import org.elasticsearch.index.mapper.FieldNamesFieldMapper;
6967
import org.elasticsearch.index.mapper.LuceneDocument;
7068
import org.elasticsearch.index.mapper.Mapper;
7169
import org.elasticsearch.index.mapper.Mapping;
@@ -385,14 +383,15 @@ protected final SparseVectorStats sparseVectorStats(IndexReader indexReader, Lis
385383

386384
private long getSparseVectorValueCount(final LeafReader atomicReader, List<BytesRef> fields) throws IOException {
387385
long count = 0;
388-
Terms terms = atomicReader.terms(FieldNamesFieldMapper.NAME);
389-
if (terms == null) {
390-
return count;
391-
}
392-
TermsEnum termsEnum = terms.iterator();
393-
for (var fieldName : fields) {
394-
if (termsEnum.seekExact(fieldName)) {
395-
count += termsEnum.docFreq();
386+
for (var fieldNameBR : fields) {
387+
var fieldName = fieldNameBR.utf8ToString();
388+
var fi = atomicReader.getFieldInfos().fieldInfo(fieldName);
389+
if (fi == null) {
390+
continue;
391+
}
392+
Terms terms = atomicReader.terms(fieldName);
393+
if (terms != null) {
394+
count += terms.getDocCount();
396395
}
397396
}
398397
return count;

0 commit comments

Comments
 (0)