diff --git a/docs/changelog/120869.yaml b/docs/changelog/120869.yaml new file mode 100644 index 0000000000000..024d51db894fd --- /dev/null +++ b/docs/changelog/120869.yaml @@ -0,0 +1,5 @@ +pr: 120869 +summary: Threadpool merge scheduler +area: Engine +type: feature +issues: [] diff --git a/docs/changelog/127613.yaml b/docs/changelog/127613.yaml new file mode 100644 index 0000000000000..de043e209b32e --- /dev/null +++ b/docs/changelog/127613.yaml @@ -0,0 +1,5 @@ +pr: 127613 +summary: Threadpool merge executor is aware of available disk space +area: Engine +type: feature +issues: [] diff --git a/server/src/internalClusterTest/java/org/elasticsearch/index/engine/InternalEngineMergeIT.java b/server/src/internalClusterTest/java/org/elasticsearch/index/engine/InternalEngineMergeIT.java index 80de2ffcaa7ac..243e4219ffef1 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/index/engine/InternalEngineMergeIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/index/engine/InternalEngineMergeIT.java @@ -8,24 +8,40 @@ */ package org.elasticsearch.index.engine; +import org.elasticsearch.action.admin.cluster.node.stats.NodeStats; +import org.elasticsearch.action.admin.cluster.node.stats.NodesStatsResponse; import org.elasticsearch.action.admin.indices.stats.IndicesStatsResponse; import org.elasticsearch.action.bulk.BulkRequestBuilder; import org.elasticsearch.action.bulk.BulkResponse; import org.elasticsearch.action.index.IndexRequest; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.test.ESIntegTestCase; import org.elasticsearch.test.ESIntegTestCase.ClusterScope; import org.elasticsearch.test.ESIntegTestCase.Scope; +import org.elasticsearch.threadpool.ThreadPool; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount; import static 
org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoFailures; import static org.elasticsearch.xcontent.XContentFactory.jsonBuilder; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.lessThan; import static org.hamcrest.Matchers.lessThanOrEqualTo; -@ClusterScope(supportsDedicatedMasters = false, numDataNodes = 1, scope = Scope.SUITE) +@ClusterScope(supportsDedicatedMasters = false, numDataNodes = 1, numClientNodes = 0, scope = Scope.TEST) public class InternalEngineMergeIT extends ESIntegTestCase { + private boolean useThreadPoolMerging; + + @Override + protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) { + useThreadPoolMerging = randomBoolean(); + Settings.Builder settings = Settings.builder().put(super.nodeSettings(nodeOrdinal, otherSettings)); + settings.put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), useThreadPoolMerging); + return settings.build(); + } + public void testMergesHappening() throws Exception { final int numOfShards = randomIntBetween(1, 5); // some settings to keep num segments low @@ -83,4 +99,60 @@ public void testMergesHappening() throws Exception { assertThat(count, lessThanOrEqualTo(upperNumberSegments)); } + public void testMergesUseTheMergeThreadPool() throws Exception { + final String indexName = randomIdentifier(); + createIndex(indexName, indexSettings(randomIntBetween(1, 3), 0).build()); + long id = 0; + final int minMerges = randomIntBetween(1, 5); + long totalDocs = 0; + + while (true) { + int docs = randomIntBetween(100, 200); + totalDocs += docs; + + BulkRequestBuilder request = client().prepareBulk(); + for (int j = 0; j < docs; ++j) { + request.add( + new IndexRequest(indexName).id(Long.toString(id++)) + .source(jsonBuilder().startObject().field("l", randomLong()).endObject()) + ); + } + BulkResponse response = request.get(); + assertNoFailures(response); + refresh(indexName); + + var mergesResponse = 
client().admin().indices().prepareStats(indexName).clear().setMerge(true).get(); + var primaries = mergesResponse.getIndices().get(indexName).getPrimaries(); + if (primaries.merge.getTotal() >= minMerges) { + break; + } + } + + forceMerge(); + refresh(indexName); + + // after a force merge there should only be 1 segment per shard + var shardsWithMultipleSegments = getShardSegments().stream() + .filter(shardSegments -> shardSegments.getSegments().size() > 1) + .toList(); + assertTrue("there are shards with multiple segments " + shardsWithMultipleSegments, shardsWithMultipleSegments.isEmpty()); + + final long expectedTotalDocs = totalDocs; + assertHitCount(prepareSearch(indexName).setQuery(QueryBuilders.matchAllQuery()).setTrackTotalHits(true), expectedTotalDocs); + + IndicesStatsResponse indicesStats = client().admin().indices().prepareStats(indexName).setMerge(true).get(); + long mergeCount = indicesStats.getIndices().get(indexName).getPrimaries().merge.getTotal(); + NodesStatsResponse nodesStatsResponse = client().admin().cluster().prepareNodesStats().setThreadPool(true).get(); + assertThat(nodesStatsResponse.getNodes().size(), equalTo(1)); + + NodeStats nodeStats = nodesStatsResponse.getNodes().get(0); + if (useThreadPoolMerging) { + assertThat( + nodeStats.getThreadPool().stats().stream().filter(s -> ThreadPool.Names.MERGE.equals(s.name())).findAny().get().completed(), + equalTo(mergeCount) + ); + } else { + assertTrue(nodeStats.getThreadPool().stats().stream().filter(s -> ThreadPool.Names.MERGE.equals(s.name())).findAny().isEmpty()); + } + } } diff --git a/server/src/internalClusterTest/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerStressTestIT.java b/server/src/internalClusterTest/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerStressTestIT.java new file mode 100644 index 0000000000000..f3a9e5db28047 --- /dev/null +++ b/server/src/internalClusterTest/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerStressTestIT.java @@ -0,0 
+1,310 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.engine; + +import org.apache.lucene.index.MergePolicy.OneMerge; +import org.apache.lucene.index.MergeScheduler; +import org.apache.lucene.index.MergeTrigger; +import org.apache.lucene.store.Directory; +import org.elasticsearch.action.admin.indices.segments.IndexShardSegments; +import org.elasticsearch.action.admin.indices.segments.IndicesSegmentResponse; +import org.elasticsearch.action.admin.indices.segments.ShardSegments; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.util.CollectionUtils; +import org.elasticsearch.common.util.concurrent.ConcurrentCollections; +import org.elasticsearch.common.util.concurrent.EsExecutors; +import org.elasticsearch.core.Nullable; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.MergePolicyConfig; +import org.elasticsearch.index.MergeSchedulerConfig; +import org.elasticsearch.index.merge.MergeStats; +import org.elasticsearch.index.merge.OnGoingMerge; +import org.elasticsearch.index.shard.ShardId; +import org.elasticsearch.plugins.EnginePlugin; +import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.plugins.PluginsService; +import org.elasticsearch.test.ESSingleNodeTestCase; + +import java.io.IOException; +import java.util.Collection; +import java.util.Iterator; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.Executor; +import java.util.concurrent.Semaphore; +import java.util.concurrent.TimeUnit; +import 
java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicReference; + +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAllSuccessful; +import static org.hamcrest.Matchers.greaterThanOrEqualTo; +import static org.hamcrest.Matchers.instanceOf; +import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.lessThanOrEqualTo; + +public class ThreadPoolMergeSchedulerStressTestIT extends ESSingleNodeTestCase { + + private static final int MERGE_SCHEDULER_MAX_CONCURRENCY = 3; + + @Override + protected Settings nodeSettings() { + return Settings.builder() + .put(super.nodeSettings()) + .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), true) + // when there are more threads than scheduler(s)' concurrency capacity, excess merges will be backlogged + // alternatively, when scheduler(s)' concurrency capacity exceeds the executor's thread count, excess merges will be enqueued + .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), MERGE_SCHEDULER_MAX_CONCURRENCY + randomFrom(-2, -1, 0, 1, 2)) + .build(); + } + + @Override + protected Collection> getPlugins() { + return CollectionUtils.appendToCopy(super.getPlugins(), ThreadPoolMergeSchedulerStressTestIT.TestEnginePlugin.class); + } + + public static class TestEnginePlugin extends Plugin implements EnginePlugin { + + final AtomicReference mergeExecutorServiceReference = new AtomicReference<>(); + final Set enqueuedMergesSet = ConcurrentCollections.newConcurrentSet(); + final Set runningMergesSet = ConcurrentCollections.newConcurrentSet(); + // maybe let a few merges run at the start + final int initialRunMergesCount = randomIntBetween(0, 5); + final Semaphore runMergeSemaphore = new Semaphore(initialRunMergesCount); + final int waitMergesEnqueuedCount = randomIntBetween(50, 100); + + class TestInternalEngine extends org.elasticsearch.index.engine.InternalEngine { + + TestInternalEngine(EngineConfig engineConfig) { + 
super(engineConfig); + } + + protected ElasticsearchMergeScheduler createMergeScheduler( + ShardId shardId, + IndexSettings indexSettings, + @Nullable ThreadPoolMergeExecutorService threadPoolMergeExecutorService + ) { + ElasticsearchMergeScheduler mergeScheduler = super.createMergeScheduler( + shardId, + indexSettings, + threadPoolMergeExecutorService + ); + assertThat(mergeScheduler, instanceOf(ThreadPoolMergeScheduler.class)); + // assert there is a single merge executor service for all shards + mergeExecutorServiceReference.compareAndSet(null, threadPoolMergeExecutorService); + assertThat(mergeExecutorServiceReference.get(), is(threadPoolMergeExecutorService)); + return new TestMergeScheduler((ThreadPoolMergeScheduler) mergeScheduler); + } + + class TestMergeScheduler implements ElasticsearchMergeScheduler { + + ThreadPoolMergeScheduler delegateMergeScheduler; + + TestMergeScheduler(ThreadPoolMergeScheduler threadPoolMergeScheduler) { + this.delegateMergeScheduler = threadPoolMergeScheduler; + } + + @Override + public Set onGoingMerges() { + return delegateMergeScheduler.onGoingMerges(); + } + + @Override + public MergeStats stats() { + return delegateMergeScheduler.stats(); + } + + @Override + public void refreshConfig() { + delegateMergeScheduler.refreshConfig(); + } + + @Override + public MergeScheduler getMergeScheduler() { + return new MergeScheduler() { + @Override + public void merge(MergeSource mergeSource, MergeTrigger trigger) { + delegateMergeScheduler.merge(new MergeSource() { + @Override + public OneMerge getNextMerge() { + OneMerge nextMerge = mergeSource.getNextMerge(); + if (nextMerge != null) { + assertTrue(TestEnginePlugin.this.enqueuedMergesSet.add(nextMerge)); + // avoid excess merges pilling up + if (TestEnginePlugin.this.enqueuedMergesSet + .size() > TestEnginePlugin.this.waitMergesEnqueuedCount) { + runMergeSemaphore.release(); + } + } + return nextMerge; + } + + @Override + public void onMergeFinished(OneMerge merge) { + 
mergeSource.onMergeFinished(merge); + } + + @Override + public boolean hasPendingMerges() { + return mergeSource.hasPendingMerges(); + } + + @Override + public void merge(OneMerge merge) throws IOException { + assertNotNull(merge); + try { + // most merges need to acquire the semaphore in order to run + if (frequently()) { + runMergeSemaphore.acquire(); + } + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + // assert to-be-run merge was enqueued + assertTrue(TestEnginePlugin.this.enqueuedMergesSet.remove(merge)); + TestEnginePlugin.this.runningMergesSet.add(merge); + assertThat( + TestEnginePlugin.this.runningMergesSet.size(), + lessThanOrEqualTo( + TestEnginePlugin.this.mergeExecutorServiceReference.get().getMaxConcurrentMerges() + ) + ); + mergeSource.merge(merge); + assertTrue(TestEnginePlugin.this.runningMergesSet.remove(merge)); + } + }, trigger); + } + + @Override + public Directory wrapForMerge(OneMerge merge, Directory in) { + return delegateMergeScheduler.wrapForMerge(merge, in); + } + + @Override + public Executor getIntraMergeExecutor(OneMerge merge) { + return delegateMergeScheduler.getIntraMergeExecutor(merge); + } + + @Override + public void close() throws IOException { + delegateMergeScheduler.close(); + } + }; + } + } + } + + @Override + public Optional getEngineFactory(IndexSettings indexSettings) { + return Optional.of(TestInternalEngine::new); + } + + } + + public void testMergingFallsBehindAndThenCatchesUp() throws Exception { + createIndex( + "index", + // stress test merging across multiple shards + indexSettings(randomIntBetween(1, 10), 0) + // few segments per merge ought to result in more merging activity + .put(MergePolicyConfig.INDEX_MERGE_POLICY_MAX_MERGE_AT_ONCE_SETTING.getKey(), randomIntBetween(2, 3)) + .put(MergePolicyConfig.INDEX_MERGE_POLICY_SEGMENTS_PER_TIER_SETTING.getKey(), randomIntBetween(2, 3)) + // few concurrent merges allowed per scheduler + 
.put(MergeSchedulerConfig.MAX_THREAD_COUNT_SETTING.getKey(), randomIntBetween(1, MERGE_SCHEDULER_MAX_CONCURRENCY)) + // many pending merges allowed, in order to disable indexing throttle + .put(MergeSchedulerConfig.MAX_MERGE_COUNT_SETTING.getKey(), randomIntBetween(1, Integer.MAX_VALUE)) + .build() + ); + ensureGreen("index"); + // generate merging activity across many threads + Thread[] indexingThreads = new Thread[randomIntBetween(20, 30)]; + AtomicBoolean indexingDone = new AtomicBoolean(false); + for (int i = 0; i < indexingThreads.length; i++) { + int finalI = i; + indexingThreads[i] = new Thread(() -> { + long termUpto = 0; + while (indexingDone.get() == false) { + for (int j = 0; j < 100; j++) { + // Provoke slowish merging by making many unique terms: + StringBuilder sb = new StringBuilder(); + for (int k = 0; k < 100; k++) { + sb.append(' '); + sb.append(termUpto++); + } + prepareIndex("index").setId("thread_" + finalI + "_term_" + termUpto) + .setSource("field" + (j % 10), sb.toString()) + .get(); + if (j % 2 == 0) { + indicesAdmin().prepareRefresh("index").get(); + } + } + indicesAdmin().prepareRefresh("index").get(); + } + }); + indexingThreads[i].start(); + } + TestEnginePlugin testEnginePlugin = getTestEnginePlugin(); + assertBusy(() -> { + // wait for merges to enqueue or backlog + assertThat(testEnginePlugin.enqueuedMergesSet.size(), greaterThanOrEqualTo(testEnginePlugin.waitMergesEnqueuedCount)); + }, 1, TimeUnit.MINUTES); + // finish up indexing + indexingDone.set(true); + for (Thread indexingThread : indexingThreads) { + indexingThread.join(); + } + // even when indexing is done, queued and backlogged merges can themselves trigger further merging + // don't let this test be bothered by that, and simply unblock all merges + // 100k is a fudge value, but there's no easy way to find a smartest one here + testEnginePlugin.runMergeSemaphore.release(100_000); + // await all merging to catch up + assertBusy(() -> { + assert 
testEnginePlugin.runMergeSemaphore.availablePermits() > 0 : "some merges are blocked, test is broken"; + assertThat(testEnginePlugin.runningMergesSet.size(), is(0)); + assertThat(testEnginePlugin.enqueuedMergesSet.size(), is(0)); + testEnginePlugin.mergeExecutorServiceReference.get().allDone(); + }, 1, TimeUnit.MINUTES); + var segmentsCountAfterMergingCaughtUp = getSegmentsCountForAllShards("index"); + // force merge should be a noop after all available merging was done + assertAllSuccessful(indicesAdmin().prepareForceMerge("index").get()); + var segmentsCountAfterForceMerge = getSegmentsCountForAllShards("index"); + assertThat(segmentsCountAfterForceMerge, is(segmentsCountAfterMergingCaughtUp)); + // let's also run a force-merge to 1 segment + assertAllSuccessful(indicesAdmin().prepareForceMerge("index").setMaxNumSegments(1).get()); + assertAllSuccessful(indicesAdmin().prepareRefresh("index").get()); + // assert one segment per shard + { + IndicesSegmentResponse indicesSegmentResponse = indicesAdmin().prepareSegments("index").get(); + Iterator indexShardSegmentsIterator = indicesSegmentResponse.getIndices().get("index").iterator(); + while (indexShardSegmentsIterator.hasNext()) { + for (ShardSegments segments : indexShardSegmentsIterator.next()) { + assertThat(segments.getSegments().size(), is(1)); + } + } + } + } + + private int getSegmentsCountForAllShards(String indexName) { + // refresh, otherwise we'd be still seeing the old merged-away segments + assertAllSuccessful(indicesAdmin().prepareRefresh(indexName).get()); + int count = 0; + IndicesSegmentResponse indicesSegmentResponse = indicesAdmin().prepareSegments(indexName).get(); + Iterator indexShardSegmentsIterator = indicesSegmentResponse.getIndices().get(indexName).iterator(); + while (indexShardSegmentsIterator.hasNext()) { + for (ShardSegments segments : indexShardSegmentsIterator.next()) { + count += segments.getSegments().size(); + } + } + return count; + } + + private TestEnginePlugin 
getTestEnginePlugin() { + return getInstanceFromNode(PluginsService.class).filterPlugins(TestEnginePlugin.class).toList().get(0); + } +} diff --git a/server/src/internalClusterTest/java/org/elasticsearch/index/shard/IndexShardIT.java b/server/src/internalClusterTest/java/org/elasticsearch/index/shard/IndexShardIT.java index 3338675160268..150262f98a10b 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/index/shard/IndexShardIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/index/shard/IndexShardIT.java @@ -625,6 +625,7 @@ public static final IndexShard newIndexShard( indexService.getIndexEventListener(), wrapper, indexService.getThreadPool(), + indexService.getThreadPoolMergeExecutorService(), indexService.getBigArrays(), null, Collections.emptyList(), diff --git a/server/src/internalClusterTest/java/org/elasticsearch/indices/IndexingMemoryControllerIT.java b/server/src/internalClusterTest/java/org/elasticsearch/indices/IndexingMemoryControllerIT.java index 74ccdce19d3ad..0ac8c4d0b6fd4 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/indices/IndexingMemoryControllerIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/indices/IndexingMemoryControllerIT.java @@ -59,6 +59,7 @@ EngineConfig engineConfigWithLargerIndexingMemory(EngineConfig config) { return new EngineConfig( config.getShardId(), config.getThreadPool(), + config.getThreadPoolMergeExecutorService(), indexSettings, config.getWarmer(), config.getStore(), diff --git a/server/src/internalClusterTest/java/org/elasticsearch/indices/stats/IndexStatsIT.java b/server/src/internalClusterTest/java/org/elasticsearch/indices/stats/IndexStatsIT.java index ab91d92927217..06972d09d609c 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/indices/stats/IndexStatsIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/indices/stats/IndexStatsIT.java @@ -78,6 +78,7 @@ import java.util.concurrent.TimeUnit; import 
java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicReference; import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_REPLICAS; @@ -467,50 +468,61 @@ public void testNonThrottleStats() throws Exception { public void testThrottleStats() throws Exception { assertAcked( - prepareCreate("test").setSettings( + prepareCreate("test_throttle_stats_index").setSettings( settingsBuilder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, "1") .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, "0") .put(MergePolicyConfig.INDEX_MERGE_POLICY_MAX_MERGE_AT_ONCE_SETTING.getKey(), "2") .put(MergePolicyConfig.INDEX_MERGE_POLICY_SEGMENTS_PER_TIER_SETTING.getKey(), "2") .put(MergeSchedulerConfig.MAX_THREAD_COUNT_SETTING.getKey(), "1") .put(MergeSchedulerConfig.MAX_MERGE_COUNT_SETTING.getKey(), "1") + .put(MergeSchedulerConfig.AUTO_THROTTLE_SETTING.getKey(), "true") .put(IndexSettings.INDEX_TRANSLOG_DURABILITY_SETTING.getKey(), Translog.Durability.ASYNC.name()) ) ); - ensureGreen(); - long termUpto = 0; - IndicesStatsResponse stats; + ensureGreen("test_throttle_stats_index"); // make sure we see throttling kicking in: - boolean done = false; - long start = System.currentTimeMillis(); - while (done == false) { - for (int i = 0; i < 100; i++) { - // Provoke slowish merging by making many unique terms: - StringBuilder sb = new StringBuilder(); - for (int j = 0; j < 100; j++) { - sb.append(' '); - sb.append(termUpto++); - } - prepareIndex("test").setId("" + termUpto).setSource("field" + (i % 10), sb.toString()).get(); - if (i % 2 == 0) { - refresh(); + AtomicBoolean done = new AtomicBoolean(); + AtomicLong termUpTo = new AtomicLong(); + Thread[] indexingThreads = new Thread[5]; + for (int threadIdx = 0; threadIdx < indexingThreads.length; threadIdx++) { + indexingThreads[threadIdx] = new Thread(() -> { + while (done.get() == false) { + for (int i 
= 0; i < 100; i++) { + // Provoke slowish merging by making many unique terms: + StringBuilder sb = new StringBuilder(); + for (int j = 0; j < 100; j++) { + sb.append(' '); + sb.append(termUpTo.incrementAndGet()); + } + prepareIndex("test_throttle_stats_index").setId("" + termUpTo.get()) + .setSource("field" + (i % 10), sb.toString()) + .get(); + if (i % 2 == 0) { + refresh("test_throttle_stats_index"); + } + } + refresh("test_throttle_stats_index"); } - } - refresh(); - stats = indicesAdmin().prepareStats().get(); - // nodesStats = clusterAdmin().prepareNodesStats().setIndices(true).get(); - done = stats.getPrimaries().getIndexing().getTotal().getThrottleTime().millis() > 0; - if (System.currentTimeMillis() - start > 300 * 1000) { // Wait 5 minutes for throttling to kick in - fail("index throttling didn't kick in after 5 minutes of intense merging"); - } + }); + indexingThreads[threadIdx].start(); + } + + assertBusy(() -> { + IndicesStatsResponse stats = indicesAdmin().prepareStats("test_throttle_stats_index").get(); + assertTrue(stats.getPrimaries().getIndexing().getTotal().getThrottleTime().millis() > 0); + done.set(true); + }, 5L, TimeUnit.MINUTES); + + for (Thread indexingThread : indexingThreads) { + indexingThread.join(); } // Optimize & flush and wait; else we sometimes get a "Delete Index failed - not acked" // when ESIntegTestCase.after tries to remove indices created by the test: - logger.info("test: now optimize"); - indicesAdmin().prepareForceMerge("test").get(); - flush(); - logger.info("test: test done"); + logger.info("test throttle stats: now optimize"); + indicesAdmin().prepareForceMerge("test_throttle_stats_index").get(); + flush("test_throttle_stats_index"); + logger.info("test throttle stats: test done"); } public void testSimpleStats() throws Exception { diff --git a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java index 
aecc750bd4e39..164012670743a 100644 --- a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java @@ -87,6 +87,8 @@ import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.IndexingPressure; import org.elasticsearch.index.MergePolicyConfig; +import org.elasticsearch.index.engine.ThreadPoolMergeExecutorService; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler; import org.elasticsearch.indices.IndexingMemoryController; import org.elasticsearch.indices.IndicesQueryCache; import org.elasticsearch.indices.IndicesRequestCache; @@ -619,6 +621,10 @@ public void apply(Settings value, Settings current, Settings previous) { TDigestExecutionHint.SETTING, MergePolicyConfig.DEFAULT_MAX_MERGED_SEGMENT_SETTING, MergePolicyConfig.DEFAULT_MAX_TIME_BASED_MERGED_SEGMENT_SETTING, + ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING, + ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING, + ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_HIGH_MAX_HEADROOM_SETTING, + ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING, TransportService.ENABLE_STACK_OVERFLOW_AVOIDANCE, DataStreamGlobalRetentionSettings.DATA_STREAMS_DEFAULT_RETENTION_SETTING, DataStreamGlobalRetentionSettings.DATA_STREAMS_MAX_RETENTION_SETTING, diff --git a/server/src/main/java/org/elasticsearch/index/IndexModule.java b/server/src/main/java/org/elasticsearch/index/IndexModule.java index 7d63a0432cdbc..6cd63b3c0047d 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexModule.java +++ b/server/src/main/java/org/elasticsearch/index/IndexModule.java @@ -43,6 +43,7 @@ import org.elasticsearch.index.cache.query.QueryCache; import org.elasticsearch.index.engine.Engine; import org.elasticsearch.index.engine.EngineFactory; +import org.elasticsearch.index.engine.ThreadPoolMergeExecutorService; import 
org.elasticsearch.index.mapper.IdFieldMapper; import org.elasticsearch.index.mapper.MapperMetrics; import org.elasticsearch.index.mapper.MapperRegistry; @@ -470,6 +471,7 @@ public IndexService newIndexService( CircuitBreakerService circuitBreakerService, BigArrays bigArrays, ThreadPool threadPool, + ThreadPoolMergeExecutorService threadPoolMergeExecutorService, ScriptService scriptService, ClusterService clusterService, Client client, @@ -523,6 +525,7 @@ public IndexService newIndexService( circuitBreakerService, bigArrays, threadPool, + threadPoolMergeExecutorService, scriptService, clusterService, client, diff --git a/server/src/main/java/org/elasticsearch/index/IndexService.java b/server/src/main/java/org/elasticsearch/index/IndexService.java index baba9e94db7a7..3617ed3ff3beb 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexService.java +++ b/server/src/main/java/org/elasticsearch/index/IndexService.java @@ -49,6 +49,7 @@ import org.elasticsearch.index.cache.query.QueryCache; import org.elasticsearch.index.engine.Engine; import org.elasticsearch.index.engine.EngineFactory; +import org.elasticsearch.index.engine.ThreadPoolMergeExecutorService; import org.elasticsearch.index.fielddata.FieldDataContext; import org.elasticsearch.index.fielddata.IndexFieldData; import org.elasticsearch.index.fielddata.IndexFieldDataCache; @@ -154,6 +155,8 @@ public class IndexService extends AbstractIndexComponent implements IndicesClust private final AsyncTrimTranslogTask trimTranslogTask; private final ThreadPool threadPool; + @Nullable + private final ThreadPoolMergeExecutorService threadPoolMergeExecutorService; private final BigArrays bigArrays; private final ScriptService scriptService; private final ClusterService clusterService; @@ -178,6 +181,7 @@ public IndexService( CircuitBreakerService circuitBreakerService, BigArrays bigArrays, ThreadPool threadPool, + ThreadPoolMergeExecutorService threadPoolMergeExecutorService, ScriptService scriptService, 
ClusterService clusterService, Client client, @@ -261,6 +265,7 @@ public IndexService( this.indexFoldersDeletionListener = indexFoldersDeletionListener; this.bigArrays = bigArrays; this.threadPool = threadPool; + this.threadPoolMergeExecutorService = threadPoolMergeExecutorService; this.scriptService = scriptService; this.clusterService = clusterService; this.client = client; @@ -556,6 +561,7 @@ public synchronized IndexShard createShard( eventListener, readerWrapper, threadPool, + threadPoolMergeExecutorService, bigArrays, engineWarmer, searchOperationListeners, @@ -820,6 +826,10 @@ public ThreadPool getThreadPool() { return threadPool; } + public @Nullable ThreadPoolMergeExecutorService getThreadPoolMergeExecutorService() { + return threadPoolMergeExecutorService; + } + /** * The {@link BigArrays} to use for this index. */ diff --git a/server/src/main/java/org/elasticsearch/index/engine/EngineConfig.java b/server/src/main/java/org/elasticsearch/index/engine/EngineConfig.java index af3c2cd5172f6..1ef42cdb922c3 100644 --- a/server/src/main/java/org/elasticsearch/index/engine/EngineConfig.java +++ b/server/src/main/java/org/elasticsearch/index/engine/EngineConfig.java @@ -58,6 +58,8 @@ public final class EngineConfig { private final MapperService mapperService; private final IndexStorePlugin.SnapshotCommitSupplier snapshotCommitSupplier; private final ThreadPool threadPool; + @Nullable + private final ThreadPoolMergeExecutorService threadPoolMergeExecutorService; private final Engine.Warmer warmer; private final Store store; private final MergePolicy mergePolicy; @@ -150,6 +152,7 @@ public Supplier retentionLeasesSupplier() { public EngineConfig( ShardId shardId, ThreadPool threadPool, + ThreadPoolMergeExecutorService threadPoolMergeExecutorService, IndexSettings indexSettings, Engine.Warmer warmer, Store store, @@ -179,6 +182,7 @@ public EngineConfig( this.shardId = shardId; this.indexSettings = indexSettings; this.threadPool = threadPool; + 
this.threadPoolMergeExecutorService = threadPoolMergeExecutorService; this.warmer = warmer == null ? (a) -> {} : warmer; this.store = store; this.mergePolicy = mergePolicy; @@ -287,6 +291,10 @@ public ThreadPool getThreadPool() { return threadPool; } + public @Nullable ThreadPoolMergeExecutorService getThreadPoolMergeExecutorService() { + return threadPoolMergeExecutorService; + } + /** * Returns an {@link org.elasticsearch.index.engine.Engine.Warmer} used to warm new searchers before they are used for searching. */ diff --git a/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java b/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java index d113e60edb3dd..325d552bcc474 100644 --- a/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java +++ b/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java @@ -254,7 +254,11 @@ public InternalEngine(EngineConfig engineConfig) { boolean success = false; try { this.lastDeleteVersionPruneTimeMSec = engineConfig.getThreadPool().relativeTimeInMillis(); - mergeScheduler = createMergeScheduler(engineConfig.getShardId(), engineConfig.getIndexSettings()); + mergeScheduler = createMergeScheduler( + engineConfig.getShardId(), + engineConfig.getIndexSettings(), + engineConfig.getThreadPoolMergeExecutorService() + ); scheduler = mergeScheduler.getMergeScheduler(); throttle = new IndexThrottle(); try { @@ -2818,15 +2822,95 @@ LiveIndexWriterConfig getCurrentIndexWriterConfig() { return indexWriter.getConfig(); } - protected ElasticsearchMergeScheduler createMergeScheduler(ShardId shardId, IndexSettings indexSettings) { - return new EngineMergeScheduler(shardId, indexSettings); + private void maybeFlushAfterMerge(OnGoingMerge merge) { + if (indexWriter.hasPendingMerges() == false && System.nanoTime() - lastWriteNanos >= engineConfig.getFlushMergesAfter().nanos()) { + // NEVER do this on a merge thread since we acquire some locks blocking here and if we concurrently 
rollback the + // writer + // we deadlock on engine#close for instance. + engineConfig.getThreadPool().executor(ThreadPool.Names.FLUSH).execute(new AbstractRunnable() { + @Override + public void onFailure(Exception e) { + if (isClosed.get() == false) { + logger.warn("failed to flush after merge has finished", e); + } else { + logger.info("failed to flush after merge has finished during shard close"); + } + } + + @Override + protected void doRun() { + // if we have no pending merges and we are supposed to flush once merges have finished to + // free up transient disk usage of the (presumably biggish) segments that were just merged + flush(); + } + }); + } else if (merge.getTotalBytesSize() >= engineConfig.getIndexSettings().getFlushAfterMergeThresholdSize().getBytes()) { + // we hit a significant merge which would allow us to free up memory if we'd commit it hence on the next change + // we should execute a flush on the next operation if that's a flush after inactive or indexing a document. + // we could fork a thread and do it right away but we try to minimize forking and piggyback on outside events. 
+ shouldPeriodicallyFlushAfterBigMerge.set(true); + } + } + + protected ElasticsearchMergeScheduler createMergeScheduler( + ShardId shardId, + IndexSettings indexSettings, + @Nullable ThreadPoolMergeExecutorService threadPoolMergeExecutorService + ) { + if (threadPoolMergeExecutorService != null) { + return new EngineThreadPoolMergeScheduler(shardId, indexSettings, threadPoolMergeExecutorService); + } else { + return new EngineConcurrentMergeScheduler(shardId, indexSettings); + } + } + + private final class EngineThreadPoolMergeScheduler extends ThreadPoolMergeScheduler { + EngineThreadPoolMergeScheduler( + ShardId shardId, + IndexSettings indexSettings, + ThreadPoolMergeExecutorService threadPoolMergeExecutorService + ) { + super(shardId, indexSettings, threadPoolMergeExecutorService, InternalEngine.this::estimateMergeBytes); + } + + @Override + protected synchronized void enableIndexingThrottling(int numRunningMerges, int numQueuedMerges, int configuredMaxMergeCount) { + logger.info( + "now throttling indexing: numRunningMerges={}, numQueuedMerges={}, maxNumMergesConfigured={}", + numRunningMerges, + numQueuedMerges, + configuredMaxMergeCount + ); + InternalEngine.this.activateThrottling(); + } + + @Override + protected synchronized void disableIndexingThrottling(int numRunningMerges, int numQueuedMerges, int configuredMaxMergeCount) { + logger.info( + "stop throttling indexing: numRunningMerges={}, numQueuedMerges={}, maxNumMergesConfigured={}", + numRunningMerges, + numQueuedMerges, + configuredMaxMergeCount + ); + InternalEngine.this.deactivateThrottling(); + } + + @Override + public synchronized void afterMerge(OnGoingMerge merge) { + maybeFlushAfterMerge(merge); + } + + @Override + protected void handleMergeException(final Throwable exc) { + mergeException(exc); + } } - private final class EngineMergeScheduler extends ElasticsearchConcurrentMergeScheduler { + private final class EngineConcurrentMergeScheduler extends ElasticsearchConcurrentMergeScheduler { 
private final AtomicInteger numMergesInFlight = new AtomicInteger(0); private final AtomicBoolean isThrottling = new AtomicBoolean(); - EngineMergeScheduler(ShardId shardId, IndexSettings indexSettings) { + EngineConcurrentMergeScheduler(ShardId shardId, IndexSettings indexSettings) { super(shardId, indexSettings); } @@ -2850,33 +2934,7 @@ public synchronized void afterMerge(OnGoingMerge merge) { deactivateThrottling(); } } - if (indexWriter.hasPendingMerges() == false - && System.nanoTime() - lastWriteNanos >= engineConfig.getFlushMergesAfter().nanos()) { - // NEVER do this on a merge thread since we acquire some locks blocking here and if we concurrently rollback the writer - // we deadlock on engine#close for instance. - engineConfig.getThreadPool().executor(ThreadPool.Names.FLUSH).execute(new AbstractRunnable() { - @Override - public void onFailure(Exception e) { - if (isClosed.get() == false) { - logger.warn("failed to flush after merge has finished", e); - } else { - logger.info("failed to flush after merge has finished during shard close"); - } - } - - @Override - protected void doRun() { - // if we have no pending merges and we are supposed to flush once merges have finished to - // free up transient disk usage of the (presumably biggish) segments that were just merged - flush(); - } - }); - } else if (merge.getTotalBytesSize() >= engineConfig.getIndexSettings().getFlushAfterMergeThresholdSize().getBytes()) { - // we hit a significant merge which would allow us to free up memory if we'd commit it hence on the next change - // we should execute a flush on the next operation if that's a flush after inactive or indexing a document. - // we could fork a thread and do it right away but we try to minimize forking and piggyback on outside events. 
- shouldPeriodicallyFlushAfterBigMerge.set(true); - } + maybeFlushAfterMerge(merge); } @Override @@ -3490,4 +3548,13 @@ T performActionWithDirectoryReader(SearcherScope scope, CheckedFunctionLucene issue) + * We can work iteratively in providing estimations for different types of fields and vector encodings. + */ +public class MergeMemoryEstimator { + + // Determined empirically by using Accountable.ramBytesUsed() during merges on Lucene using an instrumented build of Lucene. + // Didn't adapted the ramBytesUsed() code for this as it depends on graph levels and size for non-zero levels, which are difficult + // to estimate without actually building the graph. + public static final long HNSW_PER_DOC_ESTIMATION = 348L; + + /** + * Estimates the memory, in bytes, needed to merge the segments of the given merge. + */ + public static long estimateMergeMemory(MergePolicy.OneMerge merge, IndexReader indexReader) { + assert merge.segments.isEmpty() == false; + + long memoryNeeded = 0; + Map segments = merge.segments.stream().collect(Collectors.toMap(s -> s.info.name, s -> s)); + List leaves = indexReader.leaves(); + SegmentReader segmentReader = null; + for (LeafReaderContext leafReaderContext : leaves) { + segmentReader = Lucene.segmentReader(leafReaderContext.reader()); + String segmentName = segmentReader.getSegmentName(); + SegmentCommitInfo segmentCommitInfo = segments.get(segmentName); + if (segmentCommitInfo != null) { + memoryNeeded += estimateMergeMemory(segmentCommitInfo, segmentReader); + segments.remove(segmentName); + if (segments.isEmpty()) { + break; + } + } + } + + // Estimate segments without readers - the searcher may not have been refreshed yet, so estimate them with the field info from + // the last segment reader + if (segmentReader != null) { + for (SegmentCommitInfo segmentCommitInfo : segments.values()) { + memoryNeeded += estimateMergeMemory(segmentCommitInfo, segmentReader); + } + } + + return memoryNeeded; + } + + private static long 
estimateMergeMemory(SegmentCommitInfo segmentCommitInfo, SegmentReader reader) { + long maxMem = 0; + for (FieldInfo fieldInfo : reader.getFieldInfos()) { + maxMem = Math.max(maxMem, estimateFieldMemory(fieldInfo, segmentCommitInfo, reader)); + } + return maxMem; + } + + private static long estimateFieldMemory(FieldInfo fieldInfo, SegmentCommitInfo segmentCommitInfo, SegmentReader segmentReader) { + + long maxMem = 0; + if (fieldInfo.hasVectorValues()) { + maxMem = Math.max(maxMem, estimateVectorFieldMemory(fieldInfo, segmentCommitInfo, segmentReader)); + } + // TODO Work on estimations on other field infos when / if needed + + return maxMem; + } + + private static long estimateVectorFieldMemory(FieldInfo fieldInfo, SegmentCommitInfo segmentCommitInfo, SegmentReader segmentReader) { + KnnVectorsReader vectorsReader = segmentReader.getVectorReader(); + if (vectorsReader instanceof PerFieldKnnVectorsFormat.FieldsReader perFieldKnnVectorsFormat) { + vectorsReader = perFieldKnnVectorsFormat.getFieldReader(fieldInfo.getName()); + } + + return getVectorFieldEstimation(fieldInfo, segmentCommitInfo, vectorsReader); + } + + private static long getVectorFieldEstimation(FieldInfo fieldInfo, SegmentCommitInfo segmentCommitInfo, KnnVectorsReader vectorsReader) { + int numDocs = segmentCommitInfo.info.maxDoc() - segmentCommitInfo.getDelCount(); + if (vectorsReader instanceof Lucene99HnswVectorsReader) { + return numDocs * HNSW_PER_DOC_ESTIMATION; + + } else { + // Dominated by the heap byte buffer size used to write each vector + if (fieldInfo.getVectorEncoding() == VectorEncoding.FLOAT32) { + return fieldInfo.getVectorDimension() * VectorEncoding.FLOAT32.byteSize; + } + // Byte does not use buffering for writing but the IndexOutput directly + return 0; + } + } +} diff --git a/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorService.java b/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorService.java new file mode 100644 index 
0000000000000..9e74c19d8a85e --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorService.java @@ -0,0 +1,801 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.engine; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.elasticsearch.common.settings.ClusterSettings; +import org.elasticsearch.common.settings.Setting; +import org.elasticsearch.common.settings.Setting.Property; +import org.elasticsearch.common.unit.ByteSizeValue; +import org.elasticsearch.common.unit.RelativeByteSizeValue; +import org.elasticsearch.common.util.concurrent.ConcurrentCollections; +import org.elasticsearch.core.Nullable; +import org.elasticsearch.core.Releasable; +import org.elasticsearch.core.TimeValue; +import org.elasticsearch.env.NodeEnvironment; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler.MergeTask; +import org.elasticsearch.monitor.fs.FsInfo; +import org.elasticsearch.threadpool.Scheduler; +import org.elasticsearch.threadpool.ThreadPool; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Arrays; +import java.util.Comparator; +import java.util.IdentityHashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.PriorityQueue; +import java.util.Set; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.RejectedExecutionException; +import java.util.concurrent.atomic.AtomicInteger; 
+import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.ReentrantLock; +import java.util.function.Consumer; +import java.util.function.LongUnaryOperator; +import java.util.function.ToLongFunction; + +import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING; +import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING; +import static org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule.ABORT; +import static org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule.BACKLOG; +import static org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule.RUN; +import static org.elasticsearch.index.engine.ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING; +import static org.elasticsearch.monitor.fs.FsProbe.getFSInfo; + +public class ThreadPoolMergeExecutorService implements Closeable { + /** How frequently we check disk usage (default: 5 seconds). */ + public static final Setting INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING = Setting.positiveTimeSetting( + "indices.merge.disk.check_interval", + TimeValue.timeValueSeconds(5), + Property.Dynamic, + Property.NodeScope + ); + /** + * The occupied disk space threshold beyond which NO new merges are started. + * Conservatively, the estimated temporary disk space required for the to-be-started merge is counted as occupied disk space. + * Defaults to the routing allocation flood stage limit value (beyond which shards are toggled read-only). 
+ */ + public static final Setting INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING = new Setting<>( + "indices.merge.disk.watermark.high", + CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING, + (s) -> RelativeByteSizeValue.parseRelativeByteSizeValue(s, "indices.merge.disk.watermark.high"), + new Setting.Validator<>() { + @Override + public void validate(RelativeByteSizeValue value) {} + + @Override + public void validate(RelativeByteSizeValue value, Map, Object> settings, boolean isPresent) { + if (isPresent && settings.get(USE_THREAD_POOL_MERGE_SCHEDULER_SETTING).equals(Boolean.FALSE)) { + throw new IllegalArgumentException( + "indices merge watermark setting is only effective when [" + + USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey() + + "] is set to [true]" + ); + } + } + + @Override + public Iterator> settings() { + List> res = List.of(INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING, USE_THREAD_POOL_MERGE_SCHEDULER_SETTING); + return res.iterator(); + } + }, + Property.Dynamic, + Property.NodeScope + ); + /** + * The available disk space headroom below which NO new merges are started. + * Conservatively, the estimated temporary disk space required for the to-be-started merge is NOT counted as available disk space. + * Defaults to the routing allocation flood stage headroom value (below which shards are toggled read-only), + * unless the merge occupied disk space threshold is specified, in which case the default headroom value here is unset. 
+ */ + public static final Setting INDICES_MERGE_DISK_HIGH_MAX_HEADROOM_SETTING = new Setting<>( + "indices.merge.disk.watermark.high.max_headroom", + (settings) -> { + // if the user explicitly set a value for the occupied disk space threshold, disable the implicit headroom value + if (INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING.exists(settings)) { + return "-1"; + } else { + return CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.get(settings).toString(); + } + }, + (s) -> ByteSizeValue.parseBytesSizeValue(s, "indices.merge.disk.watermark.high.max_headroom"), + new Setting.Validator<>() { + @Override + public void validate(ByteSizeValue value) {} + + @Override + public void validate(final ByteSizeValue value, final Map, Object> settings, boolean isPresent) { + if (isPresent) { + if (value.equals(ByteSizeValue.MINUS_ONE)) { + throw new IllegalArgumentException( + "setting a headroom value to less than 0 is not supported, use [null] value to unset" + ); + } + if (settings.get(USE_THREAD_POOL_MERGE_SCHEDULER_SETTING).equals(Boolean.FALSE)) { + throw new IllegalArgumentException( + "indices merge max headroom setting is only effective when [" + + USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey() + + "] is set to [true]" + ); + } + } + final RelativeByteSizeValue highWatermark = (RelativeByteSizeValue) settings.get(INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING); + final ByteSizeValue highHeadroom = (ByteSizeValue) settings.get(INDICES_MERGE_DISK_HIGH_MAX_HEADROOM_SETTING); + if (highWatermark.isAbsolute() && highHeadroom.equals(ByteSizeValue.MINUS_ONE) == false) { + throw new IllegalArgumentException( + "indices merge max headroom setting is set, but indices merge disk watermark value is not a relative value [" + + highWatermark.getStringRep() + + "]" + ); + } + } + + @Override + public Iterator> settings() { + List> res = List.of( + INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING, + INDICES_MERGE_DISK_HIGH_MAX_HEADROOM_SETTING, + 
USE_THREAD_POOL_MERGE_SCHEDULER_SETTING + ); + return res.iterator(); + } + }, + Property.Dynamic, + Property.NodeScope + ); + /** + * Floor for IO write rate limit of individual merge tasks (we will never go any lower than this) + */ + static final ByteSizeValue MIN_IO_RATE = ByteSizeValue.ofMb(5L); + /** + * Ceiling for IO write rate limit of individual merge tasks (we will never go any higher than this) + */ + static final ByteSizeValue MAX_IO_RATE = ByteSizeValue.ofMb(10240L); + /** + * Initial value for IO write rate limit of individual merge tasks when doAutoIOThrottle is true + */ + static final ByteSizeValue START_IO_RATE = ByteSizeValue.ofMb(20L); + /** + * Total number of submitted merge tasks that support IO auto throttling and that have not yet been run (or aborted). + * This includes merge tasks that are currently running and that are backlogged (by their respective merge schedulers). + */ + private final AtomicInteger ioThrottledMergeTasksCount = new AtomicInteger(); + /** + * The merge tasks that are waiting execution. This does NOT include backlogged or currently executing merge tasks. + * For instance, this can be empty while there are backlogged merge tasks awaiting re-enqueuing. + * The budget (estimation) for a merge task is the disk space (still) required for it to complete. As the merge progresses, + * its budget decreases (as the bytes already written have been incorporated into the filesystem stats about the used disk space). + */ + private final MergeTaskPriorityBlockingQueue queuedMergeTasks = new MergeTaskPriorityBlockingQueue(); + /** + * The set of all merge tasks currently being executed by merge threads from the pool. + * These are tracked notably in order to be able to update their disk IO throttle rate, after they have started, while executing. 
+ */ + private final Set runningMergeTasks = ConcurrentCollections.newConcurrentSet(); + /** + * Current IO write throttle rate, in bytes per sec, that's in effect for all currently running merge tasks, + * across all {@link ThreadPoolMergeScheduler}s that use this instance of the queue. + */ + private final AtomicIORate targetIORateBytesPerSec = new AtomicIORate(START_IO_RATE.getBytes()); + private final ExecutorService executorService; + /** + * The maximum number of concurrently running merges, given the number of threads in the pool. + */ + private final int maxConcurrentMerges; + private final int concurrentMergesFloorLimitForThrottling; + private final int concurrentMergesCeilLimitForThrottling; + private final AvailableDiskSpacePeriodicMonitor availableDiskSpacePeriodicMonitor; + + private final List mergeEventListeners = new CopyOnWriteArrayList<>(); + + public static @Nullable ThreadPoolMergeExecutorService maybeCreateThreadPoolMergeExecutorService( + ThreadPool threadPool, + ClusterSettings clusterSettings, + NodeEnvironment nodeEnvironment + ) { + if (clusterSettings.get(USE_THREAD_POOL_MERGE_SCHEDULER_SETTING)) { + return new ThreadPoolMergeExecutorService(threadPool, clusterSettings, nodeEnvironment); + } else { + // register no-op setting update consumers so that setting validations work properly + // (some validations are bypassed if there are no update consumers registered), + // i.e. 
to reject watermark and max headroom updates if the thread pool merge scheduler is disabled + clusterSettings.addSettingsUpdateConsumer(INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING, (ignored) -> {}); + clusterSettings.addSettingsUpdateConsumer(INDICES_MERGE_DISK_HIGH_MAX_HEADROOM_SETTING, (ignored) -> {}); + clusterSettings.addSettingsUpdateConsumer(INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING, (ignored) -> {}); + return null; + } + } + + private ThreadPoolMergeExecutorService(ThreadPool threadPool, ClusterSettings clusterSettings, NodeEnvironment nodeEnvironment) { + this.executorService = threadPool.executor(ThreadPool.Names.MERGE); + this.maxConcurrentMerges = threadPool.info(ThreadPool.Names.MERGE).getMax(); + // the intent here is to throttle down whenever we submit a task and no other task is running + this.concurrentMergesFloorLimitForThrottling = 2; + this.concurrentMergesCeilLimitForThrottling = maxConcurrentMerges * 2; + assert concurrentMergesFloorLimitForThrottling <= concurrentMergesCeilLimitForThrottling; + this.availableDiskSpacePeriodicMonitor = startDiskSpaceMonitoring( + threadPool, + nodeEnvironment.dataPaths(), + clusterSettings, + (availableDiskSpaceByteSize) -> this.queuedMergeTasks.updateBudget(availableDiskSpaceByteSize.getBytes()) + ); + } + + boolean submitMergeTask(MergeTask mergeTask) { + assert mergeTask.hasStartedRunning() == false; + // first enqueue the runnable that runs exactly one merge task (the smallest it can find) + if (enqueueMergeTaskExecution() == false) { + // if the thread pool cannot run the merge, just abort it + mergeTask.abort(); + return false; + } else { + if (mergeTask.supportsIOThrottling()) { + // count enqueued merge tasks that support IO auto throttling, and maybe adjust IO rate for all + int currentTaskCount = ioThrottledMergeTasksCount.incrementAndGet(); + targetIORateBytesPerSec.update( + currentTargetIORateBytesPerSec -> newTargetIORateBytesPerSec( + currentTargetIORateBytesPerSec, + currentTaskCount, + 
concurrentMergesFloorLimitForThrottling, + concurrentMergesCeilLimitForThrottling + ), + (prevTargetIORateBytesPerSec, newTargetIORateBytesPerSec) -> { + // it's OK to have this method update merge tasks concurrently, with different targetMBPerSec values, + // as it's not important that all merge tasks are throttled to the same IO rate at all time. + // For performance reasons, we don't synchronize the updates to targetMBPerSec values with the update of running + // merges. + if (prevTargetIORateBytesPerSec != newTargetIORateBytesPerSec) { + runningMergeTasks.forEach(runningMergeTask -> { + if (runningMergeTask.supportsIOThrottling()) { + runningMergeTask.setIORateLimit(newTargetIORateBytesPerSec); + } + }); + } + } + ); + } + // then enqueue the merge task proper + enqueueMergeTask(mergeTask); + return true; + } + } + + void reEnqueueBackloggedMergeTask(MergeTask mergeTask) { + assert mergeTask.hasStartedRunning() == false; + enqueueMergeTask(mergeTask); + } + + private void enqueueMergeTask(MergeTask mergeTask) { + // To ensure that for a given merge onMergeQueued is called before onMergeAborted or onMergeCompleted, we call onMergeQueued + // before adding the merge task to the queue. Adding to the queue should not fail. + mergeEventListeners.forEach(l -> l.onMergeQueued(mergeTask.getOnGoingMerge(), mergeTask.getMergeMemoryEstimateBytes())); + boolean added = queuedMergeTasks.enqueue(mergeTask); + assert added; + } + + public boolean allDone() { + return queuedMergeTasks.isQueueEmpty() && runningMergeTasks.isEmpty() && ioThrottledMergeTasksCount.get() == 0L; + } + + /** + * Enqueues a runnable that executes exactly one merge task, the smallest that is runnable at some point in time. + * A merge task is not runnable if its scheduler already reached the configured max-allowed concurrency level. 
+ */ + private boolean enqueueMergeTaskExecution() { + try { + executorService.execute(() -> { + // one such runnable always executes a SINGLE merge task from the queue + // this is important for merge queue statistics, i.e. the executor's queue size represents the current amount of merges + while (true) { + PriorityBlockingQueueWithBudget.ElementWithReleasableBudget smallestMergeTaskWithReleasableBudget; + try { + // Will block if there are backlogged merges until they're enqueued again + // (for e.g. if the per-shard concurrent merges count limit is reached). + // Will also block if there is insufficient budget (i.e. estimated available disk space + // for the smallest merge task to run to completion) + smallestMergeTaskWithReleasableBudget = queuedMergeTasks.take(); + } catch (InterruptedException e) { + // An active worker thread has been interrupted while waiting for backlogged merges to be re-enqueued. + // In this case, we terminate the worker thread promptly and forget about the backlogged merges. + // It is OK to forget about merges in this case, because active worker threads are only interrupted + // when the node is shutting down, in which case in-memory accounting of merging activity is not relevant. + // As part of {@link java.util.concurrent.ThreadPoolExecutor#shutdownNow()} the thread pool's work queue + // is also drained, so any queued merge tasks are also forgotten. 
+ break; + } + try (var ignored = smallestMergeTaskWithReleasableBudget) { + MergeTask smallestMergeTask = smallestMergeTaskWithReleasableBudget.element(); + // let the task's scheduler decide if it can actually run the merge task now + ThreadPoolMergeScheduler.Schedule schedule = smallestMergeTask.schedule(); + if (schedule == RUN) { + runMergeTask(smallestMergeTask); + break; + } else if (schedule == ABORT) { + abortMergeTask(smallestMergeTask); + break; + } else { + assert schedule == BACKLOG; + // The merge task is backlogged by the merge scheduler, try to get the next smallest one. + // It's then the duty of the said merge scheduler to re-enqueue the backlogged merge task when + // itself decides that the merge task could be run. Note that it is possible that this merge + // task is re-enqueued and re-took before the budget hold-up here is released upon the next + // {@link PriorityBlockingQueueWithBudget#updateBudget} invocation. + } + } + } + }); + return true; + } catch (Throwable t) { + // cannot execute merges because the executor is shutting down + assert t instanceof RejectedExecutionException; + return false; + } + } + + private void runMergeTask(MergeTask mergeTask) { + assert mergeTask.hasStartedRunning() == false; + boolean added = runningMergeTasks.add(mergeTask); + assert added : "starting merge task [" + mergeTask + "] registered as already running"; + try { + if (mergeTask.supportsIOThrottling()) { + mergeTask.setIORateLimit(targetIORateBytesPerSec.get()); + } + mergeTask.run(); + } finally { + boolean removed = runningMergeTasks.remove(mergeTask); + assert removed : "completed merge task [" + mergeTask + "] not registered as running"; + if (mergeTask.supportsIOThrottling()) { + ioThrottledMergeTasksCount.decrementAndGet(); + } + mergeEventListeners.forEach(l -> l.onMergeCompleted(mergeTask.getOnGoingMerge())); + } + } + + private void abortMergeTask(MergeTask mergeTask) { + assert mergeTask.hasStartedRunning() == false; + assert 
runningMergeTasks.contains(mergeTask) == false; + try { + mergeTask.abort(); + } finally { + if (mergeTask.supportsIOThrottling()) { + ioThrottledMergeTasksCount.decrementAndGet(); + } + mergeEventListeners.forEach(l -> l.onMergeAborted(mergeTask.getOnGoingMerge())); + } + } + + /** + * Start monitoring the available disk space, and update the available budget for running merge tasks + * Note: this doesn't work correctly for nodes with multiple data paths, as it only considers the data path with the MOST + * available disk space. In this case, merges will NOT be blocked for shards on data paths with insufficient available + * disk space, as long as a single data path has enough available disk space to run merges for any shards that it stores + * (i.e. multiple data path is not really supported when blocking merges due to insufficient available disk space + * (but nothing blows up either, if using multiple data paths)) + */ + static AvailableDiskSpacePeriodicMonitor startDiskSpaceMonitoring( + ThreadPool threadPool, + NodeEnvironment.DataPath[] dataPaths, + ClusterSettings clusterSettings, + Consumer availableDiskSpaceUpdateConsumer + ) { + AvailableDiskSpacePeriodicMonitor availableDiskSpacePeriodicMonitor = new AvailableDiskSpacePeriodicMonitor( + dataPaths, + threadPool, + clusterSettings.get(INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING), + clusterSettings.get(INDICES_MERGE_DISK_HIGH_MAX_HEADROOM_SETTING), + clusterSettings.get(INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING), + availableDiskSpaceByteSize -> { + if (availableDiskSpaceByteSize.equals(ByteSizeValue.MINUS_ONE)) { + // The merge executor is currently unaware of the available disk space because of an error. + // Merges are NOT blocked if the available disk space is insufficient. 
+ availableDiskSpaceUpdateConsumer.accept(ByteSizeValue.ofBytes(Long.MAX_VALUE)); + } else { + availableDiskSpaceUpdateConsumer.accept(availableDiskSpaceByteSize); + } + } + ); + if (availableDiskSpacePeriodicMonitor.isScheduled() == false) { + // in case the disk space monitor starts off as disabled, then make sure that merging is NOT blocked + // (in the other case, merging IS blocked until the first update for the available disk space) + availableDiskSpaceUpdateConsumer.accept(ByteSizeValue.ofBytes(Long.MAX_VALUE)); + } + clusterSettings.addSettingsUpdateConsumer( + INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING, + availableDiskSpacePeriodicMonitor::setHighStageWatermark + ); + clusterSettings.addSettingsUpdateConsumer( + INDICES_MERGE_DISK_HIGH_MAX_HEADROOM_SETTING, + availableDiskSpacePeriodicMonitor::setHighStageMaxHeadroom + ); + clusterSettings.addSettingsUpdateConsumer( + INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING, + availableDiskSpacePeriodicMonitor::setCheckInterval + ); + return availableDiskSpacePeriodicMonitor; + } + + static class AvailableDiskSpacePeriodicMonitor implements Closeable { + private static final Logger LOGGER = LogManager.getLogger(AvailableDiskSpacePeriodicMonitor.class); + private final NodeEnvironment.DataPath[] dataPaths; + private final ThreadPool threadPool; + private volatile RelativeByteSizeValue highStageWatermark; + private volatile ByteSizeValue highStageMaxHeadroom; + private volatile TimeValue checkInterval; + private final Consumer updateConsumer; + private volatile boolean closed; + private volatile Scheduler.Cancellable monitor; + + AvailableDiskSpacePeriodicMonitor( + NodeEnvironment.DataPath[] dataPaths, + ThreadPool threadPool, + RelativeByteSizeValue highStageWatermark, + ByteSizeValue highStageMaxHeadroom, + TimeValue checkInterval, + Consumer updateConsumer + ) { + this.dataPaths = dataPaths; + this.threadPool = threadPool; + this.highStageWatermark = highStageWatermark; + this.highStageMaxHeadroom = 
highStageMaxHeadroom; + this.checkInterval = checkInterval; + this.updateConsumer = updateConsumer; + this.closed = false; + reschedule(); + } + + void setCheckInterval(TimeValue checkInterval) { + this.checkInterval = checkInterval; + reschedule(); + } + + void setHighStageWatermark(RelativeByteSizeValue highStageWatermark) { + this.highStageWatermark = highStageWatermark; + } + + void setHighStageMaxHeadroom(ByteSizeValue highStageMaxHeadroom) { + this.highStageMaxHeadroom = highStageMaxHeadroom; + } + + private synchronized void reschedule() { + if (monitor != null) { + monitor.cancel(); + } + if (closed == false && checkInterval.duration() > 0) { + // do an eager run, + // in order to increase responsiveness in case the period is long and something blocks waiting for the first update + threadPool.generic().execute(this::run); + monitor = threadPool.scheduleWithFixedDelay(this::run, checkInterval, threadPool.generic()); + } else { + monitor = null; + } + } + + boolean isScheduled() { + return monitor != null && closed == false; + } + + @Override + public void close() throws IOException { + closed = true; + reschedule(); + } + + private void run() { + if (closed) { + return; + } + FsInfo.Path mostAvailablePath = null; + IOException fsInfoException = null; + for (NodeEnvironment.DataPath dataPath : dataPaths) { + try { + FsInfo.Path fsInfo = getFSInfo(dataPath); // uncached + if (mostAvailablePath == null || mostAvailablePath.getAvailable().getBytes() < fsInfo.getAvailable().getBytes()) { + mostAvailablePath = fsInfo; + } + } catch (IOException e) { + if (fsInfoException == null) { + fsInfoException = e; + } else { + fsInfoException.addSuppressed(e); + } + } + } + if (fsInfoException != null) { + LOGGER.warn("unexpected exception reading filesystem info", fsInfoException); + } + if (mostAvailablePath == null) { + LOGGER.error("Cannot read filesystem info for node data paths " + Arrays.toString(dataPaths)); + updateConsumer.accept(ByteSizeValue.MINUS_ONE); + 
return; + } + long mostAvailableDiskSpaceBytes = mostAvailablePath.getAvailable().getBytes(); + // subtract the configured free disk space threshold + mostAvailableDiskSpaceBytes -= getFreeBytesThreshold(mostAvailablePath.getTotal(), highStageWatermark, highStageMaxHeadroom) + .getBytes(); + // clamp available space to 0 + long maxMergeSizeLimit = Math.max(0L, mostAvailableDiskSpaceBytes); + updateConsumer.accept(ByteSizeValue.ofBytes(maxMergeSizeLimit)); + } + + private static ByteSizeValue getFreeBytesThreshold( + ByteSizeValue total, + RelativeByteSizeValue watermark, + ByteSizeValue maxHeadroom + ) { + // If bytes are given, they can be readily returned as free bytes. + // If percentages are given, we need to calculate the free bytes. + if (watermark.isAbsolute()) { + return watermark.getAbsolute(); + } + return ByteSizeValue.subtract(total, watermark.calculateValue(total, maxHeadroom)); + } + } + + static class MergeTaskPriorityBlockingQueue extends PriorityBlockingQueueWithBudget { + MergeTaskPriorityBlockingQueue() { + // start with 0 budget (so takes on this queue will always block until {@link #updateBudget} is invoked) + // use the estimated *remaining* merge size as the budget function so that the disk space budget of taken (in-use) elements is + // updated according to the remaining disk space requirements of the currently running merge tasks + super(MergeTask::estimatedRemainingMergeSize, 0L); + } + + // exposed for tests + long getAvailableBudget() { + return super.availableBudget; + } + + // exposed for tests + MergeTask peekQueue() { + return enqueuedByBudget.peek(); + } + } + + /** + * Similar to a regular priority queue, but the {@link #take()} operation will also block if the smallest element + * (according to the specified "budget" function) is larger than an updatable limit budget. 
+ */ + static class PriorityBlockingQueueWithBudget { + private final ToLongFunction budgetFunction; + protected final PriorityQueue enqueuedByBudget; + private final IdentityHashMap unreleasedBudgetPerElement; + private final ReentrantLock lock; + private final Condition elementAvailable; + protected long availableBudget; + + PriorityBlockingQueueWithBudget(ToLongFunction budgetFunction, long initialAvailableBudget) { + this.budgetFunction = budgetFunction; + this.enqueuedByBudget = new PriorityQueue<>(64, Comparator.comparingLong(budgetFunction)); + this.unreleasedBudgetPerElement = new IdentityHashMap<>(); + this.lock = new ReentrantLock(); + this.elementAvailable = lock.newCondition(); + this.availableBudget = initialAvailableBudget; + } + + boolean enqueue(E e) { + final ReentrantLock lock = this.lock; + lock.lock(); + try { + enqueuedByBudget.offer(e); + elementAvailable.signal(); + } finally { + lock.unlock(); + } + return true; + } + + /** + * Dequeues the smallest element (according to the specified "budget" function) if its budget is below the available limit. + * This method invocation blocks if the queue is empty or the element's budget is above the available limit. + */ + ElementWithReleasableBudget take() throws InterruptedException { + final ReentrantLock lock = this.lock; + lock.lockInterruptibly(); + try { + E peek; + long peekBudget; + // blocks until the smallest budget element fits the currently available budget + while ((peek = enqueuedByBudget.peek()) == null || (peekBudget = budgetFunction.applyAsLong(peek)) > availableBudget) { + elementAvailable.await(); + } + // deducts and holds up that element's budget from the available budget + return newElementWithReleasableBudget(enqueuedByBudget.poll(), peekBudget); + } finally { + lock.unlock(); + } + } + + /** + * Updates the available budged given the passed-in argument, from which it deducts the budget hold up by taken elements + * that are still in use. 
The budget of in-use elements is also updated (by re-applying the budget function). + * The newly updated budget is used to potentially block {@link #take()} operations if the smallest-budget enqueued element + * is over this newly computed available budget. + */ + void updateBudget(long availableBudget) { + final ReentrantLock lock = this.lock; + lock.lock(); + try { + this.availableBudget = availableBudget; + // update the per-element budget (these are all the elements that are using any budget) + unreleasedBudgetPerElement.replaceAll((e, v) -> budgetFunction.applyAsLong(e.element())); + // available budget is decreased by the used per-element budget (for all dequeued elements that are still in use) + this.availableBudget -= unreleasedBudgetPerElement.values().stream().mapToLong(i -> i).sum(); + elementAvailable.signalAll(); + } finally { + lock.unlock(); + } + } + + boolean isQueueEmpty() { + return enqueuedByBudget.isEmpty(); + } + + int queueSize() { + return enqueuedByBudget.size(); + } + + private ElementWithReleasableBudget newElementWithReleasableBudget(E element, long budget) { + ElementWithReleasableBudget elementWithReleasableBudget = new ElementWithReleasableBudget(element); + assert this.lock.isHeldByCurrentThread(); + // the taken element holds up some budget + var prev = this.unreleasedBudgetPerElement.put(elementWithReleasableBudget, budget); + assert prev == null; + this.availableBudget -= budget; + assert this.availableBudget >= 0L; + return elementWithReleasableBudget; + } + + private void release(ElementWithReleasableBudget elementWithReleasableBudget) { + final ReentrantLock lock = this.lock; + lock.lock(); + try { + assert elementWithReleasableBudget.isClosed() == false; + // when the taken element is not used anymore, it will not influence subsequent computations for available budget, + // but its allotted budget is not yet released + var val = unreleasedBudgetPerElement.remove(elementWithReleasableBudget); + assert val != null; + } finally 
{ + lock.unlock(); + } + } + + private boolean isReleased(ElementWithReleasableBudget elementWithReleasableBudget) { + return unreleasedBudgetPerElement.containsKey(elementWithReleasableBudget) == false; + } + + class ElementWithReleasableBudget implements Releasable { + private final E element; + + private ElementWithReleasableBudget(E element) { + this.element = element; + } + + /** + * Must be invoked when the caller is done with the element that it previously took from the queue. + * The budget it's holding is not immediately released, but the next time {@link #updateBudget(long)} + * is invoked this element's budget won't deduct from the total available. + */ + @Override + public void close() { + PriorityBlockingQueueWithBudget.this.release(this); + } + + boolean isClosed() { + return PriorityBlockingQueueWithBudget.this.isReleased(this); + } + + E element() { + return element; + } + } + } + + private static long newTargetIORateBytesPerSec( + long currentTargetIORateBytesPerSec, + int currentlySubmittedIOThrottledMergeTasks, + int concurrentMergesFloorLimitForThrottling, + int concurrentMergesCeilLimitForThrottling + ) { + final long newTargetIORateBytesPerSec; + if (currentlySubmittedIOThrottledMergeTasks < concurrentMergesFloorLimitForThrottling + && currentTargetIORateBytesPerSec > MIN_IO_RATE.getBytes()) { + // decrease target IO rate by 10% (capped) + newTargetIORateBytesPerSec = Math.max( + MIN_IO_RATE.getBytes(), + currentTargetIORateBytesPerSec - currentTargetIORateBytesPerSec / 10L + ); + } else if (currentlySubmittedIOThrottledMergeTasks > concurrentMergesCeilLimitForThrottling + && currentTargetIORateBytesPerSec < MAX_IO_RATE.getBytes()) { + // increase target IO rate by 20% (capped) + newTargetIORateBytesPerSec = Math.min( + MAX_IO_RATE.getBytes(), + currentTargetIORateBytesPerSec + currentTargetIORateBytesPerSec / 5L + ); + } else { + newTargetIORateBytesPerSec = currentTargetIORateBytesPerSec; + } + return newTargetIORateBytesPerSec; + } + + 
static class AtomicIORate { + private final AtomicLong ioRate; + + AtomicIORate(long initialIORate) { + ioRate = new AtomicLong(initialIORate); + } + + long get() { + return ioRate.get(); + } + + // Exactly like {@link AtomicLong#updateAndGet} but calls the consumer rather than return the new (updated) value. + // The consumer receives both the previous and the updated values (which can be equal). + void update(LongUnaryOperator updateFunction, AtomicIORate.UpdateConsumer updateConsumer) { + long prev = ioRate.get(), next = 0L; + for (boolean haveNext = false;;) { + if (haveNext == false) next = updateFunction.applyAsLong(prev); + if (ioRate.weakCompareAndSetVolatile(prev, next)) { + updateConsumer.accept(prev, next); + return; + } + haveNext = (prev == (prev = ioRate.get())); + } + } + + @FunctionalInterface + interface UpdateConsumer { + void accept(long prev, long next); + } + } + + public boolean usingMaxTargetIORateBytesPerSec() { + return MAX_IO_RATE.getBytes() == targetIORateBytesPerSec.get(); + } + + public void registerMergeEventListener(MergeEventListener consumer) { + mergeEventListeners.add(consumer); + } + + // exposed for tests + Set getRunningMergeTasks() { + return runningMergeTasks; + } + + // exposed for tests + int getMergeTasksQueueLength() { + return queuedMergeTasks.queueSize(); + } + + // exposed for tests + long getDiskSpaceAvailableForNewMergeTasks() { + return queuedMergeTasks.getAvailableBudget(); + } + + // exposed for tests and stats + long getTargetIORateBytesPerSec() { + return targetIORateBytesPerSec.get(); + } + + // exposed for tests + int getMaxConcurrentMerges() { + return maxConcurrentMerges; + } + + @Override + public void close() throws IOException { + availableDiskSpacePeriodicMonitor.close(); + } +} diff --git a/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeScheduler.java b/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeScheduler.java new file mode 100644 index 
0000000000000..78a9695bea540 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/engine/ThreadPoolMergeScheduler.java @@ -0,0 +1,620 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.engine; + +import org.apache.logging.log4j.Logger; +import org.apache.lucene.index.MergePolicy; +import org.apache.lucene.index.MergeRateLimiter; +import org.apache.lucene.index.MergeScheduler; +import org.apache.lucene.index.MergeTrigger; +import org.apache.lucene.store.AlreadyClosedException; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FilterDirectory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RateLimitedIndexOutput; +import org.elasticsearch.common.logging.Loggers; +import org.elasticsearch.common.settings.Setting; +import org.elasticsearch.common.unit.ByteSizeValue; +import org.elasticsearch.core.TimeValue; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.MergeSchedulerConfig; +import org.elasticsearch.index.merge.MergeStats; +import org.elasticsearch.index.merge.OnGoingMerge; +import org.elasticsearch.index.shard.ShardId; + +import java.io.IOException; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; +import java.util.PriorityQueue; +import java.util.Set; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import 
java.util.concurrent.atomic.AtomicLong; + +public class ThreadPoolMergeScheduler extends MergeScheduler implements ElasticsearchMergeScheduler { + public static final Setting USE_THREAD_POOL_MERGE_SCHEDULER_SETTING = Setting.boolSetting( + "indices.merge.scheduler.use_thread_pool", + true, + Setting.Property.NodeScope + ); + private final ShardId shardId; + private final MergeSchedulerConfig config; + protected final Logger logger; + private final MergeTracking mergeTracking; + private final ThreadPoolMergeExecutorService threadPoolMergeExecutorService; + private final PriorityQueue backloggedMergeTasks = new PriorityQueue<>( + 16, + Comparator.comparingLong(MergeTask::estimatedRemainingMergeSize) + ); + private final Map runningMergeTasks = new HashMap<>(); + // set when incoming merges should be throttled (i.e. restrict the indexing rate) + private final AtomicBoolean shouldThrottleIncomingMerges = new AtomicBoolean(); + // how many {@link MergeTask}s have kicked off (this is used to name them). + private final AtomicLong submittedMergeTaskCount = new AtomicLong(); + private final AtomicLong doneMergeTaskCount = new AtomicLong(); + private final CountDownLatch closedWithNoRunningMerges = new CountDownLatch(1); + private volatile boolean closed = false; + private final MergeMemoryEstimateProvider mergeMemoryEstimateProvider; + + /** + * Creates a thread-pool-based merge scheduler that runs merges in a thread pool. 
+ * + * @param shardId the shard id associated with this merge scheduler + * @param indexSettings used to obtain the {@link MergeSchedulerConfig} + * @param threadPoolMergeExecutorService the executor service used to execute merge tasks from this scheduler + * @param mergeMemoryEstimateProvider provides an estimate for how much memory a merge will take + */ + public ThreadPoolMergeScheduler( + ShardId shardId, + IndexSettings indexSettings, + ThreadPoolMergeExecutorService threadPoolMergeExecutorService, + MergeMemoryEstimateProvider mergeMemoryEstimateProvider + ) { + this.shardId = shardId; + this.config = indexSettings.getMergeSchedulerConfig(); + this.logger = Loggers.getLogger(getClass(), shardId); + this.mergeTracking = new MergeTracking( + logger, + () -> this.config.isAutoThrottle() + ? ByteSizeValue.ofBytes(threadPoolMergeExecutorService.getTargetIORateBytesPerSec()).getMbFrac() + : Double.POSITIVE_INFINITY + ); + this.threadPoolMergeExecutorService = threadPoolMergeExecutorService; + this.mergeMemoryEstimateProvider = mergeMemoryEstimateProvider; + } + + @Override + public Set onGoingMerges() { + return mergeTracking.onGoingMerges(); + } + + @Override + public MergeStats stats() { + return mergeTracking.stats(); + } + + @Override + public MergeScheduler getMergeScheduler() { + return this; + } + + @Override + public void refreshConfig() { + // if maxMergeCount changed, maybe we need to toggle merge task throttling + checkMergeTaskThrottling(); + // if maxThreadCount changed, maybe some backlogged merges are now allowed to run + enqueueBackloggedTasks(); + } + + @Override + public void merge(MergeSource mergeSource, MergeTrigger trigger) { + if (closed) { + // avoid pulling from the merge source when closing + return; + } + MergePolicy.OneMerge merge = null; + try { + merge = mergeSource.getNextMerge(); + } catch (IllegalStateException e) { + if (verbose()) { + message("merge task poll failed, likely that index writer is failed"); + } + // ignore 
exception, we expect the IW failure to be logged elsewhere + } + if (merge != null) { + submitNewMergeTask(mergeSource, merge, trigger); + } + } + + @Override + public MergeScheduler clone() { + // Lucene IW makes a clone internally but since we hold on to this instance + // the clone will just be the identity. + return this; + } + + /** + * A callback allowing for custom logic before an actual merge starts. + */ + protected void beforeMerge(OnGoingMerge merge) {} + + /** + * A callback allowing for custom logic after an actual merge starts. + */ + protected void afterMerge(OnGoingMerge merge) {} + + /** + * A callback allowing for custom logic when a merge is queued. + */ + protected void mergeQueued(OnGoingMerge merge) {} + + /** + * A callback allowing for custom logic after a merge is executed or aborted. + */ + protected void mergeExecutedOrAborted(OnGoingMerge merge) {} + + /** + * A callback that's invoked when indexing should throttle down indexing in order to let merging to catch up. + */ + protected void enableIndexingThrottling(int numRunningMerges, int numQueuedMerges, int configuredMaxMergeCount) {} + + /** + * A callback that's invoked when indexing should un-throttle because merging caught up. + * This is invoked sometime after {@link #enableIndexingThrottling(int, int, int)} was invoked in the first place. 
+ */ + protected void disableIndexingThrottling(int numRunningMerges, int numQueuedMerges, int configuredMaxMergeCount) {} + + /** + * Returns true if scheduled merges should be skipped (aborted) + */ + protected boolean shouldSkipMerge() { + return false; + } + + /** + * Returns true if IO-throttling is enabled + */ + protected boolean isAutoThrottle() { + return config.isAutoThrottle(); + } + + /** + * Returns the maximum number of active merges before being throttled + */ + protected int getMaxMergeCount() { + return config.getMaxMergeCount(); + } + + /** + * Returns the maximum number of threads running merges before being throttled + */ + protected int getMaxThreadCount() { + return config.getMaxThreadCount(); + } + + /** + * A callback for exceptions thrown while merging. + */ + protected void handleMergeException(Throwable t) { + throw new MergePolicy.MergeException(t); + } + + // package-private for tests + boolean submitNewMergeTask(MergeSource mergeSource, MergePolicy.OneMerge merge, MergeTrigger mergeTrigger) { + try { + MergeTask mergeTask = newMergeTask(mergeSource, merge, mergeTrigger); + mergeQueued(mergeTask.onGoingMerge); + return threadPoolMergeExecutorService.submitMergeTask(mergeTask); + } finally { + checkMergeTaskThrottling(); + } + } + + // package-private for tests + MergeTask newMergeTask(MergeSource mergeSource, MergePolicy.OneMerge merge, MergeTrigger mergeTrigger) { + // forced merges, as well as merges triggered when closing a shard, always run un-IO-throttled + boolean isAutoThrottle = mergeTrigger != MergeTrigger.CLOSING && merge.getStoreMergeInfo().mergeMaxNumSegments() == -1; + // IO throttling cannot be toggled for existing merge tasks, only new merge tasks pick up the updated IO throttling setting + long estimateMergeMemoryBytes = mergeMemoryEstimateProvider.estimateMergeMemoryBytes(merge); + return new MergeTask( + mergeSource, + merge, + isAutoThrottle && isAutoThrottle(), + "Lucene Merge Task #" + 
submittedMergeTaskCount.incrementAndGet() + " for shard " + shardId, + estimateMergeMemoryBytes + ); + } + + private void checkMergeTaskThrottling() { + long submittedMergesCount = submittedMergeTaskCount.get(); + long doneMergesCount = doneMergeTaskCount.get(); + int runningMergesCount = runningMergeTasks.size(); + int configuredMaxMergeCount = getMaxMergeCount(); + // both currently running and enqueued merge tasks are considered "active" for throttling purposes + int activeMerges = (int) (submittedMergesCount - doneMergesCount); + if (activeMerges > configuredMaxMergeCount + // only throttle indexing if disk IO is un-throttled, and we still can't keep up with the merge load + && threadPoolMergeExecutorService.usingMaxTargetIORateBytesPerSec() + && shouldThrottleIncomingMerges.get() == false) { + // maybe enable merge task throttling + synchronized (shouldThrottleIncomingMerges) { + if (shouldThrottleIncomingMerges.getAndSet(true) == false) { + enableIndexingThrottling(runningMergesCount, activeMerges - runningMergesCount, configuredMaxMergeCount); + } + } + } else if (activeMerges <= configuredMaxMergeCount && shouldThrottleIncomingMerges.get()) { + // maybe disable merge task throttling + synchronized (shouldThrottleIncomingMerges) { + if (shouldThrottleIncomingMerges.getAndSet(false)) { + disableIndexingThrottling(runningMergesCount, activeMerges - runningMergesCount, configuredMaxMergeCount); + } + } + } + } + + // exposed for tests + // synchronized so that {@code #closed}, {@code #runningMergeTasks} and {@code #backloggedMergeTasks} are modified atomically + synchronized Schedule schedule(MergeTask mergeTask) { + assert mergeTask.hasStartedRunning() == false; + if (closed) { + // do not run or backlog tasks when closing the merge scheduler, instead abort them + return Schedule.ABORT; + } else if (shouldSkipMerge()) { + if (verbose()) { + message(String.format(Locale.ROOT, "skipping merge task %s", mergeTask)); + } + return Schedule.ABORT; + } else if 
(runningMergeTasks.size() < getMaxThreadCount()) { + boolean added = runningMergeTasks.put(mergeTask.onGoingMerge.getMerge(), mergeTask) == null; + assert added : "starting merge task [" + mergeTask + "] registered as already running"; + return Schedule.RUN; + } else { + assert mergeTask.hasStartedRunning() == false; + backloggedMergeTasks.add(mergeTask); + return Schedule.BACKLOG; + } + } + + // exposed for tests + synchronized void mergeTaskFinishedRunning(MergeTask mergeTask) { + boolean removed = runningMergeTasks.remove(mergeTask.onGoingMerge.getMerge()) != null; + assert removed : "completed merge task [" + mergeTask + "] not registered as running"; + // when one merge is done, maybe a backlogged one can now execute + enqueueBackloggedTasks(); + // signal here, because, when closing, we wait for all currently running merges to finish + maybeSignalAllMergesDoneAfterClose(); + } + + private void mergeTaskDone(OnGoingMerge merge) { + doneMergeTaskCount.incrementAndGet(); + mergeExecutedOrAborted(merge); + checkMergeTaskThrottling(); + } + + private synchronized void maybeSignalAllMergesDoneAfterClose() { + if (closed && runningMergeTasks.isEmpty()) { + closedWithNoRunningMerges.countDown(); + } + } + + private synchronized void enqueueBackloggedTasks() { + int maxBackloggedTasksToEnqueue = getMaxThreadCount() - runningMergeTasks.size(); + // enqueue all backlogged tasks when closing, as the queue expects all backlogged tasks to always be enqueued back + while (closed || maxBackloggedTasksToEnqueue-- > 0) { + MergeTask backloggedMergeTask = backloggedMergeTasks.poll(); + if (backloggedMergeTask == null) { + break; + } + // no need to abort merge tasks now, they will be aborted on the spot when the scheduler gets to run them + threadPoolMergeExecutorService.reEnqueueBackloggedMergeTask(backloggedMergeTask); + } + } + + /** + * Does the actual merge, by calling {@link org.apache.lucene.index.MergeScheduler.MergeSource#merge} + */ + void doMerge(MergeSource 
mergeSource, MergePolicy.OneMerge oneMerge) { + try { + mergeSource.merge(oneMerge); + } catch (Throwable t) { + // OK to ignore MergeAbortedException. This is what Lucene's ConcurrentMergeScheduler does. + if (t instanceof MergePolicy.MergeAbortedException == false) { + handleMergeException(t); + } + } + } + + @Override + public Directory wrapForMerge(MergePolicy.OneMerge merge, Directory in) { + // Return a wrapped Directory which has rate-limited output. + // Note: the rate limiter is only per thread (per merge). So, if there are multiple merge threads running + // the combined IO rate per node is, roughly, 'thread_pool_size * merge_queue#targetMBPerSec', as + // the per-thread IO rate is updated, best effort, for all running merge threads concomitantly. + if (merge.isAborted()) { + // merges can theoretically be aborted at any moment + return in; + } + MergeTask mergeTask = runningMergeTasks.get(merge); + if (mergeTask == null) { + throw new IllegalStateException("associated merge task for executing merge not found"); + } + return new FilterDirectory(in) { + @Override + public IndexOutput createOutput(String name, IOContext context) throws IOException { + ensureOpen(); + + // This Directory is only supposed to be used during merging, + // so all writes should have MERGE context, else there is a bug + // somewhere that is failing to pass down the right IOContext: + assert context.context() == IOContext.Context.MERGE : "got context=" + context.context(); + + return new RateLimitedIndexOutput(mergeTask.rateLimiter, in.createOutput(name, context)); + } + }; + } + + class MergeTask implements Runnable { + private final String name; + private final AtomicLong mergeStartTimeNS; + private final MergeSource mergeSource; + private final OnGoingMerge onGoingMerge; + private final MergeRateLimiter rateLimiter; + private final boolean supportsIOThrottling; + private final long mergeMemoryEstimateBytes; + + MergeTask( + MergeSource mergeSource, + MergePolicy.OneMerge merge, 
+ boolean supportsIOThrottling, + String name, + long mergeMemoryEstimateBytes + ) { + this.name = name; + this.mergeStartTimeNS = new AtomicLong(); + this.mergeSource = mergeSource; + this.onGoingMerge = new OnGoingMerge(merge); + this.rateLimiter = new MergeRateLimiter(merge.getMergeProgress()); + this.supportsIOThrottling = supportsIOThrottling; + this.mergeMemoryEstimateBytes = mergeMemoryEstimateBytes; + } + + Schedule schedule() { + return ThreadPoolMergeScheduler.this.schedule(this); + } + + public boolean supportsIOThrottling() { + return supportsIOThrottling; + } + + public void setIORateLimit(long ioRateLimitBytesPerSec) { + if (supportsIOThrottling == false) { + throw new IllegalArgumentException("merge task cannot be IO throttled"); + } + this.rateLimiter.setMBPerSec(ByteSizeValue.ofBytes(ioRateLimitBytesPerSec).getMbFrac()); + } + + /** + * Returns {@code true} if this task is currently running, or was run in the past. + * An aborted task (see {@link #abort()}) is considered as NOT run. + */ + public boolean hasStartedRunning() { + boolean isRunning = mergeStartTimeNS.get() > 0L; + assert isRunning != false || rateLimiter.getTotalBytesWritten() == 0L; + return isRunning; + } + + /** + * Runs the merge associated to this task. MUST be invoked after {@link #schedule()} returned {@link Schedule#RUN}, + * to confirm that the associated {@link MergeScheduler} assents to run the merge. + * Either one of {@link #run()} or {@link #abort()} MUST be invoked exactly once for every {@link MergeTask}. + * After the merge is finished, this will also submit any follow-up merges from the task's merge source. 
+ */ + @Override + public void run() { + assert hasStartedRunning() == false; + assert ThreadPoolMergeScheduler.this.runningMergeTasks.containsKey(onGoingMerge.getMerge()) + : "runNowOrBacklog must be invoked before actually running the merge task"; + try { + beforeMerge(onGoingMerge); + try { + if (mergeStartTimeNS.compareAndSet(0L, System.nanoTime()) == false) { + throw new IllegalStateException("The merge task is already started or aborted"); + } + mergeTracking.mergeStarted(onGoingMerge); + if (verbose()) { + message(String.format(Locale.ROOT, "merge task %s start", this)); + } + try { + doMerge(mergeSource, onGoingMerge.getMerge()); + if (verbose()) { + message( + String.format( + Locale.ROOT, + "merge task %s merge segment [%s] done estSize=%.1f MB (written=%.1f MB) " + + "runTime=%.1fs (stopped=%.1fs, paused=%.1fs) rate=%s", + this, + getSegmentName(onGoingMerge.getMerge()), + bytesToMB(onGoingMerge.getMerge().estimatedMergeBytes), + bytesToMB(rateLimiter.getTotalBytesWritten()), + nsToSec(System.nanoTime() - mergeStartTimeNS.get()), + nsToSec(rateLimiter.getTotalStoppedNS()), + nsToSec(rateLimiter.getTotalPausedNS()), + rateToString(rateLimiter.getMBPerSec()) + ) + ); + } + } finally { + long tookMS = TimeValue.nsecToMSec(System.nanoTime() - mergeStartTimeNS.get()); + mergeTracking.mergeFinished(onGoingMerge.getMerge(), onGoingMerge, tookMS); + } + } finally { + afterMerge(onGoingMerge); + } + } finally { + if (verbose()) { + message(String.format(Locale.ROOT, "merge task %s end", this)); + } + try { + mergeTaskFinishedRunning(this); + } finally { + mergeTaskDone(onGoingMerge); + } + try { + // kick-off any follow-up merge + merge(mergeSource, MergeTrigger.MERGE_FINISHED); + } catch (@SuppressWarnings("unused") AlreadyClosedException ace) { + // OK, this is what the {@code ConcurrentMergeScheduler} does + } + } + } + + /** + * Aborts the merge task, for e.g. when the {@link MergeScheduler}, or the + * {@link ThreadPoolMergeExecutorService} are closing. 
Either one of {@link #run()} or {@link #abort()} + * MUST be invoked exactly once for every {@link MergeTask}. + * An aborted merge means that the segments involved will be made available + * (by the {@link org.apache.lucene.index.IndexWriter}) to any subsequent merges. + */ + void abort() { + assert hasStartedRunning() == false; + assert ThreadPoolMergeScheduler.this.runningMergeTasks.containsKey(onGoingMerge.getMerge()) == false + : "cannot abort a merge task that's already running"; + if (verbose()) { + message(String.format(Locale.ROOT, "merge task %s aborted", this)); + } + // {@code IndexWriter} checks the abort flag internally, while running the merge. + // The segments of an aborted merge become available to subsequent merges. + onGoingMerge.getMerge().setAborted(); + try { + if (verbose()) { + message(String.format(Locale.ROOT, "merge task %s start abort", this)); + } + // mark the merge task as running, even though the merge itself is aborted and the task will run for a brief time only + if (mergeStartTimeNS.compareAndSet(0L, System.nanoTime()) == false) { + throw new IllegalStateException("The merge task is already started or aborted"); + } + // This ensures {@code OneMerge#close} gets invoked. + // {@code IndexWriter} considers a merge as "running" once it has been pulled from the {@code MergeSource#getNextMerge}, + // so in theory it's not enough to just call {@code MergeSource#onMergeFinished} on it (as for "pending" ones). + doMerge(mergeSource, onGoingMerge.getMerge()); + } finally { + if (verbose()) { + message(String.format(Locale.ROOT, "merge task %s end abort", this)); + } + mergeTaskDone(onGoingMerge); + } + } + + /** + * Before the merge task started running, this returns the estimated required disk space for the merge to complete + * (i.e. the estimated disk space size of the resulting segment following the merge). + * While the merge is running, the returned estimation is updated to take into account the data that's already been written. 
+ * After the merge completes, the estimation returned here should ideally be close to "0". + */ + long estimatedRemainingMergeSize() { + // TODO is it possible that `estimatedMergeBytes` be `0` for correctly initialized merges, + // or is it always the case that if `estimatedMergeBytes` is `0` that means that the merge has not yet been initialized? + long estimatedMergeSize = onGoingMerge.getMerge().getStoreMergeInfo().estimatedMergeBytes(); + return Math.max(0L, estimatedMergeSize - rateLimiter.getTotalBytesWritten()); + } + + public long getMergeMemoryEstimateBytes() { + return mergeMemoryEstimateBytes; + } + + public OnGoingMerge getOnGoingMerge() { + return onGoingMerge; + } + + @Override + public String toString() { + return name + (onGoingMerge.getMerge().isAborted() ? " (aborted)" : ""); + } + } + + @Override + /* Overridden to route messages to our logger too, in addition to the {@link org.apache.lucene.util.InfoStream} that lucene uses. */ + protected boolean verbose() { + if (logger.isTraceEnabled()) { + return true; + } + return super.verbose(); + } + + @Override + /* Overridden to route messages to our logger too, in addition to the {@link org.apache.lucene.util.InfoStream} that lucene uses. 
*/ + protected void message(String message) { + if (logger.isTraceEnabled()) { + logger.trace("{}", message); + } + super.message(message); + } + + @Override + public void close() throws IOException { + synchronized (this) { + closed = true; + // enqueue any backlogged merge tasks, because the merge queue assumes that the backlogged tasks are always re-enqueued + enqueueBackloggedTasks(); + // signal if there aren't any currently running merges + maybeSignalAllMergesDoneAfterClose(); + } + try { + closedWithNoRunningMerges.await(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } finally { + // this closes an executor that may be used by ongoing merges, so better close it only after all running merges finished + super.close(); + } + } + + // exposed for tests + PriorityQueue getBackloggedMergeTasks() { + return backloggedMergeTasks; + } + + // exposed for tests + Map getRunningMergeTasks() { + return runningMergeTasks; + } + + private static double nsToSec(long ns) { + return ns / (double) TimeUnit.SECONDS.toNanos(1); + } + + private static double bytesToMB(long bytes) { + return bytes / 1024. / 1024.; + } + + private static String getSegmentName(MergePolicy.OneMerge merge) { + return merge.getMergeInfo() != null ? 
merge.getMergeInfo().info.name : "_na_"; + } + + private static String rateToString(double mbPerSec) { + if (mbPerSec == 0.0) { + return "stopped"; + } else if (mbPerSec == Double.POSITIVE_INFINITY) { + return "unlimited"; + } else { + return String.format(Locale.ROOT, "%.1f MB/sec", mbPerSec); + } + } + + enum Schedule { + ABORT, + RUN, + BACKLOG + } +} diff --git a/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java b/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java index e67debfe443bf..ed31a74ac326b 100644 --- a/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java @@ -96,6 +96,7 @@ import org.elasticsearch.index.engine.SafeCommitInfo; import org.elasticsearch.index.engine.Segment; import org.elasticsearch.index.engine.SegmentsStats; +import org.elasticsearch.index.engine.ThreadPoolMergeExecutorService; import org.elasticsearch.index.fielddata.FieldDataStats; import org.elasticsearch.index.fielddata.ShardFieldData; import org.elasticsearch.index.flush.FlushStats; @@ -195,6 +196,8 @@ public class IndexShard extends AbstractIndexShardComponent implements IndicesClusterStateService.Shard { private final ThreadPool threadPool; + @Nullable + private final ThreadPoolMergeExecutorService threadPoolMergeExecutorService; private final MapperService mapperService; private final IndexCache indexCache; private final Store store; @@ -318,6 +321,7 @@ public IndexShard( final IndexEventListener indexEventListener, final CheckedFunction indexReaderWrapper, final ThreadPool threadPool, + final ThreadPoolMergeExecutorService threadPoolMergeExecutorService, final BigArrays bigArrays, final Engine.Warmer warmer, final List searchOperationListener, @@ -344,6 +348,7 @@ public IndexShard( this.indexSortSupplier = indexSortSupplier; this.indexEventListener = indexEventListener; this.threadPool = threadPool; + this.threadPoolMergeExecutorService = 
threadPoolMergeExecutorService; this.mapperService = mapperService; this.indexCache = indexCache; this.internalIndexingStats = new InternalIndexingStats(); @@ -3559,6 +3564,7 @@ private EngineConfig newEngineConfig(LongSupplier globalCheckpointSupplier) { return new EngineConfig( shardId, threadPool, + threadPoolMergeExecutorService, indexSettings, warmer, store, diff --git a/server/src/main/java/org/elasticsearch/indices/IndicesService.java b/server/src/main/java/org/elasticsearch/indices/IndicesService.java index 1df5bddeff9e3..7cb92ec7b3f14 100644 --- a/server/src/main/java/org/elasticsearch/indices/IndicesService.java +++ b/server/src/main/java/org/elasticsearch/indices/IndicesService.java @@ -98,6 +98,7 @@ import org.elasticsearch.index.engine.InternalEngineFactory; import org.elasticsearch.index.engine.NoOpEngine; import org.elasticsearch.index.engine.ReadOnlyEngine; +import org.elasticsearch.index.engine.ThreadPoolMergeExecutorService; import org.elasticsearch.index.fielddata.IndexFieldDataCache; import org.elasticsearch.index.flush.FlushStats; import org.elasticsearch.index.get.GetStats; @@ -231,6 +232,8 @@ public class IndicesService extends AbstractLifecycleComponent private final IndicesFieldDataCache indicesFieldDataCache; private final CacheCleaner cacheCleaner; private final ThreadPool threadPool; + @Nullable + private final ThreadPoolMergeExecutorService threadPoolMergeExecutorService; private final CircuitBreakerService circuitBreakerService; private final BigArrays bigArrays; private final ScriptService scriptService; @@ -308,6 +311,11 @@ protected void doStart() { this.bigArrays = builder.bigArrays; this.scriptService = builder.scriptService; this.clusterService = builder.clusterService; + this.threadPoolMergeExecutorService = ThreadPoolMergeExecutorService.maybeCreateThreadPoolMergeExecutorService( + threadPool, + clusterService.getClusterSettings(), + nodeEnv + ); this.client = builder.client; this.featureService = builder.featureService; 
this.idFieldDataEnabled = INDICES_ID_FIELD_DATA_ENABLED_SETTING.get(clusterService.getSettings()); @@ -355,7 +363,8 @@ public void onRemoval(ShardId shardId, String fieldName, boolean wasEvicted, lon indicesFieldDataCache, cacheCleaner, indicesRequestCache, - indicesQueryCache + indicesQueryCache, + threadPoolMergeExecutorService ); } catch (IOException e) { throw new UncheckedIOException(e); @@ -781,6 +790,7 @@ private synchronized IndexService createIndexService( circuitBreakerService, bigArrays, threadPool, + threadPoolMergeExecutorService, scriptService, clusterService, client, @@ -1906,4 +1916,9 @@ public IndexScopedSettings getIndexScopedSettings() { public BigArrays getBigArrays() { return bigArrays; } + + @Nullable + public ThreadPoolMergeExecutorService getThreadPoolMergeExecutorService() { + return threadPoolMergeExecutorService; + } } diff --git a/server/src/main/java/org/elasticsearch/threadpool/DefaultBuiltInExecutorBuilders.java b/server/src/main/java/org/elasticsearch/threadpool/DefaultBuiltInExecutorBuilders.java index 9698ce6b65cdf..b8dddc20cc51d 100644 --- a/server/src/main/java/org/elasticsearch/threadpool/DefaultBuiltInExecutorBuilders.java +++ b/server/src/main/java/org/elasticsearch/threadpool/DefaultBuiltInExecutorBuilders.java @@ -13,6 +13,7 @@ import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.util.concurrent.EsExecutors; import org.elasticsearch.core.TimeValue; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler; import org.elasticsearch.threadpool.internal.BuiltInExecutorBuilders; import java.util.HashMap; @@ -141,6 +142,12 @@ public Map getBuilders(Settings settings, int allocated false ) ); + if (ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.get(settings)) { + result.put( + ThreadPool.Names.MERGE, + new ScalingExecutorBuilder(ThreadPool.Names.MERGE, 1, allocatedProcessors, TimeValue.timeValueMinutes(5), true) + ); + } result.put( ThreadPool.Names.FORCE_MERGE, new 
FixedExecutorBuilder( diff --git a/server/src/main/java/org/elasticsearch/threadpool/ThreadPool.java b/server/src/main/java/org/elasticsearch/threadpool/ThreadPool.java index 96d82793a3f4f..85ee02b6db856 100644 --- a/server/src/main/java/org/elasticsearch/threadpool/ThreadPool.java +++ b/server/src/main/java/org/elasticsearch/threadpool/ThreadPool.java @@ -134,6 +134,7 @@ public static class Names { public static final String WARMER = "warmer"; public static final String SNAPSHOT = "snapshot"; public static final String SNAPSHOT_META = "snapshot_meta"; + public static final String MERGE = "merge"; public static final String FORCE_MERGE = "force_merge"; public static final String FETCH_SHARD_STARTED = "fetch_shard_started"; public static final String FETCH_SHARD_STORE = "fetch_shard_store"; @@ -192,6 +193,7 @@ public static ThreadPoolType fromType(String type) { entry(Names.WARMER, ThreadPoolType.SCALING), entry(Names.SNAPSHOT, ThreadPoolType.SCALING), entry(Names.SNAPSHOT_META, ThreadPoolType.SCALING), + entry(Names.MERGE, ThreadPoolType.SCALING), entry(Names.FORCE_MERGE, ThreadPoolType.FIXED), entry(Names.FETCH_SHARD_STARTED, ThreadPoolType.SCALING), entry(Names.FETCH_SHARD_STORE, ThreadPoolType.SCALING), diff --git a/server/src/test/java/org/elasticsearch/index/IndexModuleTests.java b/server/src/test/java/org/elasticsearch/index/IndexModuleTests.java index c519d4834148d..81ca842f9ad65 100644 --- a/server/src/test/java/org/elasticsearch/index/IndexModuleTests.java +++ b/server/src/test/java/org/elasticsearch/index/IndexModuleTests.java @@ -37,6 +37,7 @@ import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.UUIDs; import org.elasticsearch.common.breaker.CircuitBreaker; +import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Setting; import org.elasticsearch.common.settings.Setting.Property; import org.elasticsearch.common.settings.Settings; @@ -59,6 +60,8 @@ import 
org.elasticsearch.index.engine.EngineTestCase; import org.elasticsearch.index.engine.InternalEngine; import org.elasticsearch.index.engine.InternalEngineFactory; +import org.elasticsearch.index.engine.ThreadPoolMergeExecutorService; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler; import org.elasticsearch.index.fielddata.IndexFieldDataCache; import org.elasticsearch.index.mapper.MapperMetrics; import org.elasticsearch.index.mapper.MapperRegistry; @@ -158,6 +161,7 @@ public void beforeShardFoldersDeleted(ShardId shardId, IndexSettings indexSettin }; private MapperRegistry mapperRegistry; private ThreadPool threadPool; + private ThreadPoolMergeExecutorService threadPoolMergeExecutorService; private CircuitBreakerService circuitBreakerService; private BigArrays bigArrays; private ScriptService scriptService; @@ -170,6 +174,7 @@ public void setUp() throws Exception { settings = Settings.builder() .put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current()) .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), randomBoolean()) .build(); indicesQueryCache = new IndicesQueryCache(settings); indexSettings = IndexSettingsModule.newIndexSettings("foo", settings); @@ -192,8 +197,13 @@ public void setUp() throws Exception { PageCacheRecycler pageCacheRecycler = new PageCacheRecycler(settings); bigArrays = new BigArrays(pageCacheRecycler, circuitBreakerService, CircuitBreaker.REQUEST); scriptService = new ScriptService(settings, Collections.emptyMap(), Collections.emptyMap(), () -> 1L); - clusterService = ClusterServiceUtils.createClusterService(threadPool); + clusterService = ClusterServiceUtils.createClusterService(threadPool, ClusterSettings.createBuiltInClusterSettings(settings)); nodeEnvironment = new NodeEnvironment(settings, environment); + threadPoolMergeExecutorService = ThreadPoolMergeExecutorService.maybeCreateThreadPoolMergeExecutorService( + 
threadPool, + clusterService.getClusterSettings(), + nodeEnvironment + ); mapperRegistry = new IndicesModule(Collections.emptyList()).getMapperRegistry(); indexNameExpressionResolver = TestIndexNameExpressionResolver.newInstance(threadPool.getThreadContext()); } @@ -214,6 +224,7 @@ private IndexService newIndexService(IndexModule module) throws IOException { circuitBreakerService, bigArrays, threadPool, + threadPoolMergeExecutorService, scriptService, clusterService, null, diff --git a/server/src/test/java/org/elasticsearch/index/engine/InternalEngineTests.java b/server/src/test/java/org/elasticsearch/index/engine/InternalEngineTests.java index 26de6a7897786..3f49d430a9945 100644 --- a/server/src/test/java/org/elasticsearch/index/engine/InternalEngineTests.java +++ b/server/src/test/java/org/elasticsearch/index/engine/InternalEngineTests.java @@ -2578,10 +2578,10 @@ public boolean mergeCompleted() { public void append(LogEvent event) { final String formattedMessage = event.getMessage().getFormattedMessage(); if (event.getLevel() == Level.TRACE && event.getMarker().getName().contains("[index][0]")) { - if (formattedMessage.startsWith("merge thread")) { + if (formattedMessage.startsWith("merge task")) { messages.add(formattedMessage); } else if (event.getLoggerName().endsWith(".MS") - && formattedMessage.contains("MS: merge thread") + && formattedMessage.contains("MS: merge task") && formattedMessage.endsWith("end")) { luceneMergeSchedulerEnded.set(true); } @@ -2616,14 +2616,14 @@ public void testMergeThreadLogging() throws Exception { }); assertBusy(() -> { - List threadMsgs = mockAppender.messages().stream().filter(line -> line.startsWith("merge thread")).toList(); + List threadMsgs = mockAppender.messages().stream().filter(line -> line.startsWith("merge task")).toList(); assertThat("messages:" + threadMsgs, threadMsgs.size(), greaterThanOrEqualTo(3)); assertThat( threadMsgs, containsInRelativeOrder( - matchesRegex("^merge thread .* start$"), - matchesRegex("^merge 
thread .* merge segment.*$"), - matchesRegex("^merge thread .* end$") + matchesRegex("^merge task .* start$"), + matchesRegex("^merge task .* merge segment.*$"), + matchesRegex("^merge task .* end$") ) ); assertThat(mockAppender.mergeCompleted(), is(true)); @@ -3587,6 +3587,7 @@ public void testRecoverFromForeignTranslog() throws IOException { EngineConfig brokenConfig = new EngineConfig( shardId, threadPool, + threadPoolMergeExecutorService, config.getIndexSettings(), null, store, @@ -7149,6 +7150,7 @@ public void testNotWarmUpSearcherInEngineCtor() throws Exception { EngineConfig configWithWarmer = new EngineConfig( config.getShardId(), config.getThreadPool(), + config.getThreadPoolMergeExecutorService(), config.getIndexSettings(), warmer, store, diff --git a/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceDiskSpaceTests.java b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceDiskSpaceTests.java new file mode 100644 index 0000000000000..97943101758fe --- /dev/null +++ b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceDiskSpaceTests.java @@ -0,0 +1,1023 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.index.engine; + +import org.apache.lucene.tests.mockfile.FilterFileSystemProvider; +import org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings; +import org.elasticsearch.common.settings.ClusterSettings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.unit.ByteSizeValue; +import org.elasticsearch.common.util.concurrent.EsExecutors; +import org.elasticsearch.core.PathUtils; +import org.elasticsearch.core.PathUtilsForTesting; +import org.elasticsearch.core.TimeValue; +import org.elasticsearch.core.Tuple; +import org.elasticsearch.env.Environment; +import org.elasticsearch.env.NodeEnvironment; +import org.elasticsearch.env.TestEnvironment; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.threadpool.TestThreadPool; +import org.elasticsearch.threadpool.ThreadPool; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.BeforeClass; + +import java.io.IOException; +import java.nio.file.FileStore; +import java.nio.file.FileSystem; +import java.nio.file.Path; +import java.nio.file.attribute.FileAttributeView; +import java.nio.file.attribute.FileStoreAttributeView; +import java.nio.file.spi.FileSystemProvider; +import java.util.ArrayList; +import java.util.IdentityHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.Executor; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; + +import static org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule.ABORT; +import static org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule.BACKLOG; +import static org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule.RUN; +import static org.hamcrest.Matchers.greaterThanOrEqualTo; +import static org.hamcrest.Matchers.is; +import static 
org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +public class ThreadPoolMergeExecutorServiceDiskSpaceTests extends ESTestCase { + + private static TestMockFileStore aFileStore = new TestMockFileStore("mocka"); + private static TestMockFileStore bFileStore = new TestMockFileStore("mockb"); + private static String aPathPart; + private static String bPathPart; + private static int mergeExecutorThreadCount; + private static Settings settings; + private static TestCapturingThreadPool testThreadPool; + private static NodeEnvironment nodeEnvironment; + + @BeforeClass + public static void installMockUsableSpaceFS() throws Exception { + FileSystem current = PathUtils.getDefaultFileSystem(); + aPathPart = "a-" + randomUUID(); + bPathPart = "b-" + randomUUID(); + FileSystemProvider mock = new TestMockUsableSpaceFileSystemProvider(current); + PathUtilsForTesting.installMock(mock.getFileSystem(null)); + Path path = PathUtils.get(createTempDir().toString()); + // use 2 data paths + String[] paths = new String[] { path.resolve(aPathPart).toString(), path.resolve(bPathPart).toString() }; + // some tests hold one merge thread blocked, and need at least one other runnable + mergeExecutorThreadCount = randomIntBetween(2, 9); + Settings.Builder settingsBuilder = Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), path) + .putList(Environment.PATH_DATA_SETTING.getKey(), paths) + // the default of "5s" slows down testing + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "50ms") + .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount); + if (randomBoolean()) { + settingsBuilder.put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), true); + } + settings = settingsBuilder.build(); + testThreadPool = new TestCapturingThreadPool("test", 
settings); + nodeEnvironment = new NodeEnvironment(settings, TestEnvironment.newEnvironment(settings)); + } + + @AfterClass + public static void removeMockUsableSpaceFS() { + PathUtilsForTesting.teardown(); + aFileStore = null; + bFileStore = null; + testThreadPool.close(); + nodeEnvironment.close(); + } + + @After + public void cleanupThreadPool() { + testThreadPool.scheduledTasks.clear(); + } + + static class TestCapturingThreadPool extends TestThreadPool { + final List> scheduledTasks = new ArrayList<>(); + + TestCapturingThreadPool(String name, Settings settings) { + super(name, settings); + } + + @Override + public Cancellable scheduleWithFixedDelay(Runnable command, TimeValue interval, Executor executor) { + Cancellable cancellable = super.scheduleWithFixedDelay(command, interval, executor); + scheduledTasks.add(new Tuple<>(interval, cancellable)); + return cancellable; + } + } + + static class TestMockUsableSpaceFileSystemProvider extends FilterFileSystemProvider { + + TestMockUsableSpaceFileSystemProvider(FileSystem inner) { + super("mockusablespace://", inner); + } + + @Override + public FileStore getFileStore(Path path) { + if (path.toString().contains(path.getFileSystem().getSeparator() + aPathPart)) { + return aFileStore; + } else { + assert path.toString().contains(path.getFileSystem().getSeparator() + bPathPart); + return bFileStore; + } + } + } + + static class TestMockFileStore extends FileStore { + + public volatile long totalSpace; + public volatile long freeSpace; + public volatile long usableSpace; + public volatile boolean throwIoException; + + private final String desc; + + TestMockFileStore(String desc) { + this.desc = desc; + } + + @Override + public String type() { + return "mock"; + } + + @Override + public String name() { + return desc; + } + + @Override + public String toString() { + return desc; + } + + @Override + public boolean isReadOnly() { + return false; + } + + @Override + public long getTotalSpace() throws IOException { + if 
(throwIoException) { + throw new IOException("Test IO Exception"); + } + return totalSpace; + } + + @Override + public long getUnallocatedSpace() throws IOException { + if (throwIoException) { + throw new IOException("Test IO Exception"); + } + return freeSpace; + } + + @Override + public long getUsableSpace() throws IOException { + if (throwIoException) { + throw new IOException("Test IO Exception"); + } + return usableSpace; + } + + @Override + public boolean supportsFileAttributeView(Class type) { + return false; + } + + @Override + public boolean supportsFileAttributeView(String name) { + return false; + } + + @Override + public V getFileStoreAttributeView(Class type) { + return null; + } + + @Override + public Object getAttribute(String attribute) { + return null; + } + } + + public void testAvailableDiskSpaceMonitorWithDefaultSettings() throws Exception { + // path "a" has lots of free space, and "b" has little + aFileStore.usableSpace = 100_000L; + aFileStore.totalSpace = aFileStore.usableSpace * 2; + bFileStore.usableSpace = 1_000L; + bFileStore.totalSpace = bFileStore.usableSpace * 2; + LinkedHashSet availableDiskSpaceUpdates = new LinkedHashSet<>(); + try ( + var diskSpacePeriodicMonitor = ThreadPoolMergeExecutorService.startDiskSpaceMonitoring( + testThreadPool, + nodeEnvironment.dataPaths(), + ClusterSettings.createBuiltInClusterSettings(settings), + (availableDiskSpace) -> { + synchronized (availableDiskSpaceUpdates) { + availableDiskSpaceUpdates.add(availableDiskSpace); + } + } + ) + ) { + assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) { + assertThat(availableDiskSpaceUpdates.size(), is(1)); + // 100_000 (available) - 5% (default flood stage level) * 200_000 (total space) + assertThat(availableDiskSpaceUpdates.getLast().getBytes(), is(90_000L)); + } + }); + // "b" now has more available space + bFileStore.usableSpace = 110_000L; + bFileStore.totalSpace = 130_000L; + assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) { + 
assertThat(availableDiskSpaceUpdates.size(), is(2)); + // 110_000 (available) - 5% (default flood stage level) * 130_000 (total space) + assertThat(availableDiskSpaceUpdates.getLast().getBytes(), is(103_500L)); + } + }); + // available space for "a" and "b" is below the limit => it's clamped down to "0" + aFileStore.usableSpace = 100L; + bFileStore.usableSpace = 1_000L; + assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) { + assertThat(availableDiskSpaceUpdates.size(), is(3)); + // 1_000 (available) - 5% (default flood stage level) * 130_000 (total space) < 0 + assertThat(availableDiskSpaceUpdates.getLast().getBytes(), is(0L)); + } + }); + } + } + + public void testDiskSpaceMonitorStartsAsDisabled() throws Exception { + aFileStore.usableSpace = randomLongBetween(1L, 100L); + aFileStore.totalSpace = randomLongBetween(1L, 100L); + aFileStore.throwIoException = randomBoolean(); + bFileStore.usableSpace = randomLongBetween(1L, 100L); + bFileStore.totalSpace = randomLongBetween(1L, 100L); + bFileStore.throwIoException = randomBoolean(); + Settings.Builder settingsBuilder = Settings.builder().put(settings); + if (randomBoolean()) { + settingsBuilder.put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0"); + } else { + settingsBuilder.put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s"); + } + Settings settings = settingsBuilder.build(); + ClusterSettings clusterSettings = ClusterSettings.createBuiltInClusterSettings(settings); + LinkedHashSet availableDiskSpaceUpdates = new LinkedHashSet<>(); + try ( + var diskSpacePeriodicMonitor = ThreadPoolMergeExecutorService.startDiskSpaceMonitoring( + testThreadPool, + nodeEnvironment.dataPaths(), + clusterSettings, + (availableDiskSpace) -> { + synchronized (availableDiskSpaceUpdates) { + availableDiskSpaceUpdates.add(availableDiskSpace); + } + } + ) + ) { + assertThat(diskSpacePeriodicMonitor.isScheduled(), is(false)); + 
assertThat(availableDiskSpaceUpdates.size(), is(1)); + assertThat(availableDiskSpaceUpdates.getLast().getBytes(), is(Long.MAX_VALUE)); + // updating monitoring interval should enable the monitor + String intervalSettingValue = randomFrom("1s", "123ms", "5nanos", "2h"); + clusterSettings.applySettings( + Settings.builder() + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), intervalSettingValue) + .build() + ); + assertThat(diskSpacePeriodicMonitor.isScheduled(), is(true)); + assertThat(testThreadPool.scheduledTasks.size(), is(1)); + assertThat( + testThreadPool.scheduledTasks.getLast().v1(), + is( + TimeValue.parseTimeValue( + intervalSettingValue, + ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey() + ) + ) + ); + } + aFileStore.throwIoException = false; + bFileStore.throwIoException = false; + } + + public void testAvailableDiskSpaceMonitorWhenFileSystemStatErrors() throws Exception { + aFileStore.usableSpace = randomLongBetween(1L, 100L); + aFileStore.totalSpace = randomLongBetween(1L, 100L); + bFileStore.usableSpace = randomLongBetween(1L, 100L); + bFileStore.totalSpace = randomLongBetween(1L, 100L); + boolean aErrorsFirst = randomBoolean(); + if (aErrorsFirst) { + // the "a" file system will error when collecting stats + aFileStore.throwIoException = true; + bFileStore.throwIoException = false; + } else { + aFileStore.throwIoException = false; + bFileStore.throwIoException = true; + } + LinkedHashSet availableDiskSpaceUpdates = new LinkedHashSet<>(); + try ( + var diskSpacePeriodicMonitor = ThreadPoolMergeExecutorService.startDiskSpaceMonitoring( + testThreadPool, + nodeEnvironment.dataPaths(), + ClusterSettings.createBuiltInClusterSettings(settings), + (availableDiskSpace) -> { + synchronized (availableDiskSpaceUpdates) { + availableDiskSpaceUpdates.add(availableDiskSpace); + } + } + ) + ) { + assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) { + 
assertThat(availableDiskSpaceUpdates.size(), is(1)); + if (aErrorsFirst) { + // uses the stats from "b" + assertThat( + availableDiskSpaceUpdates.getLast().getBytes(), + // the default 5% (same as flood stage level) + is(Math.max(bFileStore.usableSpace - bFileStore.totalSpace / 20, 0L)) + ); + } else { + // uses the stats from "a" + assertThat( + availableDiskSpaceUpdates.getLast().getBytes(), + // the default 5% (same as flood stage level) + is(Math.max(aFileStore.usableSpace - aFileStore.totalSpace / 20, 0L)) + ); + } + } + }); + if (aErrorsFirst) { + // the "b" file system will also now error when collecting stats + bFileStore.throwIoException = true; + } else { + // the "a" file system will also now error when collecting stats + aFileStore.throwIoException = true; + } + assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) { + assertThat(availableDiskSpaceUpdates.size(), is(2)); + // consider the available disk space as unlimited when no fs stats can be collected + assertThat(availableDiskSpaceUpdates.getLast().getBytes(), is(Long.MAX_VALUE)); + } + }); + if (aErrorsFirst) { + // "a" fs stats collection recovered + aFileStore.throwIoException = false; + } else { + // "b" fs stats collection recovered + bFileStore.throwIoException = false; + } + assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) { + assertThat(availableDiskSpaceUpdates.size(), is(3)); + if (aErrorsFirst) { + // uses the stats from "a" + assertThat( + availableDiskSpaceUpdates.getLast().getBytes(), + // the default 5% (same as flood stage level) + is(Math.max(aFileStore.usableSpace - aFileStore.totalSpace / 20, 0L)) + ); + } else { + // uses the stats from "b" + assertThat( + availableDiskSpaceUpdates.getLast().getBytes(), + // the default 5% (same as flood stage level) + is(Math.max(bFileStore.usableSpace - bFileStore.totalSpace / 20, 0L)) + ); + } + } + }); + } + aFileStore.throwIoException = false; + bFileStore.throwIoException = false; + } + + public void 
testAvailableDiskSpaceMonitorSettingsUpdate() throws Exception { + ClusterSettings clusterSettings = ClusterSettings.createBuiltInClusterSettings(settings); + // path "b" has more usable (available) space, but path "a" has more total space + aFileStore.usableSpace = 900_000L; + aFileStore.totalSpace = 1_200_000L; + bFileStore.usableSpace = 1_000_000L; + bFileStore.totalSpace = 1_100_000L; + LinkedHashSet availableDiskSpaceUpdates = new LinkedHashSet<>(); + try ( + var diskSpacePeriodicMonitor = ThreadPoolMergeExecutorService.startDiskSpaceMonitoring( + testThreadPool, + nodeEnvironment.dataPaths(), + clusterSettings, + (availableDiskSpace) -> { + synchronized (availableDiskSpaceUpdates) { + availableDiskSpaceUpdates.add(availableDiskSpace); + } + } + ) + ) { + assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) { + assertThat(availableDiskSpaceUpdates.size(), is(1)); + // 1_000_000 (available) - 5% (default flood stage level) * 1_100_000 (total space) + assertThat(availableDiskSpaceUpdates.getLast().getBytes(), is(945_000L)); + } + }, 5, TimeUnit.SECONDS); + // update the ratio for the watermark + clusterSettings.applySettings( + Settings.builder().put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING.getKey(), "90%").build() + ); + assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) { + assertThat(availableDiskSpaceUpdates.size(), is(2)); + // 1_000_000 (available) - 10% (indices.merge.disk.watermark.high) * 1_100_000 (total space) + assertThat(availableDiskSpaceUpdates.getLast().getBytes(), is(890_000L)); + } + }, 5, TimeUnit.SECONDS); + // absolute value for the watermark limit + clusterSettings.applySettings( + Settings.builder().put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING.getKey(), "3000b").build() + ); + assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) { + assertThat(availableDiskSpaceUpdates.size(), is(3)); + // 1_000_000 (available) - 3_000 
(indices.merge.disk.watermark.high) + assertThat(availableDiskSpaceUpdates.getLast().getBytes(), is(997_000L)); + } + }, 5, TimeUnit.SECONDS); + // headroom value that takes priority over the watermark + clusterSettings.applySettings( + Settings.builder() + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING.getKey(), "50%") + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_HIGH_MAX_HEADROOM_SETTING.getKey(), "11111b") + .build() + ); + assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) { + assertThat(availableDiskSpaceUpdates.size(), is(4)); + // 1_000_000 (available) - 11_111 (indices.merge.disk.watermark.high) + assertThat(availableDiskSpaceUpdates.getLast().getBytes(), is(988_889L)); + } + }, 5, TimeUnit.SECONDS); + // watermark limit that takes priority over the headroom + clusterSettings.applySettings( + Settings.builder() + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING.getKey(), "98%") + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_HIGH_MAX_HEADROOM_SETTING.getKey(), "22222b") + .build() + ); + assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) { + assertThat(availableDiskSpaceUpdates.size(), is(5)); + // 1_000_000 (available) - 2% (indices.merge.disk.watermark.high) * 1_100_000 (total space) + assertThat(availableDiskSpaceUpdates.getLast().getBytes(), is(978_000L)); + } + }, 5, TimeUnit.SECONDS); + // headroom takes priority over the default watermark of 95% + clusterSettings.applySettings( + Settings.builder() + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_HIGH_MAX_HEADROOM_SETTING.getKey(), "22222b") + .build() + ); + assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) { + assertThat(availableDiskSpaceUpdates.size(), is(6)); + // 1_000_000 (available) - 22_222 + assertThat(availableDiskSpaceUpdates.getLast().getBytes(), is(977_778L)); + } + }, 5, TimeUnit.SECONDS); + // watermark from routing allocation takes priority + 
clusterSettings.applySettings( + Settings.builder() + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "99%") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), "2b") + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_HIGH_MAX_HEADROOM_SETTING.getKey(), "22222b") + .build() + ); + assertBusy(() -> { + synchronized (availableDiskSpaceUpdates) { + assertThat(availableDiskSpaceUpdates.size(), is(7)); + // 1_000_000 (available) - 1% (cluster.routing.allocation.disk.watermark.flood_stage) * 1_100_000 (total space) + assertThat(availableDiskSpaceUpdates.getLast().getBytes(), is(989_000L)); + } + }, 5, TimeUnit.SECONDS); + } + } + + public void testAbortingOrRunningMergeTaskHoldsUpBudget() throws Exception { + aFileStore.totalSpace = randomLongBetween(1_000L, 10_000L); + bFileStore.totalSpace = randomLongBetween(1_000L, 10_000L); + aFileStore.usableSpace = randomLongBetween(900L, aFileStore.totalSpace); + bFileStore.usableSpace = randomLongBetween(900L, bFileStore.totalSpace); + boolean aHasMoreSpace = aFileStore.usableSpace > bFileStore.usableSpace; + try ( + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = ThreadPoolMergeExecutorService + .maybeCreateThreadPoolMergeExecutorService( + testThreadPool, + ClusterSettings.createBuiltInClusterSettings(settings), + nodeEnvironment + ) + ) { + assert threadPoolMergeExecutorService != null; + assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), greaterThanOrEqualTo(1)); + // assumes the 5% default value for the remaining space watermark + final long availableInitialBudget = aHasMoreSpace + ? 
aFileStore.usableSpace - aFileStore.totalSpace / 20 + : bFileStore.usableSpace - bFileStore.totalSpace / 20; + final AtomicLong expectedAvailableBudget = new AtomicLong(availableInitialBudget); + // wait for the merge scheduler to learn about the available disk space + assertBusy( + () -> assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(expectedAvailableBudget.get())) + ); + ThreadPoolMergeScheduler.MergeTask stallingMergeTask = mock(ThreadPoolMergeScheduler.MergeTask.class); + long taskBudget = randomLongBetween(1L, expectedAvailableBudget.get()); + when(stallingMergeTask.estimatedRemainingMergeSize()).thenReturn(taskBudget); + when(stallingMergeTask.schedule()).thenReturn(randomFrom(RUN, ABORT)); + CountDownLatch testDoneLatch = new CountDownLatch(1); + doAnswer(mock -> { + // wait to be signalled before completing (this holds up budget) + testDoneLatch.await(); + return null; + }).when(stallingMergeTask).run(); + doAnswer(mock -> { + // wait to be signalled before completing (this holds up budget) + testDoneLatch.await(); + return null; + }).when(stallingMergeTask).abort(); + threadPoolMergeExecutorService.submitMergeTask(stallingMergeTask); + // assert the merge task is holding up disk space budget + expectedAvailableBudget.set(expectedAvailableBudget.get() - taskBudget); + assertBusy( + () -> assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(expectedAvailableBudget.get())) + ); + // double check that submitting a runnable merge task under budget works correctly + ThreadPoolMergeScheduler.MergeTask mergeTask = mock(ThreadPoolMergeScheduler.MergeTask.class); + when(mergeTask.estimatedRemainingMergeSize()).thenReturn(randomLongBetween(0L, expectedAvailableBudget.get())); + when(mergeTask.schedule()).thenReturn(RUN); + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + assertBusy(() -> { + verify(mergeTask).schedule(); + verify(mergeTask).run(); + verify(mergeTask, 
times(0)).abort(); + }); + // let the test finish + testDoneLatch.countDown(); + assertBusy(() -> { + // available budget is back to the initial value + assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(availableInitialBudget)); + if (stallingMergeTask.schedule() == RUN) { + verify(stallingMergeTask).run(); + verify(stallingMergeTask, times(0)).abort(); + } else { + verify(stallingMergeTask).abort(); + verify(stallingMergeTask, times(0)).run(); + } + assertThat(threadPoolMergeExecutorService.allDone(), is(true)); + }); + } + } + + public void testBackloggedMergeTasksDoNotHoldUpBudget() throws Exception { + aFileStore.totalSpace = randomLongBetween(1_000L, 10_000L); + bFileStore.totalSpace = randomLongBetween(1_000L, 10_000L); + aFileStore.usableSpace = randomLongBetween(900L, aFileStore.totalSpace); + bFileStore.usableSpace = randomLongBetween(900L, bFileStore.totalSpace); + boolean aHasMoreSpace = aFileStore.usableSpace > bFileStore.usableSpace; + try ( + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = ThreadPoolMergeExecutorService + .maybeCreateThreadPoolMergeExecutorService( + testThreadPool, + ClusterSettings.createBuiltInClusterSettings(settings), + nodeEnvironment + ) + ) { + assert threadPoolMergeExecutorService != null; + assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), greaterThanOrEqualTo(1)); + // assumes the 5% default value for the remaining space watermark + final long availableInitialBudget = aHasMoreSpace + ? 
aFileStore.usableSpace - aFileStore.totalSpace / 20 + : bFileStore.usableSpace - bFileStore.totalSpace / 20; + final AtomicLong expectedAvailableBudget = new AtomicLong(availableInitialBudget); + assertBusy( + () -> assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(expectedAvailableBudget.get())) + ); + long backloggedMergeTaskDiskSpaceBudget = randomLongBetween(1L, expectedAvailableBudget.get()); + CountDownLatch testDoneLatch = new CountDownLatch(1); + // take care that there's still at least one thread available to run merges + int maxBlockingTasksToSubmit = mergeExecutorThreadCount - 1; + // first maybe submit some running or aborting merge tasks that hold up some budget while running or aborting + List runningMergeTasks = new ArrayList<>(); + List abortingMergeTasks = new ArrayList<>(); + while (expectedAvailableBudget.get() - backloggedMergeTaskDiskSpaceBudget > 0L + && maxBlockingTasksToSubmit-- > 0 + && randomBoolean()) { + ThreadPoolMergeScheduler.MergeTask mergeTask = mock(ThreadPoolMergeScheduler.MergeTask.class); + long taskBudget = randomLongBetween(1L, expectedAvailableBudget.get() - backloggedMergeTaskDiskSpaceBudget); + when(mergeTask.estimatedRemainingMergeSize()).thenReturn(taskBudget); + when(mergeTask.schedule()).thenReturn(randomFrom(RUN, ABORT)); + // this task runs/aborts, and it's going to hold up some budget for it + expectedAvailableBudget.set(expectedAvailableBudget.get() - taskBudget); + // this task will hold up budget because it blocks when it runs (to simulate it running for a long time) + doAnswer(mock -> { + // wait to be signalled before completing (this holds up budget) + testDoneLatch.await(); + return null; + }).when(mergeTask).run(); + doAnswer(mock -> { + // wait to be signalled before completing (this holds up budget) + testDoneLatch.await(); + return null; + }).when(mergeTask).abort(); + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + if (mergeTask.schedule() == RUN) { + 
runningMergeTasks.add(mergeTask); + } else { + abortingMergeTasks.add(mergeTask); + } + } + assertBusy( + () -> assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(expectedAvailableBudget.get())) + ); + // submit some backlogging merge tasks which should NOT hold up any budget + IdentityHashMap backloggingMergeTasksScheduleCountMap = new IdentityHashMap<>(); + int backloggingTaskCount = randomIntBetween(1, 10); + while (backloggingTaskCount-- > 0) { + ThreadPoolMergeScheduler.MergeTask mergeTask = mock(ThreadPoolMergeScheduler.MergeTask.class); + long taskBudget = randomLongBetween(1L, backloggedMergeTaskDiskSpaceBudget); + when(mergeTask.estimatedRemainingMergeSize()).thenReturn(taskBudget); + doAnswer(mock -> { + // task always backlogs (as long as the test hasn't finished) + if (testDoneLatch.getCount() > 0) { + return BACKLOG; + } else { + return RUN; + } + }).when(mergeTask).schedule(); + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + backloggingMergeTasksScheduleCountMap.put(mergeTask, 1); + } + int checkRounds = randomIntBetween(1, 10); + // assert all backlogging merge tasks have been scheduled while possibly re-enqueued, + // BUT none run and none aborted, AND the available budget is left unchanged + while (true) { + assertBusy(() -> { + for (ThreadPoolMergeScheduler.MergeTask mergeTask : backloggingMergeTasksScheduleCountMap.keySet()) { + verify(mergeTask, times(backloggingMergeTasksScheduleCountMap.get(mergeTask))).schedule(); + } + for (ThreadPoolMergeScheduler.MergeTask mergeTask : backloggingMergeTasksScheduleCountMap.keySet()) { + verify(mergeTask, times(0)).run(); + verify(mergeTask, times(0)).abort(); + } + // budget hasn't changed! 
+ assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(expectedAvailableBudget.get())); + }); + if (checkRounds-- <= 0) { + break; + } + // maybe re-enqueue backlogged merge task + for (ThreadPoolMergeScheduler.MergeTask backlogged : backloggingMergeTasksScheduleCountMap.keySet()) { + if (randomBoolean()) { + threadPoolMergeExecutorService.reEnqueueBackloggedMergeTask(backlogged); + backloggingMergeTasksScheduleCountMap.put(backlogged, backloggingMergeTasksScheduleCountMap.get(backlogged) + 1); + } + } + // double check that submitting a runnable merge task under budget works correctly + ThreadPoolMergeScheduler.MergeTask mergeTask = mock(ThreadPoolMergeScheduler.MergeTask.class); + long taskBudget = randomLongBetween(1L, backloggedMergeTaskDiskSpaceBudget); + when(mergeTask.estimatedRemainingMergeSize()).thenReturn(taskBudget); + when(mergeTask.schedule()).thenReturn(RUN); + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + assertBusy(() -> { + verify(mergeTask).schedule(); + verify(mergeTask).run(); + verify(mergeTask, times(0)).abort(); + }); + } + // let the test finish + testDoneLatch.countDown(); + for (ThreadPoolMergeScheduler.MergeTask backlogged : backloggingMergeTasksScheduleCountMap.keySet()) { + threadPoolMergeExecutorService.reEnqueueBackloggedMergeTask(backlogged); + } + assertBusy(() -> { + for (ThreadPoolMergeScheduler.MergeTask mergeTask : runningMergeTasks) { + verify(mergeTask).run(); + } + for (ThreadPoolMergeScheduler.MergeTask mergeTask : abortingMergeTasks) { + verify(mergeTask).abort(); + } + for (ThreadPoolMergeScheduler.MergeTask backlogged : backloggingMergeTasksScheduleCountMap.keySet()) { + verify(backlogged).run(); + } + // available budget is restored + assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(availableInitialBudget)); + assertThat(threadPoolMergeExecutorService.allDone(), is(true)); + }); + } + } + + public void 
testUnavailableBudgetBlocksNewMergeTasksFromStartingExecution() throws Exception { + aFileStore.totalSpace = 150_000L; + bFileStore.totalSpace = 140_000L; + boolean aHasMoreSpace = randomBoolean(); + if (aHasMoreSpace) { + // "a" has more available space + aFileStore.usableSpace = 120_000L; + bFileStore.usableSpace = 100_000L; + } else { + // "b" has more available space + aFileStore.usableSpace = 90_000L; + bFileStore.usableSpace = 110_000L; + } + try ( + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = ThreadPoolMergeExecutorService + .maybeCreateThreadPoolMergeExecutorService( + testThreadPool, + ClusterSettings.createBuiltInClusterSettings(settings), + nodeEnvironment + ) + ) { + assert threadPoolMergeExecutorService != null; + // wait for the budget to be updated from the available disk space + AtomicLong expectedAvailableBudget = new AtomicLong(); + assertBusy(() -> { + if (aHasMoreSpace) { + // 120_000L (available) - 5% (default flood stage level) * 150_000L (total) + assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(112_500L)); + expectedAvailableBudget.set(112_500L); + } else { + // 110_000L (available) - 5% (default flood stage level) * 140_000L (total) + assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(103_000L)); + expectedAvailableBudget.set(103_000L); + } + }); + List runningOrAbortingMergeTasksList = new ArrayList<>(); + List latchesBlockingMergeTasksList = new ArrayList<>(); + int submittedMergesCount = randomIntBetween(1, mergeExecutorThreadCount - 1); + // submit merge tasks that don't finish, in order to deplete the available budget + while (submittedMergesCount > 0 && expectedAvailableBudget.get() > 0L) { + ThreadPoolMergeScheduler.MergeTask mergeTask = mock(ThreadPoolMergeScheduler.MergeTask.class); + when(mergeTask.supportsIOThrottling()).thenReturn(randomBoolean()); + doAnswer(mock -> { + Schedule schedule = randomFrom(Schedule.values()); + if (schedule == 
BACKLOG) { + testThreadPool.executor(ThreadPool.Names.GENERIC).execute(() -> { + // re-enqueue backlogged merge task + threadPoolMergeExecutorService.reEnqueueBackloggedMergeTask(mergeTask); + }); + } + return schedule; + }).when(mergeTask).schedule(); + // let some task complete, which will NOT hold up any budget + if (randomBoolean()) { + // this task will NOT hold up any budget because it runs quickly (it is not blocked) + when(mergeTask.estimatedRemainingMergeSize()).thenReturn(randomLongBetween(1_000L, 10_000L)); + } else { + CountDownLatch blockMergeTaskLatch = new CountDownLatch(1); + long taskBudget = randomLongBetween(1L, expectedAvailableBudget.get()); + when(mergeTask.estimatedRemainingMergeSize()).thenReturn(taskBudget); + expectedAvailableBudget.set(expectedAvailableBudget.get() - taskBudget); + submittedMergesCount--; + // this task will hold up budget because it blocks when it runs (to simulate it running for a long time) + doAnswer(mock -> { + // wait to be signalled before completing (this holds up budget) + blockMergeTaskLatch.await(); + return null; + }).when(mergeTask).run(); + doAnswer(mock -> { + // wait to be signalled before completing (this holds up budget) + blockMergeTaskLatch.await(); + return null; + }).when(mergeTask).abort(); + runningOrAbortingMergeTasksList.add(mergeTask); + latchesBlockingMergeTasksList.add(blockMergeTaskLatch); + } + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + } + // currently running (or aborting) merge tasks have consumed some of the available budget + while (runningOrAbortingMergeTasksList.isEmpty() == false) { + assertBusy( + () -> assertThat( + threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), + is(expectedAvailableBudget.get()) + ) + ); + ThreadPoolMergeScheduler.MergeTask mergeTask1 = mock(ThreadPoolMergeScheduler.MergeTask.class); + when(mergeTask1.supportsIOThrottling()).thenReturn(randomBoolean()); + when(mergeTask1.schedule()).thenReturn(RUN); + 
ThreadPoolMergeScheduler.MergeTask mergeTask2 = mock(ThreadPoolMergeScheduler.MergeTask.class); + when(mergeTask2.supportsIOThrottling()).thenReturn(randomBoolean()); + when(mergeTask2.schedule()).thenReturn(RUN); + boolean task1Runs = randomBoolean(); + long currentAvailableBudget = expectedAvailableBudget.get(); + long overBudget = randomLongBetween(currentAvailableBudget + 1L, currentAvailableBudget + 100L); + long underBudget = randomLongBetween(0L, currentAvailableBudget); + if (task1Runs) { + // merge task 1 can run because it is under budget + when(mergeTask1.estimatedRemainingMergeSize()).thenReturn(underBudget); + // merge task 2 cannot run because it is over budget + when(mergeTask2.estimatedRemainingMergeSize()).thenReturn(overBudget); + } else { + // merge task 1 cannot run because it is over budget + when(mergeTask1.estimatedRemainingMergeSize()).thenReturn(overBudget); + // merge task 2 can run because it is under budget + when(mergeTask2.estimatedRemainingMergeSize()).thenReturn(underBudget); + } + threadPoolMergeExecutorService.submitMergeTask(mergeTask1); + threadPoolMergeExecutorService.submitMergeTask(mergeTask2); + assertBusy(() -> { + if (task1Runs) { + verify(mergeTask1).schedule(); + verify(mergeTask1).run(); + verify(mergeTask2, times(0)).schedule(); + verify(mergeTask2, times(0)).run(); + } else { + verify(mergeTask2).schedule(); + verify(mergeTask2).run(); + verify(mergeTask1, times(0)).schedule(); + verify(mergeTask1, times(0)).run(); + } + }); + // let one task finish from the bunch that is holding up budget + int index = randomIntBetween(0, runningOrAbortingMergeTasksList.size() - 1); + latchesBlockingMergeTasksList.remove(index).countDown(); + ThreadPoolMergeScheduler.MergeTask completedMergeTask = runningOrAbortingMergeTasksList.remove(index); + // update the expected budget given that one task now finished + expectedAvailableBudget.set(expectedAvailableBudget.get() + completedMergeTask.estimatedRemainingMergeSize()); + } + // let the 
test finish cleanly + assertBusy(() -> { + assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(aHasMoreSpace ? 112_500L : 103_000L)); + assertThat(threadPoolMergeExecutorService.allDone(), is(true)); + }); + } + } + + public void testMergeTasksAreUnblockedWhenMoreDiskSpaceBecomesAvailable() throws Exception { + aFileStore.totalSpace = randomLongBetween(300L, 1_000L); + bFileStore.totalSpace = randomLongBetween(300L, 1_000L); + long grantedUsableSpaceBuffer = randomLongBetween(10L, 50L); + aFileStore.usableSpace = randomLongBetween(200L, aFileStore.totalSpace - grantedUsableSpaceBuffer); + bFileStore.usableSpace = randomLongBetween(200L, bFileStore.totalSpace - grantedUsableSpaceBuffer); + boolean aHasMoreSpace = aFileStore.usableSpace > bFileStore.usableSpace; + Settings.Builder settingsBuilder = Settings.builder().put(settings); + // change the watermark level, just for coverage and it's easier with the calculations + if (randomBoolean()) { + settingsBuilder.put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_HIGH_WATERMARK_SETTING.getKey(), "90%"); + } else { + settingsBuilder.put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "90%"); + } + try ( + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = ThreadPoolMergeExecutorService + .maybeCreateThreadPoolMergeExecutorService( + testThreadPool, + ClusterSettings.createBuiltInClusterSettings(settingsBuilder.build()), + nodeEnvironment + ) + ) { + assert threadPoolMergeExecutorService != null; + assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), greaterThanOrEqualTo(1)); + // uses the 10% watermark limit + final long availableInitialBudget = aHasMoreSpace + ? 
aFileStore.usableSpace - aFileStore.totalSpace / 10 + : bFileStore.usableSpace - bFileStore.totalSpace / 10; + final AtomicLong expectedAvailableBudget = new AtomicLong(availableInitialBudget); + assertBusy( + () -> assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(expectedAvailableBudget.get())) + ); + // maybe let some merge tasks hold up some budget + // take care that there's still at least one thread available to run merges + int maxBlockingTasksToSubmit = mergeExecutorThreadCount - 1; + // first maybe submit some running or aborting merge tasks that hold up some budget while running or aborting + List runningMergeTasks = new ArrayList<>(); + List abortingMergeTasks = new ArrayList<>(); + CountDownLatch testDoneLatch = new CountDownLatch(1); + while (expectedAvailableBudget.get() > 0L && maxBlockingTasksToSubmit-- > 0 && randomBoolean()) { + ThreadPoolMergeScheduler.MergeTask mergeTask = mock(ThreadPoolMergeScheduler.MergeTask.class); + long taskBudget = randomLongBetween(1L, expectedAvailableBudget.get()); + when(mergeTask.estimatedRemainingMergeSize()).thenReturn(taskBudget); + when(mergeTask.schedule()).thenReturn(randomFrom(RUN, ABORT)); + // this task runs/aborts, and it's going to hold up some budget for it + expectedAvailableBudget.set(expectedAvailableBudget.get() - taskBudget); + // this task will hold up budget because it blocks when it runs (to simulate it running for a long time) + doAnswer(mock -> { + // wait to be signalled before completing (this holds up budget) + testDoneLatch.await(); + return null; + }).when(mergeTask).run(); + doAnswer(mock -> { + // wait to be signalled before completing (this holds up budget) + testDoneLatch.await(); + return null; + }).when(mergeTask).abort(); + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + if (mergeTask.schedule() == RUN) { + runningMergeTasks.add(mergeTask); + } else { + abortingMergeTasks.add(mergeTask); + } + } + assertBusy(() -> { + 
assertThat(threadPoolMergeExecutorService.getMergeTasksQueueLength(), is(0)); + assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(expectedAvailableBudget.get())); + }); + // send some runnable merge tasks that although runnable are currently over budget + int overBudgetTaskCount = randomIntBetween(1, 5); + List overBudgetTasksToRunList = new ArrayList<>(); + List overBudgetTasksToAbortList = new ArrayList<>(); + while (overBudgetTaskCount-- > 0) { + ThreadPoolMergeScheduler.MergeTask mergeTask = mock(ThreadPoolMergeScheduler.MergeTask.class); + // currently over-budget + long taskBudget = randomLongBetween( + expectedAvailableBudget.get() + 1L, + expectedAvailableBudget.get() + grantedUsableSpaceBuffer + ); + when(mergeTask.estimatedRemainingMergeSize()).thenReturn(taskBudget); + Schedule schedule = randomFrom(RUN, ABORT); + when(mergeTask.schedule()).thenReturn(schedule); + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + if (schedule == RUN) { + overBudgetTasksToRunList.add(mergeTask); + } else { + overBudgetTasksToAbortList.add(mergeTask); + } + } + // over-budget tasks did not run, are enqueued, and budget is unchanged + assertBusy(() -> { + for (ThreadPoolMergeScheduler.MergeTask mergeTask : overBudgetTasksToAbortList) { + verify(mergeTask, times(0)).schedule(); + verify(mergeTask, times(0)).run(); + verify(mergeTask, times(0)).abort(); + } + for (ThreadPoolMergeScheduler.MergeTask mergeTask : overBudgetTasksToRunList) { + verify(mergeTask, times(0)).schedule(); + verify(mergeTask, times(0)).run(); + verify(mergeTask, times(0)).abort(); + } + assertThat( + threadPoolMergeExecutorService.getMergeTasksQueueLength(), + is(overBudgetTasksToAbortList.size() + overBudgetTasksToRunList.size()) + ); + assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(expectedAvailableBudget.get())); + }); + // more disk space becomes available + if (aHasMoreSpace) { + aFileStore.usableSpace += 
grantedUsableSpaceBuffer; + } else { + bFileStore.usableSpace += grantedUsableSpaceBuffer; + } + expectedAvailableBudget.set(expectedAvailableBudget.get() + grantedUsableSpaceBuffer); + // all over-budget tasks can now run because more disk space became available + assertBusy(() -> { + for (ThreadPoolMergeScheduler.MergeTask mergeTask : overBudgetTasksToRunList) { + verify(mergeTask).schedule(); + verify(mergeTask).run(); + verify(mergeTask, times(0)).abort(); + } + for (ThreadPoolMergeScheduler.MergeTask mergeTask : overBudgetTasksToAbortList) { + verify(mergeTask).schedule(); + verify(mergeTask, times(0)).run(); + verify(mergeTask).abort(); + } + assertThat(threadPoolMergeExecutorService.getMergeTasksQueueLength(), is(0)); + assertThat(threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), is(expectedAvailableBudget.get())); + }); + // let test finish cleanly + testDoneLatch.countDown(); + assertBusy(() -> { + for (ThreadPoolMergeScheduler.MergeTask mergeTask : runningMergeTasks) { + verify(mergeTask).run(); + } + for (ThreadPoolMergeScheduler.MergeTask mergeTask : abortingMergeTasks) { + verify(mergeTask).abort(); + } + assertThat( + threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), + is(availableInitialBudget + grantedUsableSpaceBuffer) + ); + assertThat(threadPoolMergeExecutorService.allDone(), is(true)); + assertThat( + threadPoolMergeExecutorService.getDiskSpaceAvailableForNewMergeTasks(), + is(availableInitialBudget + grantedUsableSpaceBuffer) + ); + }); + } + } +} diff --git a/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceTests.java b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceTests.java new file mode 100644 index 0000000000000..9b74d68326108 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeExecutorServiceTests.java @@ -0,0 +1,926 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. 
under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.engine; + +import org.elasticsearch.common.settings.ClusterSettings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.util.concurrent.ConcurrentCollections; +import org.elasticsearch.common.util.concurrent.DeterministicTaskQueue; +import org.elasticsearch.common.util.concurrent.EsExecutors; +import org.elasticsearch.env.NodeEnvironment; +import org.elasticsearch.index.engine.ThreadPoolMergeExecutorService.MergeTaskPriorityBlockingQueue; +import org.elasticsearch.index.engine.ThreadPoolMergeExecutorService.PriorityBlockingQueueWithBudget; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler.MergeTask; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule; +import org.elasticsearch.index.merge.OnGoingMerge; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.threadpool.TestThreadPool; +import org.elasticsearch.threadpool.ThreadPool; +import org.junit.After; +import org.mockito.ArgumentCaptor; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Comparator; +import java.util.IdentityHashMap; +import java.util.List; +import java.util.PriorityQueue; +import java.util.Set; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.Semaphore; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.AtomicReference; + +import static 
org.elasticsearch.index.engine.ThreadPoolMergeExecutorService.MAX_IO_RATE; +import static org.elasticsearch.index.engine.ThreadPoolMergeExecutorService.MIN_IO_RATE; +import static org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule.ABORT; +import static org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule.BACKLOG; +import static org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule.RUN; +import static org.hamcrest.Matchers.either; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.greaterThan; +import static org.hamcrest.Matchers.greaterThanOrEqualTo; +import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.lessThan; +import static org.hamcrest.Matchers.lessThanOrEqualTo; +import static org.mockito.ArgumentMatchers.anyLong; +import static org.mockito.Mockito.atLeastOnce; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +public class ThreadPoolMergeExecutorServiceTests extends ESTestCase { + + private NodeEnvironment nodeEnvironment; + + @After + public void closeNodeEnv() { + if (nodeEnvironment != null) { + nodeEnvironment.close(); + nodeEnvironment = null; + } + } + + public void testNewMergeTaskIsAbortedWhenThreadPoolIsShutdown() throws IOException { + TestThreadPool testThreadPool = new TestThreadPool("test", Settings.EMPTY); + nodeEnvironment = newNodeEnvironment(Settings.EMPTY); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService( + testThreadPool, + Settings.EMPTY, + nodeEnvironment + ); + // shutdown the thread pool + testThreadPool.shutdown(); + MergeTask mergeTask = mock(MergeTask.class); + when(mergeTask.supportsIOThrottling()).thenReturn(randomBoolean()); + assertFalse(threadPoolMergeExecutorService.submitMergeTask(mergeTask)); + verify(mergeTask).abort(); 
+ verify(mergeTask, times(0)).schedule(); + verify(mergeTask, times(0)).run(); + verify(mergeTask, times(1)).abort(); + assertTrue(threadPoolMergeExecutorService.allDone()); + } + + public void testEnqueuedAndBackloggedMergesAreStillExecutedWhenThreadPoolIsShutdown() throws Exception { + int mergeExecutorThreadCount = randomIntBetween(1, 5); + // more merges than threads so that some are enqueued + int mergesToSubmit = mergeExecutorThreadCount + randomIntBetween(1, 5); + Settings settings = Settings.builder() + .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), true) + .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) + // disable fs available disk space feature for this test + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s") + .build(); + TestThreadPool testThreadPool = new TestThreadPool("test", settings); + nodeEnvironment = newNodeEnvironment(settings); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService( + testThreadPool, + settings, + nodeEnvironment + ); + var countingListener = new CountingMergeEventListener(); + threadPoolMergeExecutorService.registerMergeEventListener(countingListener); + assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); + Semaphore runMergeSemaphore = new Semaphore(0); + ThreadPoolExecutor threadPoolExecutor = (ThreadPoolExecutor) testThreadPool.executor(ThreadPool.Names.MERGE); + AtomicInteger doneMergesCount = new AtomicInteger(0); + AtomicInteger reEnqueuedBackloggedMergesCount = new AtomicInteger(); + AtomicInteger abortedMergesCount = new AtomicInteger(); + // submit more merge tasks than there are threads so that some are enqueued + for (int i = 0; i < mergesToSubmit; i++) { + MergeTask mergeTask = mock(MergeTask.class); + when(mergeTask.supportsIOThrottling()).thenReturn(randomBoolean()); + Schedule runOrAbort = randomFrom(RUN, 
ABORT); + doAnswer(mock -> { + // merges can be backlogged, but will be re-enqueued + Schedule schedule = randomFrom(BACKLOG, runOrAbort); + if (schedule == BACKLOG) { + // reenqueue backlogged merge task + new Thread(() -> threadPoolMergeExecutorService.reEnqueueBackloggedMergeTask(mergeTask)).start(); + reEnqueuedBackloggedMergesCount.incrementAndGet(); + } + return schedule; + }).when(mergeTask).schedule(); + doAnswer(mock -> { + // wait to be signalled before completing + if (runOrAbort == ABORT) { + fail("merge task ran but it should've aborted instead"); + } + runMergeSemaphore.acquireUninterruptibly(); + doneMergesCount.incrementAndGet(); + return null; + }).when(mergeTask).run(); + doAnswer(mock -> { + // wait to be signalled before completing + if (runOrAbort == RUN) { + fail("merge task aborted but it should've ran instead"); + } + runMergeSemaphore.acquireUninterruptibly(); + doneMergesCount.incrementAndGet(); + abortedMergesCount.incrementAndGet(); + return null; + }).when(mergeTask).abort(); + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + } + // assert merges are running and enqueued + assertBusy(() -> { + // assert that there are merge tasks running concurrently at the max allowed concurrency rate + assertThat(threadPoolExecutor.getActiveCount(), is(mergeExecutorThreadCount)); + // with the other merge tasks enqueued + assertThat(threadPoolExecutor.getQueue().size(), is(mergesToSubmit - mergeExecutorThreadCount)); + }); + assertBusy( + () -> assertThat( + countingListener.queued.get(), + equalTo(threadPoolExecutor.getActiveCount() + threadPoolExecutor.getQueue().size() + reEnqueuedBackloggedMergesCount.get()) + ) + ); + // shutdown prevents new merge tasks to be enqueued but existing ones should be allowed to continue + testThreadPool.shutdown(); + // assert all executors, except the merge one, are terminated + for (String executorName : ThreadPool.THREAD_POOL_TYPES.keySet()) { + 
assertTrue(testThreadPool.executor(executorName).isShutdown()); + if (ThreadPool.Names.MERGE.equals(executorName)) { + assertFalse(testThreadPool.executor(executorName).isTerminated()); + } else { + assertTrue(testThreadPool.executor(executorName).isTerminated()); + } + } + for (int i = 0; i < mergesToSubmit; i++) { + // closing the thread pool is delayed because there are running and/or enqueued merge tasks + assertFalse(testThreadPool.awaitTermination(1, TimeUnit.NANOSECONDS)); + assertTrue(threadPoolExecutor.isShutdown()); + assertFalse(threadPoolExecutor.isTerminated()); + // let merges run one by one and check thread pool + runMergeSemaphore.release(); + int completedMergesCount = i + 1; + assertBusy(() -> { + assertThat(doneMergesCount.get(), is(completedMergesCount)); + assertThat(threadPoolExecutor.getCompletedTaskCount(), is((long) completedMergesCount)); + // active threads still working on the remaining merges + assertThat( + threadPoolExecutor.getActiveCount(), + is(Math.min(mergeExecutorThreadCount, mergesToSubmit - completedMergesCount)) + ); + // with any of the other merges still enqueued + assertThat( + threadPoolExecutor.getQueue().size(), + is(Math.max(mergesToSubmit - mergeExecutorThreadCount - completedMergesCount, 0)) + ); + }); + } + assertBusy(() -> { + assertTrue(testThreadPool.awaitTermination(1, TimeUnit.NANOSECONDS)); + assertTrue(threadPoolExecutor.isShutdown()); + assertTrue(threadPoolExecutor.isTerminated()); + assertTrue(threadPoolMergeExecutorService.allDone()); + }); + assertThat(countingListener.aborted.get() + countingListener.completed.get(), equalTo(doneMergesCount.get())); + assertThat(countingListener.aborted.get(), equalTo(abortedMergesCount.get())); + } + + public void testTargetIORateChangesWhenSubmittingMergeTasks() throws Exception { + int mergeExecutorThreadCount = randomIntBetween(1, 5); + int mergesStillToSubmit = randomIntBetween(10, 100); + int mergesStillToComplete = mergesStillToSubmit; + Settings settings = 
Settings.builder() + .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), true) + .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) + // disable fs available disk space feature for this test + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s") + .build(); + nodeEnvironment = newNodeEnvironment(settings); + try (TestThreadPool testThreadPool = new TestThreadPool("test", settings)) { + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService( + testThreadPool, + settings, + nodeEnvironment + ); + assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); + Semaphore runMergeSemaphore = new Semaphore(0); + AtomicInteger submittedIOThrottledMergeTasks = new AtomicInteger(); + while (mergesStillToComplete > 0) { + if (mergesStillToSubmit > 0 && (threadPoolMergeExecutorService.getRunningMergeTasks().isEmpty() || randomBoolean())) { + // submit new merge task + MergeTask mergeTask = mock(MergeTask.class); + boolean supportsIOThrottling = randomBoolean(); + when(mergeTask.supportsIOThrottling()).thenReturn(supportsIOThrottling); + doAnswer(mock -> { + Schedule schedule = randomFrom(Schedule.values()); + if (schedule == BACKLOG) { + testThreadPool.executor(ThreadPool.Names.GENERIC).execute(() -> { + // reenqueue backlogged merge task + threadPoolMergeExecutorService.reEnqueueBackloggedMergeTask(mergeTask); + }); + } + return schedule; + }).when(mergeTask).schedule(); + doAnswer(mock -> { + // wait to be signalled before completing + runMergeSemaphore.acquire(); + if (supportsIOThrottling) { + submittedIOThrottledMergeTasks.decrementAndGet(); + } + return null; + }).when(mergeTask).run(); + doAnswer(mock -> { + // wait to be signalled before completing + runMergeSemaphore.acquire(); + if (supportsIOThrottling) { + submittedIOThrottledMergeTasks.decrementAndGet(); + } + return null; + 
}).when(mergeTask).abort(); + long currentIORate = threadPoolMergeExecutorService.getTargetIORateBytesPerSec(); + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + if (supportsIOThrottling) { + submittedIOThrottledMergeTasks.incrementAndGet(); + } + long newIORate = threadPoolMergeExecutorService.getTargetIORateBytesPerSec(); + if (supportsIOThrottling) { + if (submittedIOThrottledMergeTasks.get() < 2) { + // assert the IO rate decreases, with a floor limit, when there is just a single merge task running + assertThat(newIORate, either(is(MIN_IO_RATE.getBytes())).or(lessThan(currentIORate))); + } else if (submittedIOThrottledMergeTasks.get() > threadPoolMergeExecutorService.getMaxConcurrentMerges() * 2) { + // assert the IO rate increases, with a ceiling limit, when there are many merge tasks enqueued + assertThat(newIORate, either(is(MAX_IO_RATE.getBytes())).or(greaterThan(currentIORate))); + } else { + // assert the IO rate does NOT change when there are a couple of merge tasks enqueued + assertThat(newIORate, equalTo(currentIORate)); + } + } else { + // assert the IO rate does not change, when the merge task doesn't support IO throttling + assertThat(newIORate, equalTo(currentIORate)); + } + mergesStillToSubmit--; + } else { + ThreadPoolExecutor threadPoolExecutor = (ThreadPoolExecutor) testThreadPool.executor(ThreadPool.Names.MERGE); + long completedMerges = threadPoolExecutor.getCompletedTaskCount(); + runMergeSemaphore.release(); + // await merge to finish + assertBusy(() -> assertThat(threadPoolExecutor.getCompletedTaskCount(), is(completedMerges + 1))); + mergesStillToComplete--; + } + } + assertBusy(() -> assertTrue(threadPoolMergeExecutorService.allDone())); + } + } + + public void testIORateIsAdjustedForRunningMergeTasks() throws Exception { + int mergeExecutorThreadCount = randomIntBetween(1, 3); + int mergesStillToSubmit = randomIntBetween(1, 10); + int mergesStillToComplete = mergesStillToSubmit; + Settings settings = Settings.builder() + 
.put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), true) + .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) + // disable fs available disk space feature for this test + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s") + .build(); + nodeEnvironment = newNodeEnvironment(settings); + try (TestThreadPool testThreadPool = new TestThreadPool("test", settings)) { + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService( + testThreadPool, + settings, + nodeEnvironment + ); + assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); + ThreadPoolExecutor threadPoolExecutor = (ThreadPoolExecutor) testThreadPool.executor(ThreadPool.Names.MERGE); + Semaphore runMergeSemaphore = new Semaphore(0); + Set currentlyRunningMergeTasksSet = ConcurrentCollections.newConcurrentSet(); + while (mergesStillToComplete > 0) { + if (mergesStillToSubmit > 0 && (currentlyRunningMergeTasksSet.isEmpty() || randomBoolean())) { + MergeTask mergeTask = mock(MergeTask.class); + // all tasks support IO throttling in this test case + when(mergeTask.supportsIOThrottling()).thenReturn(true); + doAnswer(mock -> { + Schedule schedule = randomFrom(Schedule.values()); + if (schedule == BACKLOG) { + testThreadPool.executor(ThreadPool.Names.GENERIC).execute(() -> { + // reenqueue backlogged merge task + threadPoolMergeExecutorService.reEnqueueBackloggedMergeTask(mergeTask); + }); + } + return schedule; + }).when(mergeTask).schedule(); + doAnswer(mock -> { + currentlyRunningMergeTasksSet.add(mergeTask); + // wait to be signalled before completing + runMergeSemaphore.acquire(); + currentlyRunningMergeTasksSet.remove(mergeTask); + return null; + }).when(mergeTask).run(); + doAnswer(mock -> { + // wait to be signalled before completing + runMergeSemaphore.acquire(); + return null; + }).when(mergeTask).abort(); + int 
activeMergeTasksCount = threadPoolExecutor.getActiveCount(); + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + long newIORate = threadPoolMergeExecutorService.getTargetIORateBytesPerSec(); + // all currently running merge tasks must be IO throttled + assertBusy(() -> { + // await new merge to start executing + if (activeMergeTasksCount < mergeExecutorThreadCount) { + assertThat(threadPoolExecutor.getActiveCount(), is(activeMergeTasksCount + 1)); + } + // assert IO throttle is set on the running merge tasks + for (MergeTask currentlyRunningMergeTask : currentlyRunningMergeTasksSet) { + var ioRateCaptor = ArgumentCaptor.forClass(Long.class); + // only interested in the last invocation + verify(currentlyRunningMergeTask, atLeastOnce()).setIORateLimit(ioRateCaptor.capture()); + assertThat(ioRateCaptor.getValue(), is(newIORate)); + } + }); + mergesStillToSubmit--; + } else { + long completedMerges = threadPoolExecutor.getCompletedTaskCount(); + runMergeSemaphore.release(); + // await merge to finish + assertBusy(() -> assertThat(threadPoolExecutor.getCompletedTaskCount(), is(completedMerges + 1))); + mergesStillToComplete--; + } + } + assertBusy(() -> assertTrue(threadPoolMergeExecutorService.allDone())); + } + } + + public void testIORateAdjustedForSubmittedTasksWhenExecutionRateIsSpeedy() throws IOException { + // the executor runs merge tasks at a faster rate than the rate that merge tasks are submitted + int submittedVsExecutedRateOutOf1000 = randomIntBetween(0, 250); + testIORateAdjustedForSubmittedTasks(randomIntBetween(50, 1000), submittedVsExecutedRateOutOf1000, randomIntBetween(0, 5)); + // executor starts running merges only after a considerable amount of merge tasks have already been submitted + testIORateAdjustedForSubmittedTasks(randomIntBetween(50, 1000), submittedVsExecutedRateOutOf1000, randomIntBetween(5, 50)); + } + + public void testIORateAdjustedForSubmittedTasksWhenExecutionRateIsSluggish() throws IOException { + // the executor runs 
merge tasks at a faster rate than the rate that merge tasks are submitted + int submittedVsExecutedRateOutOf1000 = randomIntBetween(750, 1000); + testIORateAdjustedForSubmittedTasks(randomIntBetween(50, 1000), submittedVsExecutedRateOutOf1000, randomIntBetween(0, 5)); + // executor starts running merges only after a considerable amount of merge tasks have already been submitted + testIORateAdjustedForSubmittedTasks(randomIntBetween(50, 1000), submittedVsExecutedRateOutOf1000, randomIntBetween(5, 50)); + } + + public void testIORateAdjustedForSubmittedTasksWhenExecutionRateIsOnPar() throws IOException { + // the executor runs merge tasks at a faster rate than the rate that merge tasks are submitted + int submittedVsExecutedRateOutOf1000 = randomIntBetween(250, 750); + testIORateAdjustedForSubmittedTasks(randomIntBetween(50, 1000), submittedVsExecutedRateOutOf1000, randomIntBetween(0, 5)); + // executor starts running merges only after a considerable amount of merge tasks have already been submitted + testIORateAdjustedForSubmittedTasks(randomIntBetween(50, 1000), submittedVsExecutedRateOutOf1000, randomIntBetween(5, 50)); + } + + private void testIORateAdjustedForSubmittedTasks(int totalTasksToSubmit, int submittedVsExecutedRateOutOf1000, int initialTasksToSubmit) + throws IOException { + DeterministicTaskQueue mergeExecutorTaskQueue = new DeterministicTaskQueue(); + ThreadPool mergeExecutorThreadPool = mergeExecutorTaskQueue.getThreadPool(); + Settings settings = Settings.builder() + // disable fs available disk space feature for this test + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s") + .build(); + if (nodeEnvironment != null) { + nodeEnvironment.close(); + nodeEnvironment = null; + } + nodeEnvironment = newNodeEnvironment(settings); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService( + mergeExecutorThreadPool, + settings, + nodeEnvironment + ); + final AtomicInteger 
currentlySubmittedMergeTaskCount = new AtomicInteger(); + final AtomicLong targetIORateLimit = new AtomicLong(ThreadPoolMergeExecutorService.START_IO_RATE.getBytes()); + final AtomicReference lastRunTask = new AtomicReference<>(); + int initialTasksCounter = Math.min(initialTasksToSubmit, totalTasksToSubmit); + while (totalTasksToSubmit > 0 || mergeExecutorTaskQueue.hasAnyTasks()) { + if (mergeExecutorTaskQueue.hasAnyTasks() == false // always submit if there are no outstanding merge tasks + || initialTasksCounter > 0 // first submit all the initial tasks + || (randomIntBetween(0, 1000) < submittedVsExecutedRateOutOf1000 && totalTasksToSubmit > 0)) { + // submit new merge task + MergeTask mergeTask = mock(MergeTask.class); + // all merge tasks support IO throttling in this test + when(mergeTask.supportsIOThrottling()).thenReturn(true); + // always run the task + when(mergeTask.schedule()).thenReturn(RUN); + doAnswer(mock -> { + lastRunTask.set(mergeTask); + return null; + }).when(mergeTask).run(); + currentlySubmittedMergeTaskCount.incrementAndGet(); + totalTasksToSubmit--; + initialTasksCounter--; + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + long newTargetIORateLimit = threadPoolMergeExecutorService.getTargetIORateBytesPerSec(); + if (currentlySubmittedMergeTaskCount.get() < 2) { + // assert the IO rate decreases, with a floor limit, when there are few merge tasks enqueued + assertThat(newTargetIORateLimit, either(is(MIN_IO_RATE.getBytes())).or(lessThan(targetIORateLimit.get()))); + } else if (currentlySubmittedMergeTaskCount.get() > threadPoolMergeExecutorService.getMaxConcurrentMerges() * 2) { + // assert the IO rate increases, with a ceiling limit, when there are many merge tasks enqueued + assertThat(newTargetIORateLimit, either(is(MAX_IO_RATE.getBytes())).or(greaterThan(targetIORateLimit.get()))); + } else { + // assert the IO rate does not change, when there are a couple of merge tasks enqueued + assertThat(newTargetIORateLimit, 
equalTo(targetIORateLimit.get())); + } + targetIORateLimit.set(newTargetIORateLimit); + } else { + // execute already submitted merge task + if (runOneTask(mergeExecutorTaskQueue)) { + // task is done, no longer just submitted + currentlySubmittedMergeTaskCount.decrementAndGet(); + // assert IO rate is invoked on the merge task that just ran + assertNotNull(lastRunTask.get()); + var ioRateCaptor = ArgumentCaptor.forClass(Long.class); + verify(lastRunTask.get(), times(1)).setIORateLimit(ioRateCaptor.capture()); + assertThat(ioRateCaptor.getValue(), is(targetIORateLimit.get())); + lastRunTask.set(null); + } + } + } + assertTrue(threadPoolMergeExecutorService.allDone()); + } + + public void testMergeTasksRunConcurrently() throws Exception { + // at least 2 merges allowed to run concurrently + int mergeExecutorThreadCount = randomIntBetween(2, 5); + Settings settings = Settings.builder() + .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), true) + .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) + // disable fs available disk space feature for this test + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s") + .build(); + nodeEnvironment = newNodeEnvironment(settings); + try (TestThreadPool testThreadPool = new TestThreadPool("test", settings)) { + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService( + testThreadPool, + settings, + nodeEnvironment + ); + assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); + // more merge tasks than max concurrent merges allowed to run concurrently + int totalMergeTasksCount = mergeExecutorThreadCount + randomIntBetween(1, 5); + Semaphore runMergeSemaphore = new Semaphore(0); + ThreadPoolExecutor threadPoolExecutor = (ThreadPoolExecutor) testThreadPool.executor(ThreadPool.Names.MERGE); + // submit all merge tasks + for (int i = 0; i < 
totalMergeTasksCount; i++) { + MergeTask mergeTask = mock(MergeTask.class); + when(mergeTask.supportsIOThrottling()).thenReturn(randomBoolean()); + doAnswer(mock -> { + // each individual merge task can either "run" or be "backlogged" + Schedule schedule = randomFrom(RUN, BACKLOG); + if (schedule == BACKLOG) { + testThreadPool.executor(ThreadPool.Names.GENERIC).execute(() -> { + // reenqueue backlogged merge task + threadPoolMergeExecutorService.reEnqueueBackloggedMergeTask(mergeTask); + }); + } + return schedule; + }).when(mergeTask).schedule(); + doAnswer(mock -> { + // wait to be signalled before completing + runMergeSemaphore.acquire(); + return null; + }).when(mergeTask).run(); + doAnswer(mock -> { + fail("This test doesn't deal with aborted merge tasks"); + return null; + }).when(mergeTask).abort(); + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + } + // assert stats while merge tasks finish + for (int completedTasksCount = 0; completedTasksCount < totalMergeTasksCount + - mergeExecutorThreadCount; completedTasksCount++) { + int finalCompletedTasksCount = completedTasksCount; + assertBusy(() -> { + // assert that there are merge tasks running concurrently at the max allowed concurrency rate + assertThat(threadPoolMergeExecutorService.getRunningMergeTasks().size(), is(mergeExecutorThreadCount)); + // with the other merge tasks enqueued + assertThat( + threadPoolMergeExecutorService.getMergeTasksQueueLength(), + is(totalMergeTasksCount - mergeExecutorThreadCount - finalCompletedTasksCount) + ); + // also check thread-pool stats for the same + assertThat(threadPoolExecutor.getActiveCount(), is(mergeExecutorThreadCount)); + assertThat( + threadPoolExecutor.getQueue().size(), + is(totalMergeTasksCount - mergeExecutorThreadCount - finalCompletedTasksCount) + ); + }); + // let one merge task finish running + runMergeSemaphore.release(); + } + // there are now fewer merge tasks still running than available threads + for (int remainingMergeTasksCount = 
mergeExecutorThreadCount; remainingMergeTasksCount >= 0; remainingMergeTasksCount--) { + int finalRemainingMergeTasksCount = remainingMergeTasksCount; + assertBusy(() -> { + // there are fewer available merges than available threads + assertThat(threadPoolMergeExecutorService.getRunningMergeTasks().size(), is(finalRemainingMergeTasksCount)); + // no more merges enqueued + assertThat(threadPoolMergeExecutorService.getMergeTasksQueueLength(), is(0)); + // also check thread-pool stats for the same + assertThat(threadPoolExecutor.getActiveCount(), is(finalRemainingMergeTasksCount)); + assertThat(threadPoolExecutor.getQueue().size(), is(0)); + }); + // let one merge task finish running + runMergeSemaphore.release(); + } + assertBusy(() -> assertTrue(threadPoolMergeExecutorService.allDone())); + } + } + + public void testThreadPoolStatsWithBackloggedMergeTasks() throws Exception { + int mergeExecutorThreadCount = randomIntBetween(1, 3); + Settings settings = Settings.builder() + .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), true) + .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) + // disable fs available disk space feature for this test + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s") + .build(); + nodeEnvironment = newNodeEnvironment(settings); + try (TestThreadPool testThreadPool = new TestThreadPool("test", settings)) { + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService( + testThreadPool, + settings, + nodeEnvironment + ); + assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); + int totalMergeTasksCount = randomIntBetween(1, 10); + ThreadPoolExecutor threadPoolExecutor = (ThreadPoolExecutor) testThreadPool.executor(ThreadPool.Names.MERGE); + List backloggedMergeTasksList = new ArrayList<>(); + for (int i = 0; i < totalMergeTasksCount; i++) { + MergeTask 
mergeTask = mock(MergeTask.class); + when(mergeTask.supportsIOThrottling()).thenReturn(randomBoolean()); + boolean runNowOrBacklog = randomBoolean(); + if (runNowOrBacklog) { + when(mergeTask.schedule()).thenReturn(randomFrom(RUN, ABORT)); + } else { + // first backlog, then run + when(mergeTask.schedule()).thenReturn(BACKLOG, randomFrom(RUN, ABORT)); + backloggedMergeTasksList.add(mergeTask); + } + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + } + assertBusy(() -> { + // all runnable merge tasks should show as "completed" + assertThat(threadPoolExecutor.getCompletedTaskCount(), is((long) (totalMergeTasksCount - backloggedMergeTasksList.size()))); + if (backloggedMergeTasksList.size() >= mergeExecutorThreadCount) { + // active tasks waiting for backlogged merge tasks to be re-enqueued + assertThat(threadPoolExecutor.getActiveCount(), is(mergeExecutorThreadCount)); + assertThat(threadPoolExecutor.getQueue().size(), is(backloggedMergeTasksList.size() - mergeExecutorThreadCount)); + } else { + assertThat(threadPoolExecutor.getActiveCount(), is(backloggedMergeTasksList.size())); + assertThat(threadPoolExecutor.getQueue().size(), is(0)); + } + assertThat(threadPoolMergeExecutorService.getMergeTasksQueueLength(), is(0)); + }); + // re-enqueue backlogged merge tasks + for (MergeTask backloggedMergeTask : backloggedMergeTasksList) { + threadPoolMergeExecutorService.reEnqueueBackloggedMergeTask(backloggedMergeTask); + } + assertBusy(() -> { + // all merge tasks should now show as "completed" + assertThat(threadPoolExecutor.getCompletedTaskCount(), is((long) totalMergeTasksCount)); + assertThat(threadPoolExecutor.getActiveCount(), is(0)); + assertThat(threadPoolExecutor.getQueue().size(), is(0)); + assertTrue(threadPoolMergeExecutorService.allDone()); + }); + } + } + + public void testBackloggedMergeTasksExecuteExactlyOnce() throws Exception { + int mergeExecutorThreadCount = randomIntBetween(1, 3); + Settings settings = Settings.builder() + 
.put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), true) + // few merge threads, in order to increase contention + .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) + // disable fs available disk space feature for this test + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s") + .build(); + nodeEnvironment = newNodeEnvironment(settings); + try (TestThreadPool testThreadPool = new TestThreadPool("test", settings)) { + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService( + testThreadPool, + settings, + nodeEnvironment + ); + assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); + // many merge tasks concurrently + int mergeTaskCount = randomIntBetween(10, 100); + CountDownLatch mergeTasksReadyLatch = new CountDownLatch(mergeTaskCount); + CountDownLatch submitTaskLatch = new CountDownLatch(1); + Collection runMergeTasks = ConcurrentCollections.newConcurrentSet(); + Collection abortMergeTasks = ConcurrentCollections.newConcurrentSet(); + for (int i = 0; i < mergeTaskCount; i++) { + new Thread(() -> { + MergeTask mergeTask = mock(MergeTask.class); + when(mergeTask.supportsIOThrottling()).thenReturn(randomBoolean()); + doAnswer(mock -> { + // each individual merge task can either "run" or be "backlogged" + Schedule schedule = randomFrom(RUN, ABORT, BACKLOG); + if (schedule == BACKLOG) { + testThreadPool.executor(ThreadPool.Names.GENERIC).execute(() -> { + // reenqueue backlogged merge task + threadPoolMergeExecutorService.reEnqueueBackloggedMergeTask(mergeTask); + }); + } + if (schedule == RUN) { + runMergeTasks.add(mergeTask); + } + if (schedule == ABORT) { + abortMergeTasks.add(mergeTask); + } + return schedule; + }).when(mergeTask).schedule(); + mergeTasksReadyLatch.countDown(); + // make all threads submit merge tasks at once + safeAwait(submitTaskLatch); + 
threadPoolMergeExecutorService.submitMergeTask(mergeTask); + }).start(); + } + safeAwait(mergeTasksReadyLatch); + submitTaskLatch.countDown(); + assertBusy(() -> { + assertThat(runMergeTasks.size() + abortMergeTasks.size(), is(mergeTaskCount)); + for (MergeTask mergeTask : runMergeTasks) { + verify(mergeTask, times(1)).run(); + verify(mergeTask, times(0)).abort(); + if (mergeTask.supportsIOThrottling() == false) { + verify(mergeTask, times(0)).setIORateLimit(anyLong()); + } + } + for (MergeTask mergeTask : abortMergeTasks) { + verify(mergeTask, times(0)).run(); + verify(mergeTask, times(1)).abort(); + verify(mergeTask, times(0)).setIORateLimit(anyLong()); + } + assertTrue(threadPoolMergeExecutorService.allDone()); + }); + } + } + + public void testMergeTasksExecuteInSizeOrder() throws IOException { + DeterministicTaskQueue mergeExecutorTaskQueue = new DeterministicTaskQueue(); + ThreadPool mergeExecutorThreadPool = mergeExecutorTaskQueue.getThreadPool(); + Settings settings = Settings.builder() + // disable fs available disk space feature for this test + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s") + .build(); + nodeEnvironment = newNodeEnvironment(settings); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = getThreadPoolMergeExecutorService( + mergeExecutorThreadPool, + settings, + nodeEnvironment + ); + DeterministicTaskQueue reEnqueueBackloggedTaskQueue = new DeterministicTaskQueue(); + int mergeTaskCount = randomIntBetween(10, 100); + // sort merge tasks available to run by size + PriorityQueue mergeTasksAvailableToRun = new PriorityQueue<>( + mergeTaskCount, + Comparator.comparingLong(MergeTask::estimatedRemainingMergeSize) + ); + for (int i = 0; i < mergeTaskCount; i++) { + MergeTask mergeTask = mock(MergeTask.class); + when(mergeTask.supportsIOThrottling()).thenReturn(randomBoolean()); + // merge tasks of various sizes (0 might be a valid value) + 
when(mergeTask.estimatedRemainingMergeSize()).thenReturn(randomLongBetween(0, 10)); + doAnswer(mock -> { + // each individual merge task can either "run" or be "backlogged" at any point in time + Schedule schedule = randomFrom(Schedule.values()); + // in either case, the merge task is, at least temporarily, not "available" to run + mergeTasksAvailableToRun.remove(mergeTask); + // if merge task cannot run, it is backlogged, and should be re enqueued some time in the future + if (schedule == BACKLOG) { + // reenqueue backlogged merge task sometime in the future + reEnqueueBackloggedTaskQueue.scheduleNow(() -> { + // reenqueue backlogged merge task sometime in the future + threadPoolMergeExecutorService.reEnqueueBackloggedMergeTask(mergeTask); + // the merge task should once again be "available" to run + mergeTasksAvailableToRun.add(mergeTask); + }); + } + // hack: avoid blocking for unavailable merge task by running one re-enqueuing task now + if (schedule == BACKLOG && mergeTasksAvailableToRun.isEmpty()) { + assertTrue(runOneTask(reEnqueueBackloggedTaskQueue)); + } + if (schedule == RUN && mergeTasksAvailableToRun.isEmpty() == false) { + // assert the merge task that's now going to run is the smallest of the ones currently available to run + assertThat( + mergeTask.estimatedRemainingMergeSize(), + lessThanOrEqualTo(mergeTasksAvailableToRun.peek().estimatedRemainingMergeSize()) + ); + } + return schedule; + }).when(mergeTask).schedule(); + mergeTasksAvailableToRun.add(mergeTask); + threadPoolMergeExecutorService.submitMergeTask(mergeTask); + } + while (true) { + // re-enqueue merge tasks + if (mergeTasksAvailableToRun.isEmpty() || randomBoolean()) { + boolean backlogReEnqueued = runOneTask(reEnqueueBackloggedTaskQueue); + if (mergeTasksAvailableToRun.isEmpty() && backlogReEnqueued == false) { + // test complete, all merges ran, and none is backlogged + assertFalse(mergeExecutorTaskQueue.hasAnyTasks()); + assertFalse(reEnqueueBackloggedTaskQueue.hasAnyTasks()); + 
assertTrue(threadPoolMergeExecutorService.allDone()); + break; + } + } else { + // run one merge task + runOneTask(mergeExecutorTaskQueue); + } + } + } + + public void testMergeTaskQueueAvailableBudgetTracking() throws Exception { + MergeTaskPriorityBlockingQueue mergeTaskPriorityBlockingQueue = new MergeTaskPriorityBlockingQueue(); + assertThat(mergeTaskPriorityBlockingQueue.getAvailableBudget(), is(0L)); + long availableBudget = randomLongBetween(1, 10); + mergeTaskPriorityBlockingQueue.updateBudget(availableBudget); + assertThat(mergeTaskPriorityBlockingQueue.getAvailableBudget(), is(availableBudget)); + + int taskCount = randomIntBetween(5, 15); + for (int i = 0; i < taskCount; i++) { + MergeTask mergeTask = mock(MergeTask.class); + when(mergeTask.estimatedRemainingMergeSize()).thenReturn(randomLongBetween(1, 10)); + mergeTaskPriorityBlockingQueue.enqueue(mergeTask); + } + assertThat(mergeTaskPriorityBlockingQueue.queueSize(), is(taskCount)); + assertThat(mergeTaskPriorityBlockingQueue.getAvailableBudget(), is(availableBudget)); + + List.ElementWithReleasableBudget> tookElements = new ArrayList<>(); + + while (mergeTaskPriorityBlockingQueue.isQueueEmpty() == false) { + if (mergeTaskPriorityBlockingQueue.peekQueue().estimatedRemainingMergeSize() <= mergeTaskPriorityBlockingQueue + .getAvailableBudget() && randomBoolean()) { + // take another element (merge task) from the queue + long prevBudget = mergeTaskPriorityBlockingQueue.getAvailableBudget(); + tookElements.add(mergeTaskPriorityBlockingQueue.take()); + long afterBudget = mergeTaskPriorityBlockingQueue.getAvailableBudget(); + assertThat(afterBudget, greaterThanOrEqualTo(0L)); + assertThat(prevBudget - afterBudget, is(tookElements.getLast().element().estimatedRemainingMergeSize())); + } else if (tookElements.stream().anyMatch(e -> e.isClosed() == false) && randomBoolean()) { + // "closes" a previously took element to simulate it has gone out of scope + int index = randomValueOtherThanMany( + i -> 
tookElements.get(i).isClosed(), + () -> randomIntBetween(0, tookElements.size() - 1) + ); + var elementToClose = tookElements.remove(index); + long prevBudget = mergeTaskPriorityBlockingQueue.getAvailableBudget(); + elementToClose.close(); + long afterBudget = mergeTaskPriorityBlockingQueue.getAvailableBudget(); + // budget hasn't yet changed, the update budget method needs to be invoked before it does + assertThat(afterBudget, is(prevBudget)); + } else if (randomBoolean()) { + // update (possibly increment) the available budget + long budgetIncrement = randomLongBetween(0, 3); + availableBudget += budgetIncrement; + mergeTaskPriorityBlockingQueue.updateBudget(availableBudget); + // "closed" took elements should not impact budget computation + tookElements.removeIf(PriorityBlockingQueueWithBudget.ElementWithReleasableBudget::isClosed); + long expectedBudget = availableBudget - tookElements.stream() + .mapToLong(e -> e.element().estimatedRemainingMergeSize()) + .sum(); + long afterBudget = mergeTaskPriorityBlockingQueue.getAvailableBudget(); + assertThat(afterBudget, is(expectedBudget)); + } + } + } + + public void testMergeTaskQueueBudgetTrackingWhenEstimatedRemainingMergeSizeChanges() throws Exception { + MergeTaskPriorityBlockingQueue mergeTaskPriorityBlockingQueue = new MergeTaskPriorityBlockingQueue(); + assertThat(mergeTaskPriorityBlockingQueue.getAvailableBudget(), is(0L)); + // plenty of available budget (this should be fixed for this test) + final long availableBudget = randomLongBetween(1000L, 2000L); + mergeTaskPriorityBlockingQueue.updateBudget(availableBudget); + assertThat(mergeTaskPriorityBlockingQueue.getAvailableBudget(), is(availableBudget)); + + IdentityHashMap budgetMap = new IdentityHashMap<>(); + int taskCount = randomIntBetween(5, 15); + for (int i = 0; i < taskCount; i++) { + MergeTask mergeTask = mock(MergeTask.class); + budgetMap.put(mergeTask, randomLongBetween(1L, 10L)); + doAnswer(invocation -> budgetMap.get((MergeTask) 
invocation.getMock())).when(mergeTask).estimatedRemainingMergeSize(); + mergeTaskPriorityBlockingQueue.enqueue(mergeTask); + } + assertThat(mergeTaskPriorityBlockingQueue.queueSize(), is(taskCount)); + assertThat(mergeTaskPriorityBlockingQueue.getAvailableBudget(), is(availableBudget)); + + List.ElementWithReleasableBudget> tookElements = new ArrayList<>(); + + while (mergeTaskPriorityBlockingQueue.isQueueEmpty() == false) { + if (tookElements.stream().allMatch(PriorityBlockingQueueWithBudget.ElementWithReleasableBudget::isClosed) || randomBoolean()) { + // take another element (merge task) from the queue + long prevBudget = mergeTaskPriorityBlockingQueue.getAvailableBudget(); + tookElements.add(mergeTaskPriorityBlockingQueue.take()); + long afterBudget = mergeTaskPriorityBlockingQueue.getAvailableBudget(); + assertThat(afterBudget, greaterThanOrEqualTo(0L)); + assertThat(prevBudget - afterBudget, is(tookElements.getLast().element().estimatedRemainingMergeSize())); + } else if (randomBoolean()) { + // "closes" a previously took element to simulate it has gone out of scope + int index = randomValueOtherThanMany( + i -> tookElements.get(i).isClosed(), + () -> randomIntBetween(0, tookElements.size() - 1) + ); + var elementToClose = tookElements.remove(index); + long prevBudget = mergeTaskPriorityBlockingQueue.getAvailableBudget(); + elementToClose.close(); + long afterBudget = mergeTaskPriorityBlockingQueue.getAvailableBudget(); + // budget hasn't yet changed, the update budget method needs to be invoked before it does + assertThat(afterBudget, is(prevBudget)); + } else { + // update the remaining merge size of a took (but not "closed") merge task + int index = randomValueOtherThanMany( + i -> tookElements.get(i).isClosed(), + () -> randomIntBetween(0, tookElements.size() - 1) + ); + var elementToUpdate = tookElements.get(index); + long prevElementBudget = elementToUpdate.element().estimatedRemainingMergeSize(); + long afterElementBudget = 
randomValueOtherThan(prevElementBudget, () -> randomLongBetween(1L, 10L)); + budgetMap.put(elementToUpdate.element(), afterElementBudget); + assertThat(elementToUpdate.element().estimatedRemainingMergeSize(), is(afterElementBudget)); + // "closed" took elements should not impact budget computation + tookElements.removeIf(PriorityBlockingQueueWithBudget.ElementWithReleasableBudget::isClosed); + long expectedBudget = availableBudget - tookElements.stream().mapToLong(e -> budgetMap.get(e.element())).sum(); + mergeTaskPriorityBlockingQueue.updateBudget(availableBudget); + long afterBudget = mergeTaskPriorityBlockingQueue.getAvailableBudget(); + assertThat(afterBudget, is(expectedBudget)); + } + } + } + + private static class CountingMergeEventListener implements MergeEventListener { + AtomicInteger queued = new AtomicInteger(); + AtomicInteger aborted = new AtomicInteger(); + AtomicInteger completed = new AtomicInteger(); + + @Override + public void onMergeQueued(OnGoingMerge merge, long estimateMergeMemoryBytes) { + queued.incrementAndGet(); + } + + @Override + public void onMergeCompleted(OnGoingMerge merge) { + completed.incrementAndGet(); + } + + @Override + public void onMergeAborted(OnGoingMerge merge) { + aborted.incrementAndGet(); + } + } + + static ThreadPoolMergeExecutorService getThreadPoolMergeExecutorService( + ThreadPool threadPool, + Settings settings, + NodeEnvironment nodeEnvironment + ) { + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = ThreadPoolMergeExecutorService + .maybeCreateThreadPoolMergeExecutorService(threadPool, ClusterSettings.createBuiltInClusterSettings(settings), nodeEnvironment); + assertNotNull(threadPoolMergeExecutorService); + assertTrue(threadPoolMergeExecutorService.allDone()); + return threadPoolMergeExecutorService; + } + + private static boolean runOneTask(DeterministicTaskQueue deterministicTaskQueue) { + while (deterministicTaskQueue.hasAnyTasks()) { + if (deterministicTaskQueue.hasRunnableTasks()) { + 
deterministicTaskQueue.runRandomTask(); + return true; + } else { + deterministicTaskQueue.advanceTime(); + } + } + return false; + } +} diff --git a/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java new file mode 100644 index 0000000000000..156dcf581ec9c --- /dev/null +++ b/server/src/test/java/org/elasticsearch/index/engine/ThreadPoolMergeSchedulerTests.java @@ -0,0 +1,771 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.engine; + +import org.apache.lucene.index.MergePolicy; +import org.apache.lucene.index.MergePolicy.OneMerge; +import org.apache.lucene.index.MergeScheduler.MergeSource; +import org.apache.lucene.index.MergeTrigger; +import org.apache.lucene.store.MergeInfo; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.util.concurrent.DeterministicTaskQueue; +import org.elasticsearch.common.util.concurrent.EsExecutors; +import org.elasticsearch.env.NodeEnvironment; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.MergeSchedulerConfig; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler.MergeTask; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler.Schedule; +import org.elasticsearch.index.shard.ShardId; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.test.IndexSettingsModule; +import org.elasticsearch.threadpool.TestThreadPool; +import 
org.elasticsearch.threadpool.ThreadPool; +import org.junit.After; +import org.mockito.ArgumentCaptor; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.PriorityQueue; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.Semaphore; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; + +import static org.hamcrest.Matchers.contains; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.lessThanOrEqualTo; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.verifyNoInteractions; +import static org.mockito.Mockito.when; + +public class ThreadPoolMergeSchedulerTests extends ESTestCase { + + private NodeEnvironment nodeEnvironment; + + @After + public void closeNodeEnv() { + if (nodeEnvironment != null) { + nodeEnvironment.close(); + nodeEnvironment = null; + } + } + + public void testMergesExecuteInSizeOrder() throws IOException { + DeterministicTaskQueue threadPoolTaskQueue = new DeterministicTaskQueue(); + Settings settings = Settings.builder() + // disable fs available disk space feature for this test + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s") + .build(); + nodeEnvironment = newNodeEnvironment(settings); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = ThreadPoolMergeExecutorServiceTests + .getThreadPoolMergeExecutorService(threadPoolTaskQueue.getThreadPool(), settings, nodeEnvironment); + try ( + ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + 
IndexSettingsModule.newIndexSettings("index", Settings.EMPTY), + threadPoolMergeExecutorService, + merge -> 0 + ) + ) { + List executedMergesList = new ArrayList<>(); + int mergeCount = randomIntBetween(2, 10); + for (int i = 0; i < mergeCount; i++) { + MergeSource mergeSource = mock(MergeSource.class); + OneMerge oneMerge = mock(OneMerge.class); + when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomLongBetween(1L, 10L))); + when(oneMerge.getMergeProgress()).thenReturn(new MergePolicy.OneMergeProgress()); + when(mergeSource.getNextMerge()).thenReturn(oneMerge, (OneMerge) null); + doAnswer(invocation -> { + OneMerge merge = (OneMerge) invocation.getArguments()[0]; + assertFalse(merge.isAborted()); + executedMergesList.add(merge); + return null; + }).when(mergeSource).merge(any(OneMerge.class)); + threadPoolMergeScheduler.merge(mergeSource, randomFrom(MergeTrigger.values())); + } + threadPoolTaskQueue.runAllTasks(); + assertThat(executedMergesList.size(), is(mergeCount)); + // assert merges are executed in ascending size order + for (int i = 1; i < mergeCount; i++) { + assertThat( + executedMergesList.get(i - 1).getStoreMergeInfo().estimatedMergeBytes(), + lessThanOrEqualTo(executedMergesList.get(i).getStoreMergeInfo().estimatedMergeBytes()) + ); + } + } + assertTrue(threadPoolMergeExecutorService.allDone()); + } + + public void testSimpleMergeTaskBacklogging() { + int mergeExecutorThreadCount = randomIntBetween(1, 5); + Settings mergeSchedulerSettings = Settings.builder() + .put(MergeSchedulerConfig.MAX_THREAD_COUNT_SETTING.getKey(), mergeExecutorThreadCount) + .build(); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = mock(ThreadPoolMergeExecutorService.class); + // close method waits for running merges to finish, but this test leaves running merges around + ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + IndexSettingsModule.newIndexSettings("index", 
mergeSchedulerSettings), + threadPoolMergeExecutorService, + merge -> 0 + ); + // more merge tasks than merge threads + int mergeCount = mergeExecutorThreadCount + randomIntBetween(1, 5); + for (int i = 0; i < mergeCount; i++) { + MergeSource mergeSource = mock(MergeSource.class); + OneMerge oneMerge = mock(OneMerge.class); + when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomLongBetween(1L, 10L))); + when(oneMerge.getMergeProgress()).thenReturn(new MergePolicy.OneMergeProgress()); + when(mergeSource.getNextMerge()).thenReturn(oneMerge, (OneMerge) null); + Schedule schedule = threadPoolMergeScheduler.schedule( + threadPoolMergeScheduler.newMergeTask(mergeSource, oneMerge, randomFrom(MergeTrigger.values())) + ); + if (i < mergeExecutorThreadCount) { + assertThat(schedule, is(Schedule.RUN)); + } else { + assertThat(schedule, is(Schedule.BACKLOG)); + } + } + assertThat(threadPoolMergeScheduler.getRunningMergeTasks().size(), is(mergeExecutorThreadCount)); + assertThat(threadPoolMergeScheduler.getBackloggedMergeTasks().size(), is(mergeCount - mergeExecutorThreadCount)); + } + + public void testSimpleMergeTaskReEnqueueingBySize() { + int mergeExecutorThreadCount = randomIntBetween(1, 5); + Settings mergeSchedulerSettings = Settings.builder() + .put(MergeSchedulerConfig.MAX_THREAD_COUNT_SETTING.getKey(), mergeExecutorThreadCount) + .build(); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = mock(ThreadPoolMergeExecutorService.class); + // close method waits for running merges to finish, but this test leaves running merges around + ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + IndexSettingsModule.newIndexSettings("index", mergeSchedulerSettings), + threadPoolMergeExecutorService, + merge -> 0 + ); + // sort backlogged merges by size + PriorityQueue backloggedMergeTasks = new PriorityQueue<>( + 16, + Comparator.comparingLong(MergeTask::estimatedRemainingMergeSize) + ); 
+ // more merge tasks than merge threads + int mergeCount = mergeExecutorThreadCount + randomIntBetween(2, 10); + for (int i = 0; i < mergeCount; i++) { + MergeSource mergeSource = mock(MergeSource.class); + OneMerge oneMerge = mock(OneMerge.class); + when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomLongBetween(1L, 10L))); + when(oneMerge.getMergeProgress()).thenReturn(new MergePolicy.OneMergeProgress()); + when(mergeSource.getNextMerge()).thenReturn(oneMerge, (OneMerge) null); + MergeTask mergeTask = threadPoolMergeScheduler.newMergeTask(mergeSource, oneMerge, randomFrom(MergeTrigger.values())); + Schedule schedule = threadPoolMergeScheduler.schedule(mergeTask); + if (i < mergeExecutorThreadCount) { + assertThat(schedule, is(Schedule.RUN)); + } else { + assertThat(schedule, is(Schedule.BACKLOG)); + backloggedMergeTasks.add(mergeTask); + } + } + assertThat(threadPoolMergeScheduler.getRunningMergeTasks().size(), is(mergeExecutorThreadCount)); + assertThat(threadPoolMergeScheduler.getBackloggedMergeTasks().size(), is(backloggedMergeTasks.size())); + int enqueuedTasksCount = mergeCount - mergeExecutorThreadCount; + for (int i = 0; i < enqueuedTasksCount; i++) { + assertThat(threadPoolMergeScheduler.getBackloggedMergeTasks().size(), is(enqueuedTasksCount - i)); + MergeTask runningMergeTask = randomFrom(threadPoolMergeScheduler.getRunningMergeTasks().values()); + runningMergeTask.run(); + var submittedMergeTaskCaptor = ArgumentCaptor.forClass(MergeTask.class); + verify(threadPoolMergeExecutorService, times(i + 1)).reEnqueueBackloggedMergeTask(submittedMergeTaskCaptor.capture()); + assertThat(submittedMergeTaskCaptor.getValue(), is(backloggedMergeTasks.poll())); + Schedule schedule = threadPoolMergeScheduler.schedule(submittedMergeTaskCaptor.getValue()); + assertThat(schedule, is(Schedule.RUN)); + assertThat(threadPoolMergeScheduler.getRunningMergeTasks().size(), is(mergeExecutorThreadCount)); + } + } + + public void 
testIndexingThrottlingWhenSubmittingMerges() { + final int maxThreadCount = randomIntBetween(1, 5); + // settings validation requires maxMergeCount >= maxThreadCount + final int maxMergeCount = maxThreadCount + randomIntBetween(0, 5); + List submittedMergeTasks = new ArrayList<>(); + AtomicBoolean isUsingMaxTargetIORate = new AtomicBoolean(false); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = mockThreadPoolMergeExecutorService( + submittedMergeTasks, + isUsingMaxTargetIORate + ); + Settings mergeSchedulerSettings = Settings.builder() + .put(MergeSchedulerConfig.MAX_THREAD_COUNT_SETTING.getKey(), maxThreadCount) + .put(MergeSchedulerConfig.MAX_MERGE_COUNT_SETTING.getKey(), maxMergeCount) + .build(); + TestThreadPoolMergeScheduler threadPoolMergeScheduler = new TestThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + IndexSettingsModule.newIndexSettings("index", mergeSchedulerSettings), + threadPoolMergeExecutorService + ); + // make sure there are more merges submitted than the max merge count limit (which triggers IO throttling) + int excessMerges = randomIntBetween(1, 10); + int mergesToSubmit = maxMergeCount + excessMerges; + boolean expectIndexThrottling = false; + int submittedMerges = 0; + // merges are submitted, while some are also scheduled (but none is run) + while (submittedMerges < mergesToSubmit - 1) { + isUsingMaxTargetIORate.set(randomBoolean()); + if (submittedMergeTasks.isEmpty() == false && randomBoolean()) { + // maybe schedule one submitted merge + MergeTask mergeTask = randomFrom(submittedMergeTasks); + submittedMergeTasks.remove(mergeTask); + mergeTask.schedule(); + } else { + // submit one merge + MergeSource mergeSource = mock(MergeSource.class); + OneMerge oneMerge = mock(OneMerge.class); + when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomLongBetween(1L, 10L))); + when(oneMerge.getMergeProgress()).thenReturn(new MergePolicy.OneMergeProgress()); + 
when(mergeSource.getNextMerge()).thenReturn(oneMerge, (OneMerge) null); + threadPoolMergeScheduler.merge(mergeSource, randomFrom(MergeTrigger.values())); + submittedMerges++; + if (isUsingMaxTargetIORate.get() && submittedMerges > maxMergeCount) { + expectIndexThrottling = true; + } else if (submittedMerges <= maxMergeCount) { + expectIndexThrottling = false; + } + } + // assert IO throttle state + assertThat(threadPoolMergeScheduler.isIndexingThrottlingEnabled(), is(expectIndexThrottling)); + } + // submit one last merge when IO throttling is at max value + isUsingMaxTargetIORate.set(true); + MergeSource mergeSource = mock(MergeSource.class); + OneMerge oneMerge = mock(OneMerge.class); + when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomLongBetween(1L, 10L))); + when(oneMerge.getMergeProgress()).thenReturn(new MergePolicy.OneMergeProgress()); + when(mergeSource.getNextMerge()).thenReturn(oneMerge, (OneMerge) null); + threadPoolMergeScheduler.merge(mergeSource, randomFrom(MergeTrigger.values())); + // assert index throttling because IO throttling is at max value + assertThat(threadPoolMergeScheduler.isIndexingThrottlingEnabled(), is(true)); + } + + public void testIndexingThrottlingWhileMergesAreRunning() { + final int maxThreadCount = randomIntBetween(1, 5); + // settings validation requires maxMergeCount >= maxThreadCount + final int maxMergeCount = maxThreadCount + randomIntBetween(0, 5); + List submittedMergeTasks = new ArrayList<>(); + List scheduledToRunMergeTasks = new ArrayList<>(); + AtomicBoolean isUsingMaxTargetIORate = new AtomicBoolean(false); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = mockThreadPoolMergeExecutorService( + submittedMergeTasks, + isUsingMaxTargetIORate + ); + Settings mergeSchedulerSettings = Settings.builder() + .put(MergeSchedulerConfig.MAX_THREAD_COUNT_SETTING.getKey(), maxThreadCount) + .put(MergeSchedulerConfig.MAX_MERGE_COUNT_SETTING.getKey(), maxMergeCount) + .build(); + 
TestThreadPoolMergeScheduler threadPoolMergeScheduler = new TestThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + IndexSettingsModule.newIndexSettings("index", mergeSchedulerSettings), + threadPoolMergeExecutorService + ); + int mergesToRun = randomIntBetween(0, 5); + // make sure there are more merges submitted and not run + int excessMerges = randomIntBetween(1, 10); + int mergesToSubmit = maxMergeCount + mergesToRun + excessMerges; + int mergesOutstanding = 0; + boolean expectIndexThrottling = false; + // merges are submitted, while some are also scheduled and run + while (mergesToSubmit > 0) { + isUsingMaxTargetIORate.set(randomBoolean()); + if (submittedMergeTasks.isEmpty() == false && randomBoolean()) { + // maybe schedule one submitted merge + MergeTask mergeTask = randomFrom(submittedMergeTasks); + submittedMergeTasks.remove(mergeTask); + Schedule schedule = mergeTask.schedule(); + if (schedule == Schedule.RUN) { + scheduledToRunMergeTasks.add(mergeTask); + } + } else { + if (mergesToRun > 0 && scheduledToRunMergeTasks.isEmpty() == false && randomBoolean()) { + // maybe run one scheduled merge + MergeTask mergeTask = randomFrom(scheduledToRunMergeTasks); + scheduledToRunMergeTasks.remove(mergeTask); + mergeTask.run(); + mergesToRun--; + mergesOutstanding--; + } else { + // submit one merge + MergeSource mergeSource = mock(MergeSource.class); + OneMerge oneMerge = mock(OneMerge.class); + when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomLongBetween(1L, 10L))); + when(oneMerge.getMergeProgress()).thenReturn(new MergePolicy.OneMergeProgress()); + when(mergeSource.getNextMerge()).thenReturn(oneMerge, (OneMerge) null); + threadPoolMergeScheduler.merge(mergeSource, randomFrom(MergeTrigger.values())); + mergesToSubmit--; + mergesOutstanding++; + } + if (isUsingMaxTargetIORate.get() && mergesOutstanding > maxMergeCount) { + expectIndexThrottling = true; + } else if (mergesOutstanding <= maxMergeCount) { + expectIndexThrottling = 
false; + } + } + // assert IO throttle state + assertThat(threadPoolMergeScheduler.isIndexingThrottlingEnabled(), is(expectIndexThrottling)); + } + // execute all remaining merges (submitted or scheduled) + while (mergesToRun > 0 || submittedMergeTasks.isEmpty() == false || scheduledToRunMergeTasks.isEmpty() == false) { + // simulate that the {@link ThreadPoolMergeExecutorService} maybe peaked IO un-throttling + isUsingMaxTargetIORate.set(randomBoolean()); + if (submittedMergeTasks.isEmpty() == false && (scheduledToRunMergeTasks.isEmpty() || randomBoolean())) { + // maybe schedule one submitted merge + MergeTask mergeTask = randomFrom(submittedMergeTasks); + submittedMergeTasks.remove(mergeTask); + Schedule schedule = mergeTask.schedule(); + if (schedule == Schedule.RUN) { + scheduledToRunMergeTasks.add(mergeTask); + } + } else { + // maybe run one scheduled merge + MergeTask mergeTask = randomFrom(scheduledToRunMergeTasks); + scheduledToRunMergeTasks.remove(mergeTask); + mergeTask.run(); + mergesToRun--; + mergesOutstanding--; + if (isUsingMaxTargetIORate.get() && mergesOutstanding > maxMergeCount) { + expectIndexThrottling = true; + } else if (mergesOutstanding <= maxMergeCount) { + expectIndexThrottling = false; + } + } + // assert IO throttle state + assertThat(threadPoolMergeScheduler.isIndexingThrottlingEnabled(), is(expectIndexThrottling)); + } + // all merges done + assertThat(threadPoolMergeScheduler.isIndexingThrottlingEnabled(), is(false)); + } + + public void testMergeSourceWithFollowUpMergesRunSequentially() throws Exception { + // test with min 2 allowed concurrent merges + int mergeExecutorThreadCount = randomIntBetween(2, 5); + Settings settings = Settings.builder() + .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) + .put(MergeSchedulerConfig.MAX_THREAD_COUNT_SETTING.getKey(), mergeExecutorThreadCount) + // disable fs available disk space feature for this test + 
.put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s") + .build(); + nodeEnvironment = newNodeEnvironment(settings); + try (TestThreadPool testThreadPool = new TestThreadPool("test", settings)) { + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = ThreadPoolMergeExecutorServiceTests + .getThreadPoolMergeExecutorService(testThreadPool, settings, nodeEnvironment); + assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); + try ( + ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + IndexSettingsModule.newIndexSettings("index", settings), + threadPoolMergeExecutorService, + merge -> 0 + ) + ) { + MergeSource mergeSource = mock(MergeSource.class); + OneMerge firstMerge = mock(OneMerge.class); + when(firstMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomLongBetween(1L, 10L))); + when(firstMerge.getMergeProgress()).thenReturn(new MergePolicy.OneMergeProgress()); + // at least one followup merge + null (i.e. 
no more followups) + int followUpMergeCount = randomIntBetween(2, 10); + OneMerge[] followUpMerges = new OneMerge[followUpMergeCount]; + followUpMerges[followUpMergeCount - 1] = null; + for (int i = 0; i < followUpMergeCount - 1; i++) { + OneMerge oneMerge = mock(OneMerge.class); + when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomLongBetween(1L, 10L))); + when(oneMerge.getMergeProgress()).thenReturn(new MergePolicy.OneMergeProgress()); + followUpMerges[i] = oneMerge; + } + // the merge source with follow-up merges + when(mergeSource.getNextMerge()).thenReturn(firstMerge, followUpMerges); + AtomicBoolean isMergeInProgress = new AtomicBoolean(); + AtomicInteger runMergeIdx = new AtomicInteger(); + Semaphore runMergeSemaphore = new Semaphore(0); + Semaphore nextMergeSemaphore = new Semaphore(0); + doAnswer(invocation -> { + // assert only one merge can be in-progress at any point-in-time + assertTrue(isMergeInProgress.compareAndSet(false, true)); + OneMerge mergeInvocation = (OneMerge) invocation.getArguments()[0]; + assertFalse(mergeInvocation.isAborted()); + // assert merges run in the order they are produced by the merge source + if (runMergeIdx.get() == 0) { + assertThat(mergeInvocation, is(firstMerge)); + } else { + assertThat(mergeInvocation, is(followUpMerges[runMergeIdx.get() - 1])); + } + runMergeIdx.incrementAndGet(); + // await before returning from the merge in order to really ensure that follow-up merges don't run concurrently + nextMergeSemaphore.release(); + runMergeSemaphore.acquire(); + assertTrue(isMergeInProgress.compareAndSet(true, false)); + return null; + }).when(mergeSource).merge(any(OneMerge.class)); + // trigger run merges on the merge source + threadPoolMergeScheduler.merge(mergeSource, randomFrom(MergeTrigger.values())); + boolean done = false; + while (done == false) { + // let merges run, but wait for the in-progress one to signal it is running + nextMergeSemaphore.acquire(); + done = runMergeIdx.get() >= 
followUpMergeCount; + runMergeSemaphore.release(); + } + assertBusy(() -> assertTrue(threadPoolMergeExecutorService.allDone())); + } + } + } + + public void testMergesRunConcurrently() throws Exception { + // min 2 allowed concurrent merges, per scheduler + int mergeSchedulerMaxThreadCount = randomIntBetween(2, 4); + // the merge executor has at least 1 extra thread available + int mergeExecutorThreadCount = mergeSchedulerMaxThreadCount + randomIntBetween(1, 3); + Settings settings = Settings.builder() + .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) + .put(MergeSchedulerConfig.MAX_THREAD_COUNT_SETTING.getKey(), mergeSchedulerMaxThreadCount) + // disable fs available disk space feature for this test + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s") + .build(); + nodeEnvironment = newNodeEnvironment(settings); + try (TestThreadPool testThreadPool = new TestThreadPool("test", settings)) { + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = ThreadPoolMergeExecutorServiceTests + .getThreadPoolMergeExecutorService(testThreadPool, settings, nodeEnvironment); + assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); + ThreadPoolExecutor threadPoolExecutor = (ThreadPoolExecutor) testThreadPool.executor(ThreadPool.Names.MERGE); + try ( + ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + IndexSettingsModule.newIndexSettings("index", settings), + threadPoolMergeExecutorService, + merge -> 0 + ) + ) { + // at least 1 extra merge than there are concurrently allowed + int mergeCount = mergeExecutorThreadCount + randomIntBetween(1, 10); + Semaphore runMergeSemaphore = new Semaphore(0); + for (int i = 0; i < mergeCount; i++) { + MergeSource mergeSource = mock(MergeSource.class); + OneMerge oneMerge = mock(OneMerge.class); + 
when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomLongBetween(1L, 10L))); + when(oneMerge.getMergeProgress()).thenReturn(new MergePolicy.OneMergeProgress()); + when(mergeSource.getNextMerge()).thenReturn(oneMerge, (OneMerge) null); + doAnswer(invocation -> { + OneMerge merge = (OneMerge) invocation.getArguments()[0]; + assertFalse(merge.isAborted()); + // wait to be signalled before completing + runMergeSemaphore.acquire(); + return null; + }).when(mergeSource).merge(any(OneMerge.class)); + threadPoolMergeScheduler.merge(mergeSource, randomFrom(MergeTrigger.values())); + } + for (int completedMergesCount = 0; completedMergesCount < mergeCount + - mergeSchedulerMaxThreadCount; completedMergesCount++) { + int finalCompletedMergesCount = completedMergesCount; + assertBusy(() -> { + // assert that there are merges running concurrently at the max allowed concurrency rate + assertThat(threadPoolMergeScheduler.getRunningMergeTasks().size(), is(mergeSchedulerMaxThreadCount)); + // with the other merges backlogged + assertThat( + threadPoolMergeScheduler.getBackloggedMergeTasks().size(), + is(mergeCount - mergeSchedulerMaxThreadCount - finalCompletedMergesCount) + ); + // also check the same for the thread-pool executor + assertThat(threadPoolMergeExecutorService.getRunningMergeTasks().size(), is(mergeSchedulerMaxThreadCount)); + // queued merge tasks do not include backlogged merges + assertThat(threadPoolMergeExecutorService.getMergeTasksQueueLength(), is(0)); + // also check thread-pool stats for the same + // there are active thread-pool threads waiting for the backlogged merge tasks to be re-enqueued + int activeMergeThreads = Math.min(mergeCount - finalCompletedMergesCount, mergeExecutorThreadCount); + assertThat(threadPoolExecutor.getActiveCount(), is(activeMergeThreads)); + assertThat(threadPoolExecutor.getQueue().size(), is(mergeCount - finalCompletedMergesCount - activeMergeThreads)); + }); + // let one merge task finish running + 
runMergeSemaphore.release(); + } + // there are now fewer merges still running than available threads + for (int remainingMergesCount = mergeSchedulerMaxThreadCount; remainingMergesCount >= 0; remainingMergesCount--) { + int finalRemainingMergesCount = remainingMergesCount; + assertBusy(() -> { + // there are fewer available merges than available threads + assertThat(threadPoolMergeScheduler.getRunningMergeTasks().size(), is(finalRemainingMergesCount)); + // no more backlogged merges + assertThat(threadPoolMergeScheduler.getBackloggedMergeTasks().size(), is(0)); + // also check thread-pool executor for the same + assertThat(threadPoolMergeExecutorService.getRunningMergeTasks().size(), is(finalRemainingMergesCount)); + // no more backlogged merges + assertThat(threadPoolMergeExecutorService.getMergeTasksQueueLength(), is(0)); + // also check thread-pool stats for the same + assertThat(threadPoolExecutor.getActiveCount(), is(finalRemainingMergesCount)); + assertThat(threadPoolExecutor.getQueue().size(), is(0)); + }); + // let one merge task finish running + runMergeSemaphore.release(); + } + assertBusy(() -> assertTrue(threadPoolMergeExecutorService.allDone())); + } + } + } + + public void testSchedulerCloseWaitsForRunningMerge() throws Exception { + int mergeSchedulerMaxThreadCount = randomIntBetween(1, 3); + int mergeExecutorThreadCount = randomIntBetween(1, 3); + Settings settings = Settings.builder() + .put(EsExecutors.NODE_PROCESSORS_SETTING.getKey(), mergeExecutorThreadCount) + .put(MergeSchedulerConfig.MAX_THREAD_COUNT_SETTING.getKey(), mergeSchedulerMaxThreadCount) + // disable fs available disk space feature for this test + .put(ThreadPoolMergeExecutorService.INDICES_MERGE_DISK_CHECK_INTERVAL_SETTING.getKey(), "0s") + .build(); + nodeEnvironment = newNodeEnvironment(settings); + try (TestThreadPool testThreadPool = new TestThreadPool("test", settings)) { + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = ThreadPoolMergeExecutorServiceTests + 
.getThreadPoolMergeExecutorService(testThreadPool, settings, nodeEnvironment); + assertThat(threadPoolMergeExecutorService.getMaxConcurrentMerges(), equalTo(mergeExecutorThreadCount)); + try ( + ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + IndexSettingsModule.newIndexSettings("index", settings), + threadPoolMergeExecutorService, + merge -> 0 + ) + ) { + CountDownLatch mergeDoneLatch = new CountDownLatch(1); + CountDownLatch mergeRunningLatch = new CountDownLatch(1); + MergeSource mergeSource = mock(MergeSource.class); + OneMerge oneMerge = mock(OneMerge.class); + when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomLongBetween(1L, 10L))); + when(oneMerge.getMergeProgress()).thenReturn(new MergePolicy.OneMergeProgress()); + when(mergeSource.getNextMerge()).thenReturn(oneMerge, (OneMerge) null); + doAnswer(invocation -> { + mergeRunningLatch.countDown(); + OneMerge merge = (OneMerge) invocation.getArguments()[0]; + assertFalse(merge.isAborted()); + // wait to be signalled before completing the merge + mergeDoneLatch.await(); + return null; + }).when(mergeSource).merge(any(OneMerge.class)); + // submit the merge + threadPoolMergeScheduler.merge(mergeSource, randomFrom(MergeTrigger.values())); + Thread t = new Thread(() -> { + try { + threadPoolMergeScheduler.close(); + } catch (IOException e) { + fail(e); + } + }); + t.start(); + try { + assertTrue(t.isAlive()); + // wait for the merge to actually run + mergeRunningLatch.await(); + // ensure the merge scheduler is effectively "closed" + assertBusy(() -> { + MergeSource mergeSource2 = mock(MergeSource.class); + threadPoolMergeScheduler.merge(mergeSource2, randomFrom(MergeTrigger.values())); + // when the merge scheduler is closed it won't pull in any new merges from the merge source + verifyNoInteractions(mergeSource2); + }); + // assert the merge still shows up as "running" + 
assertThat(threadPoolMergeScheduler.getRunningMergeTasks().keySet(), contains(oneMerge)); + assertThat(threadPoolMergeScheduler.getBackloggedMergeTasks().size(), is(0)); + assertTrue(t.isAlive()); + // signal the merge to finish + mergeDoneLatch.countDown(); + } finally { + t.join(); + } + assertBusy(() -> { + assertThat(threadPoolMergeScheduler.getRunningMergeTasks().size(), is(0)); + assertThat(threadPoolMergeScheduler.getBackloggedMergeTasks().size(), is(0)); + assertTrue(threadPoolMergeExecutorService.allDone()); + }); + } + } + } + + public void testAutoIOThrottleForMergeTasksWhenSchedulerDisablesIt() throws Exception { + // merge scheduler configured with auto IO throttle disabled + Settings settings = Settings.builder().put(MergeSchedulerConfig.AUTO_THROTTLE_SETTING.getKey(), false).build(); + IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("index", settings); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = mock(ThreadPoolMergeExecutorService.class); + MergePolicy.OneMergeProgress oneMergeProgress = new MergePolicy.OneMergeProgress(); + OneMerge oneMerge = mock(OneMerge.class); + when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomNonNegativeLong())); + when(oneMerge.getMergeProgress()).thenReturn(oneMergeProgress); + MergeSource mergeSource = mock(MergeSource.class); + when(mergeSource.getNextMerge()).thenReturn(oneMerge); + try ( + ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + indexSettings, + threadPoolMergeExecutorService, + merge -> 0 + ) + ) { + threadPoolMergeScheduler.merge(mergeSource, randomFrom(MergeTrigger.values())); + var submittedMergeTaskCaptor = ArgumentCaptor.forClass(MergeTask.class); + verify(threadPoolMergeExecutorService).submitMergeTask(submittedMergeTaskCaptor.capture()); + assertFalse(submittedMergeTaskCaptor.getValue().supportsIOThrottling()); + } + } + + public void testAutoIOThrottleForMergeTasks() throws 
Exception { + final Settings.Builder settingsBuilder = Settings.builder(); + // merge scheduler configured with auto IO throttle enabled + if (randomBoolean()) { + settingsBuilder.put(MergeSchedulerConfig.AUTO_THROTTLE_SETTING.getKey(), true); + } + IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("index", settingsBuilder.build()); + MergePolicy.OneMergeProgress oneMergeProgress = new MergePolicy.OneMergeProgress(); + OneMerge oneMerge = mock(OneMerge.class); + // forced merge with a set number of segments + when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomNonNegativeLong(), randomNonNegativeInt())); + when(oneMerge.getMergeProgress()).thenReturn(oneMergeProgress); + MergeSource mergeSource = mock(MergeSource.class); + when(mergeSource.getNextMerge()).thenReturn(oneMerge); + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = mock(ThreadPoolMergeExecutorService.class); + try ( + ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + indexSettings, + threadPoolMergeExecutorService, + merge -> 0 + ) + ) { + threadPoolMergeScheduler.merge(mergeSource, randomFrom(MergeTrigger.values())); + var submittedMergeTaskCaptor = ArgumentCaptor.forClass(MergeTask.class); + verify(threadPoolMergeExecutorService).submitMergeTask(submittedMergeTaskCaptor.capture()); + // forced merge tasks should not be IO throttled + assertFalse(submittedMergeTaskCaptor.getValue().supportsIOThrottling()); + } + // NOT a forced merge + when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomNonNegativeLong(), -1)); + threadPoolMergeExecutorService = mock(ThreadPoolMergeExecutorService.class); + try ( + ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + indexSettings, + threadPoolMergeExecutorService, + merge -> 0 + ) + ) { + // merge submitted upon closing + threadPoolMergeScheduler.merge(mergeSource, 
MergeTrigger.CLOSING); + var submittedMergeTaskCaptor = ArgumentCaptor.forClass(MergeTask.class); + verify(threadPoolMergeExecutorService).submitMergeTask(submittedMergeTaskCaptor.capture()); + // merge tasks submitted when closing should not be IO throttled + assertFalse(submittedMergeTaskCaptor.getValue().supportsIOThrottling()); + } + // otherwise, merge tasks should be auto IO throttled + threadPoolMergeExecutorService = mock(ThreadPoolMergeExecutorService.class); + try ( + ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + indexSettings, + threadPoolMergeExecutorService, + merge -> 0 + ) + ) { + // merge submitted upon closing + threadPoolMergeScheduler.merge( + mergeSource, + randomValueOtherThan(MergeTrigger.CLOSING, () -> randomFrom(MergeTrigger.values())) + ); + var submittedMergeTaskCaptor = ArgumentCaptor.forClass(MergeTask.class); + verify(threadPoolMergeExecutorService).submitMergeTask(submittedMergeTaskCaptor.capture()); + // merge tasks should be auto IO throttled + assertTrue(submittedMergeTaskCaptor.getValue().supportsIOThrottling()); + } + } + + public void testMergeSchedulerAbortsMergeWhenShouldSkipMergeIsTrue() { + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = mock(ThreadPoolMergeExecutorService.class); + // build a scheduler that always returns true for shouldSkipMerge + ThreadPoolMergeScheduler threadPoolMergeScheduler = new ThreadPoolMergeScheduler( + new ShardId("index", "_na_", 1), + IndexSettingsModule.newIndexSettings("index", Settings.builder().build()), + threadPoolMergeExecutorService, + merge -> 0 + ) { + @Override + protected boolean shouldSkipMerge() { + return true; + } + }; + MergeSource mergeSource = mock(MergeSource.class); + OneMerge oneMerge = mock(OneMerge.class); + when(oneMerge.getStoreMergeInfo()).thenReturn(getNewMergeInfo(randomLongBetween(1L, 10L))); + when(oneMerge.getMergeProgress()).thenReturn(new MergePolicy.OneMergeProgress()); + 
when(mergeSource.getNextMerge()).thenReturn(oneMerge, (OneMerge) null); + MergeTask mergeTask = threadPoolMergeScheduler.newMergeTask(mergeSource, oneMerge, randomFrom(MergeTrigger.values())); + // verify that calling schedule on the merge task indicates the merge should be aborted + Schedule schedule = threadPoolMergeScheduler.schedule(mergeTask); + assertThat(schedule, is(Schedule.ABORT)); + } + + private static MergeInfo getNewMergeInfo(long estimatedMergeBytes) { + return getNewMergeInfo(estimatedMergeBytes, randomFrom(-1, randomNonNegativeInt())); + } + + private static MergeInfo getNewMergeInfo(long estimatedMergeBytes, int maxNumSegments) { + return new MergeInfo(randomNonNegativeInt(), estimatedMergeBytes, randomBoolean(), maxNumSegments); + } + + static class TestThreadPoolMergeScheduler extends ThreadPoolMergeScheduler { + AtomicBoolean isIndexingThrottlingEnabled = new AtomicBoolean(false); + + TestThreadPoolMergeScheduler( + ShardId shardId, + IndexSettings indexSettings, + ThreadPoolMergeExecutorService threadPoolMergeExecutorService + ) { + super(shardId, indexSettings, threadPoolMergeExecutorService, merge -> 0); + } + + @Override + protected void enableIndexingThrottling(int numRunningMerges, int numQueuedMerges, int configuredMaxMergeCount) { + isIndexingThrottlingEnabled.set(true); + } + + @Override + protected void disableIndexingThrottling(int numRunningMerges, int numQueuedMerges, int configuredMaxMergeCount) { + isIndexingThrottlingEnabled.set(false); + } + + boolean isIndexingThrottlingEnabled() { + return isIndexingThrottlingEnabled.get(); + } + } + + static ThreadPoolMergeExecutorService mockThreadPoolMergeExecutorService( + List submittedMergeTasks, + AtomicBoolean isUsingMaxTargetIORate + ) { + ThreadPoolMergeExecutorService threadPoolMergeExecutorService = mock(ThreadPoolMergeExecutorService.class); + doAnswer(invocation -> { + MergeTask mergeTask = (MergeTask) invocation.getArguments()[0]; + submittedMergeTasks.add(mergeTask); + return 
null; + }).when(threadPoolMergeExecutorService).submitMergeTask(any(MergeTask.class)); + doAnswer(invocation -> { + MergeTask mergeTask = (MergeTask) invocation.getArguments()[0]; + submittedMergeTasks.add(mergeTask); + return null; + }).when(threadPoolMergeExecutorService).reEnqueueBackloggedMergeTask(any(MergeTask.class)); + doAnswer(invocation -> isUsingMaxTargetIORate.get()).when(threadPoolMergeExecutorService).usingMaxTargetIORateBytesPerSec(); + return threadPoolMergeExecutorService; + } +} diff --git a/server/src/test/java/org/elasticsearch/index/shard/IndexShardRetentionLeaseTests.java b/server/src/test/java/org/elasticsearch/index/shard/IndexShardRetentionLeaseTests.java index 8c325c945a7a2..38d89f08378bd 100644 --- a/server/src/test/java/org/elasticsearch/index/shard/IndexShardRetentionLeaseTests.java +++ b/server/src/test/java/org/elasticsearch/index/shard/IndexShardRetentionLeaseTests.java @@ -41,8 +41,8 @@ public class IndexShardRetentionLeaseTests extends IndexShardTestCase { private final AtomicLong currentTimeMillis = new AtomicLong(); @Override - protected ThreadPool setUpThreadPool() { - return new TestThreadPool(getClass().getName(), threadPoolSettings()) { + protected ThreadPool setUpThreadPool(Settings settings) { + return new TestThreadPool(getClass().getName(), settings) { @Override public long absoluteTimeInMillis() { return currentTimeMillis.get(); diff --git a/server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java b/server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java index 4549a329d499a..43f5d58ee8e0c 100644 --- a/server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java +++ b/server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java @@ -5019,6 +5019,7 @@ public void testCloseShardWhileEngineIsWarming() throws Exception { EngineConfig configWithWarmer = new EngineConfig( config.getShardId(), config.getThreadPool(), + config.getThreadPoolMergeExecutorService(), 
config.getIndexSettings(), warmer, config.getStore(), diff --git a/server/src/test/java/org/elasticsearch/index/shard/RefreshListenersTests.java b/server/src/test/java/org/elasticsearch/index/shard/RefreshListenersTests.java index ca616dc619ec9..843c7f3f58852 100644 --- a/server/src/test/java/org/elasticsearch/index/shard/RefreshListenersTests.java +++ b/server/src/test/java/org/elasticsearch/index/shard/RefreshListenersTests.java @@ -24,6 +24,7 @@ import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.lucene.uid.Versions; import org.elasticsearch.common.metrics.MeanMetric; +import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.util.BigArrays; import org.elasticsearch.common.util.concurrent.EsExecutors; @@ -33,6 +34,7 @@ import org.elasticsearch.core.Releasable; import org.elasticsearch.core.Strings; import org.elasticsearch.core.TimeValue; +import org.elasticsearch.env.NodeEnvironment; import org.elasticsearch.index.Index; import org.elasticsearch.index.IndexModule; import org.elasticsearch.index.IndexSettings; @@ -41,6 +43,8 @@ import org.elasticsearch.index.engine.EngineConfig; import org.elasticsearch.index.engine.EngineTestCase; import org.elasticsearch.index.engine.InternalEngine; +import org.elasticsearch.index.engine.ThreadPoolMergeExecutorService; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler; import org.elasticsearch.index.mapper.IdFieldMapper; import org.elasticsearch.index.mapper.LuceneDocument; import org.elasticsearch.index.mapper.MapperService; @@ -89,6 +93,8 @@ public class RefreshListenersTests extends ESTestCase { private Engine engine; private volatile int maxListeners; private ThreadPool threadPool; + private NodeEnvironment nodeEnvironment; + private ThreadPoolMergeExecutorService threadPoolMergeExecutorService; private Store store; @Before @@ -97,6 +103,16 @@ public void setupListeners() throws Exception { 
maxListeners = randomIntBetween(2, 1000); // Now setup the InternalEngine which is much more complicated because we aren't mocking anything threadPool = new TestThreadPool(getTestName()); + Settings settings = Settings.builder() + .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), randomBoolean()) + .build(); + IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("index", settings); + nodeEnvironment = newNodeEnvironment(settings); + threadPoolMergeExecutorService = ThreadPoolMergeExecutorService.maybeCreateThreadPoolMergeExecutorService( + threadPool, + ClusterSettings.createBuiltInClusterSettings(settings), + nodeEnvironment + ); listeners = new RefreshListeners( () -> maxListeners, () -> engine.refresh("too-many-listeners"), @@ -105,7 +121,6 @@ public void setupListeners() throws Exception { new MeanMetric() ); - IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("index", Settings.EMPTY); ShardId shardId = new ShardId(new Index("index", "_na_"), 1); Directory directory = newDirectory(); store = new Store(shardId, indexSettings, directory, new DummyShardLock(shardId)); @@ -134,6 +149,7 @@ public void onFailedEngine(String reason, @Nullable Exception e) { EngineConfig config = new EngineConfig( shardId, threadPool, + threadPoolMergeExecutorService, indexSettings, null, store, @@ -169,8 +185,7 @@ public void onFailedEngine(String reason, @Nullable Exception e) { @After public void tearDownListeners() throws Exception { - IOUtils.close(engine, store); - terminate(threadPool); + IOUtils.close(engine, store, nodeEnvironment, () -> terminate(threadPool)); } public void testBeforeRefresh() throws Exception { diff --git a/server/src/test/java/org/elasticsearch/threadpool/ScalingThreadPoolTests.java b/server/src/test/java/org/elasticsearch/threadpool/ScalingThreadPoolTests.java index d0b4f90948a4e..aaffcb4085f12 100644 --- a/server/src/test/java/org/elasticsearch/threadpool/ScalingThreadPoolTests.java +++ 
b/server/src/test/java/org/elasticsearch/threadpool/ScalingThreadPoolTests.java @@ -119,6 +119,7 @@ private int expectedSize(final String threadPoolName, final int numberOfProcesso sizes.put(ThreadPool.Names.SNAPSHOT_META, n -> Math.min(n * 3, 50)); sizes.put(ThreadPool.Names.FETCH_SHARD_STARTED, ThreadPool::twiceAllocatedProcessors); sizes.put(ThreadPool.Names.FETCH_SHARD_STORE, ThreadPool::twiceAllocatedProcessors); + sizes.put(ThreadPool.Names.MERGE, Function.identity()); return sizes.get(threadPoolName).apply(numberOfProcessors); } diff --git a/test/framework/src/main/java/org/elasticsearch/index/engine/EngineTestCase.java b/test/framework/src/main/java/org/elasticsearch/index/engine/EngineTestCase.java index 7a2f375001874..574d3ac47daa9 100644 --- a/test/framework/src/main/java/org/elasticsearch/index/engine/EngineTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/index/engine/EngineTestCase.java @@ -61,6 +61,7 @@ import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.lucene.uid.Versions; +import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.unit.ByteSizeValue; import org.elasticsearch.common.util.BigArrays; @@ -68,6 +69,7 @@ import org.elasticsearch.core.IOUtils; import org.elasticsearch.core.Nullable; import org.elasticsearch.core.TimeValue; +import org.elasticsearch.env.NodeEnvironment; import org.elasticsearch.index.Index; import org.elasticsearch.index.IndexModule; import org.elasticsearch.index.IndexSettings; @@ -155,6 +157,8 @@ public abstract class EngineTestCase extends ESTestCase { protected static final IndexSettings INDEX_SETTINGS = IndexSettingsModule.newIndexSettings("index", Settings.EMPTY); protected ThreadPool threadPool; + protected NodeEnvironment nodeEnvironment; + protected ThreadPoolMergeExecutorService threadPoolMergeExecutorService; protected TranslogHandler 
translogHandler; protected Store store; @@ -197,6 +201,7 @@ protected Settings indexSettings() { between(10, 10 * IndexSettings.MAX_REFRESH_LISTENERS_PER_SHARD.get(Settings.EMPTY)) ) .put(IndexSettings.INDEX_SOFT_DELETES_RETENTION_OPERATIONS_SETTING.getKey(), between(0, 1000)) + .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), randomBoolean()) .build(); } @@ -241,6 +246,13 @@ public void setUp() throws Exception { } defaultSettings = IndexSettingsModule.newIndexSettings("index", indexSettings()); threadPool = new TestThreadPool(getClass().getName()); + nodeEnvironment = newNodeEnvironment(defaultSettings.getNodeSettings()); + threadPoolMergeExecutorService = ThreadPoolMergeExecutorService.maybeCreateThreadPoolMergeExecutorService( + threadPool, + ClusterSettings.createBuiltInClusterSettings(defaultSettings.getNodeSettings()), + nodeEnvironment + ); + store = createStore(); storeReplica = createStore(); Lucene.cleanLuceneIndex(store.directory()); @@ -272,6 +284,7 @@ public static EngineConfig copy(EngineConfig config, LongSupplier globalCheckpoi return new EngineConfig( config.getShardId(), config.getThreadPool(), + config.getThreadPoolMergeExecutorService(), config.getIndexSettings(), config.getWarmer(), config.getStore(), @@ -304,6 +317,7 @@ public EngineConfig copy(EngineConfig config, Analyzer analyzer) { return new EngineConfig( config.getShardId(), config.getThreadPool(), + config.getThreadPoolMergeExecutorService(), config.getIndexSettings(), config.getWarmer(), config.getStore(), @@ -336,6 +350,7 @@ public EngineConfig copy(EngineConfig config, MergePolicy mergePolicy) { return new EngineConfig( config.getShardId(), config.getThreadPool(), + config.getThreadPoolMergeExecutorService(), config.getIndexSettings(), config.getWarmer(), config.getStore(), @@ -384,7 +399,7 @@ public void tearDown() throws Exception { assertAtMostOneLuceneDocumentPerSequenceNumber(replicaEngine); } } finally { - IOUtils.close(replicaEngine, 
storeReplica, engine, store, () -> terminate(threadPool)); + IOUtils.close(replicaEngine, storeReplica, engine, store, () -> terminate(threadPool), nodeEnvironment); } } @@ -840,6 +855,7 @@ public EngineConfig config( return new EngineConfig( shardId, threadPool, + threadPoolMergeExecutorService, indexSettings, null, store, @@ -880,6 +896,7 @@ protected EngineConfig config(EngineConfig config, Store store, Path translogPat return new EngineConfig( config.getShardId(), config.getThreadPool(), + config.getThreadPoolMergeExecutorService(), indexSettings, config.getWarmer(), store, diff --git a/test/framework/src/main/java/org/elasticsearch/index/shard/IndexShardTestCase.java b/test/framework/src/main/java/org/elasticsearch/index/shard/IndexShardTestCase.java index 2ae4bb0343101..65c3e01cd6c87 100644 --- a/test/framework/src/main/java/org/elasticsearch/index/shard/IndexShardTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/index/shard/IndexShardTestCase.java @@ -54,6 +54,8 @@ import org.elasticsearch.index.engine.EngineFactory; import org.elasticsearch.index.engine.EngineTestCase; import org.elasticsearch.index.engine.InternalEngineFactory; +import org.elasticsearch.index.engine.ThreadPoolMergeExecutorService; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler; import org.elasticsearch.index.mapper.MapperMetrics; import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.SourceToParse; @@ -152,6 +154,8 @@ public void onRecoveryFailure(RecoveryFailedException e, boolean sendShardFailur }; protected ThreadPool threadPool; + protected NodeEnvironment nodeEnvironment; + protected ThreadPoolMergeExecutorService threadPoolMergeExecutorService; protected Executor writeExecutor; protected long primaryTerm; @@ -166,20 +170,27 @@ public static void addMockCloseImplementation(IndexShard shard) throws IOExcepti @Override public void setUp() throws Exception { super.setUp(); - threadPool = setUpThreadPool(); + Settings 
settings = threadPoolSettings(); + threadPool = setUpThreadPool(settings); + nodeEnvironment = newNodeEnvironment(settings); + threadPoolMergeExecutorService = ThreadPoolMergeExecutorService.maybeCreateThreadPoolMergeExecutorService( + threadPool, + ClusterSettings.createBuiltInClusterSettings(settings), + nodeEnvironment + ); writeExecutor = threadPool.executor(ThreadPool.Names.WRITE); primaryTerm = randomIntBetween(1, 100); // use random but fixed term for creating shards failOnShardFailures(); } - protected ThreadPool setUpThreadPool() { - return new TestThreadPool(getClass().getName(), threadPoolSettings()); + protected ThreadPool setUpThreadPool(Settings settings) { + return new TestThreadPool(getClass().getName(), settings); } @Override public void tearDown() throws Exception { try { - tearDownThreadPool(); + IOUtils.close(nodeEnvironment, this::tearDownThreadPool); } finally { super.tearDown(); } @@ -203,7 +214,7 @@ protected void failOnShardFailures() { } public Settings threadPoolSettings() { - return Settings.EMPTY; + return Settings.builder().put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), randomBoolean()).build(); } protected Store createStore(IndexSettings indexSettings, ShardPath shardPath) throws IOException { @@ -537,6 +548,7 @@ protected IndexShard newShard( indexEventListener, indexReaderWrapper, threadPool, + threadPoolMergeExecutorService, BigArrays.NON_RECYCLING_INSTANCE, warmer, Collections.emptyList(), diff --git a/test/framework/src/main/java/org/elasticsearch/test/ESIntegTestCase.java b/test/framework/src/main/java/org/elasticsearch/test/ESIntegTestCase.java index 0c34b0fddc5c8..9be119a4cbf35 100644 --- a/test/framework/src/main/java/org/elasticsearch/test/ESIntegTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/test/ESIntegTestCase.java @@ -131,6 +131,7 @@ import org.elasticsearch.index.MockEngineFactoryPlugin; import org.elasticsearch.index.codec.CodecService; import 
org.elasticsearch.index.engine.Segment; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler; import org.elasticsearch.index.mapper.MockFieldFilterPlugin; import org.elasticsearch.index.translog.Translog; import org.elasticsearch.indices.IndicesQueryCache; @@ -1608,12 +1609,39 @@ protected final BroadcastResponse flush(String... indices) { * Waits for all relocations and force merge all indices in the cluster to 1 segment. */ protected BroadcastResponse forceMerge() { + return forceMerge(randomBoolean()); + } + + /** + * Waits for all relocations and force merge all indices in the cluster to 1 segment. + */ + protected BroadcastResponse forceMerge(boolean assertOneSegment) { waitForRelocation(); BroadcastResponse actionGet = indicesAdmin().prepareForceMerge().setMaxNumSegments(1).get(); assertNoFailures(actionGet); + if (assertOneSegment) { + // after a force merge there should only be 1 segment per shard + var shardsWithMultipleSegments = getShardSegments().stream() + .filter(shardSegments -> shardSegments.getSegments().size() > 1) + .toList(); + assertTrue("there are shards with multiple segments " + shardsWithMultipleSegments, shardsWithMultipleSegments.isEmpty()); + } return actionGet; } + /** + * Returns the segments of the shards of the indices. + */ + protected List getShardSegments(String... 
indices) { + IndicesSegmentResponse indicesSegmentResponse = indicesAdmin().prepareSegments(indices).get(); + return indicesSegmentResponse.getIndices() + .values() + .stream() + .flatMap(indexSegments -> indexSegments.getShards().values().stream()) + .flatMap(indexShardSegments -> Stream.of(indexShardSegments.shards())) + .toList(); + } + /** * Returns true iff the given index exists otherwise false */ @@ -2057,6 +2085,7 @@ protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) { builder.put(IndexingPressure.SPLIT_BULK_HIGH_WATERMARK.getKey(), randomFrom("1KB", "16KB", "64KB")); builder.put(IndexingPressure.SPLIT_BULK_HIGH_WATERMARK_SIZE.getKey(), "256B"); } + builder.put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), randomBoolean()); return builder.build(); } diff --git a/test/framework/src/main/java/org/elasticsearch/test/InternalTestCluster.java b/test/framework/src/main/java/org/elasticsearch/test/InternalTestCluster.java index ae79636c6b14c..07467fa69e7bf 100644 --- a/test/framework/src/main/java/org/elasticsearch/test/InternalTestCluster.java +++ b/test/framework/src/main/java/org/elasticsearch/test/InternalTestCluster.java @@ -1418,6 +1418,20 @@ public void assertConsistentHistoryBetweenTranslogAndLuceneIndex() throws IOExce } } + public void assertMergeExecutorIsDone() throws Exception { + assertBusy(() -> { + for (String nodeName : getNodeNames()) { + IndicesService indicesService = getInstance(IndicesService.class, nodeName); + if (indicesService.getThreadPoolMergeExecutorService() != null) { + assertTrue( + "thread pool merge executor is not done after test", + indicesService.getThreadPoolMergeExecutorService().allDone() + ); + } + } + }); + } + public void assertNoInFlightDocsInEngine() throws Exception { assertBusy(() -> { for (String nodeName : getNodeNames()) { @@ -2526,6 +2540,7 @@ public synchronized void assertAfterTest() throws Exception { assertRequestsFinished(); assertSearchContextsReleased(); 
assertNoInFlightDocsInEngine(); + assertMergeExecutorIsDone(); awaitIndexShardCloseAsyncTasks(); for (NodeAndClient nodeAndClient : nodes.values()) { NodeEnvironment env = nodeAndClient.node().getNodeEnvironment(); diff --git a/x-pack/plugin/ccr/src/test/java/org/elasticsearch/xpack/ccr/index/engine/FollowingEngineTests.java b/x-pack/plugin/ccr/src/test/java/org/elasticsearch/xpack/ccr/index/engine/FollowingEngineTests.java index 62dc3313a1172..a2cb7b85e880f 100644 --- a/x-pack/plugin/ccr/src/test/java/org/elasticsearch/xpack/ccr/index/engine/FollowingEngineTests.java +++ b/x-pack/plugin/ccr/src/test/java/org/elasticsearch/xpack/ccr/index/engine/FollowingEngineTests.java @@ -18,10 +18,13 @@ import org.elasticsearch.common.Strings; import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.compress.CompressedXContent; +import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.unit.ByteSizeValue; import org.elasticsearch.common.util.BigArrays; +import org.elasticsearch.core.IOUtils; import org.elasticsearch.core.TimeValue; +import org.elasticsearch.env.NodeEnvironment; import org.elasticsearch.index.Index; import org.elasticsearch.index.IndexMode; import org.elasticsearch.index.IndexModule; @@ -34,6 +37,8 @@ import org.elasticsearch.index.engine.EngineConfig; import org.elasticsearch.index.engine.EngineTestCase; import org.elasticsearch.index.engine.InternalEngine; +import org.elasticsearch.index.engine.ThreadPoolMergeExecutorService; +import org.elasticsearch.index.engine.ThreadPoolMergeScheduler; import org.elasticsearch.index.engine.TranslogHandler; import org.elasticsearch.index.mapper.DocumentMapper; import org.elasticsearch.index.mapper.MapperService; @@ -82,6 +87,8 @@ public class FollowingEngineTests extends ESTestCase { private ThreadPool threadPool; + private NodeEnvironment nodeEnvironment; + private ThreadPoolMergeExecutorService 
threadPoolMergeExecutorService; private Index index; private ShardId shardId; private AtomicLong primaryTerm = new AtomicLong(); @@ -91,7 +98,16 @@ public class FollowingEngineTests extends ESTestCase { @Override public void setUp() throws Exception { super.setUp(); - threadPool = new TestThreadPool("following-engine-tests"); + Settings settings = Settings.builder() + .put(ThreadPoolMergeScheduler.USE_THREAD_POOL_MERGE_SCHEDULER_SETTING.getKey(), randomBoolean()) + .build(); + threadPool = new TestThreadPool("following-engine-tests", settings); + nodeEnvironment = newNodeEnvironment(settings); + threadPoolMergeExecutorService = ThreadPoolMergeExecutorService.maybeCreateThreadPoolMergeExecutorService( + threadPool, + ClusterSettings.createBuiltInClusterSettings(settings), + nodeEnvironment + ); index = new Index("index", "uuid"); shardId = new ShardId(index, 0); primaryTerm.set(randomLongBetween(1, Long.MAX_VALUE)); @@ -100,7 +116,7 @@ public void setUp() throws Exception { @Override public void tearDown() throws Exception { - terminate(threadPool); + IOUtils.close(nodeEnvironment, () -> terminate(threadPool)); super.tearDown(); } @@ -113,7 +129,7 @@ public void testFollowingEngineRejectsNonFollowingIndex() throws IOException { final IndexMetadata indexMetadata = IndexMetadata.builder(index.getName()).settings(settings).build(); final IndexSettings indexSettings = new IndexSettings(indexMetadata, settings); try (Store store = createStore(shardId, indexSettings, newDirectory())) { - final EngineConfig engineConfig = engineConfig(shardId, indexSettings, threadPool, store); + final EngineConfig engineConfig = engineConfig(shardId, indexSettings, threadPool, threadPoolMergeExecutorService, store); final IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> new FollowingEngine(engineConfig)); assertThat(e, hasToString(containsString("a following engine can not be constructed for a non-following index"))); } @@ -137,7 +153,7 @@ public void 
testOutOfOrderDocuments() throws IOException { final IndexMetadata indexMetadata = IndexMetadata.builder(index.getName()).settings(settings).build(); final IndexSettings indexSettings = new IndexSettings(indexMetadata, settings); try (Store store = createStore(shardId, indexSettings, newDirectory())) { - final EngineConfig engineConfig = engineConfig(shardId, indexSettings, threadPool, store); + final EngineConfig engineConfig = engineConfig(shardId, indexSettings, threadPool, threadPoolMergeExecutorService, store); try (FollowingEngine followingEngine = createEngine(store, engineConfig)) { final VersionType versionType = randomFrom(VersionType.INTERNAL, VersionType.EXTERNAL, VersionType.EXTERNAL_GTE); final List ops = EngineTestCase.generateSingleDocHistory(true, versionType, 2, 2, 20, "id"); @@ -156,7 +172,7 @@ public void runIndexTest( final IndexMetadata indexMetadata = IndexMetadata.builder(index.getName()).settings(settings).build(); final IndexSettings indexSettings = new IndexSettings(indexMetadata, settings); try (Store store = createStore(shardId, indexSettings, newDirectory())) { - final EngineConfig engineConfig = engineConfig(shardId, indexSettings, threadPool, store); + final EngineConfig engineConfig = engineConfig(shardId, indexSettings, threadPool, threadPoolMergeExecutorService, store); try (FollowingEngine followingEngine = createEngine(store, engineConfig)) { final Engine.Index indexToTest = indexForFollowing("id", seqNo, origin); consumer.accept(followingEngine, indexToTest); @@ -182,7 +198,7 @@ public void runDeleteTest( final IndexMetadata indexMetadata = IndexMetadata.builder(index.getName()).settings(settings).build(); final IndexSettings indexSettings = new IndexSettings(indexMetadata, settings); try (Store store = createStore(shardId, indexSettings, newDirectory())) { - final EngineConfig engineConfig = engineConfig(shardId, indexSettings, threadPool, store); + final EngineConfig engineConfig = engineConfig(shardId, indexSettings, 
threadPool, threadPoolMergeExecutorService, store); try (FollowingEngine followingEngine = createEngine(store, engineConfig)) { final String id = "id"; final Engine.Delete delete = new Engine.Delete( @@ -208,7 +224,7 @@ public void testDoNotFillSeqNoGaps() throws Exception { final IndexMetadata indexMetadata = IndexMetadata.builder(index.getName()).settings(settings).build(); final IndexSettings indexSettings = new IndexSettings(indexMetadata, settings); try (Store store = createStore(shardId, indexSettings, newDirectory())) { - final EngineConfig engineConfig = engineConfig(shardId, indexSettings, threadPool, store); + final EngineConfig engineConfig = engineConfig(shardId, indexSettings, threadPool, threadPoolMergeExecutorService, store); try (FollowingEngine followingEngine = createEngine(store, engineConfig)) { followingEngine.index(indexForFollowing("id", 128, Engine.Operation.Origin.PRIMARY)); int addedNoops = followingEngine.fillSeqNoGaps(primaryTerm.get()); @@ -221,6 +237,7 @@ private EngineConfig engineConfig( final ShardId shardIdValue, final IndexSettings indexSettings, final ThreadPool threadPool, + final ThreadPoolMergeExecutorService threadPoolMergeExecutorService, final Store store ) throws IOException { final IndexWriterConfig indexWriterConfig = newIndexWriterConfig(); @@ -235,6 +252,7 @@ private EngineConfig engineConfig( return new EngineConfig( shardIdValue, threadPool, + threadPoolMergeExecutorService, indexSettings, null, store, @@ -506,7 +524,13 @@ public void testConcurrentIndexOperationsWithDeletesCanAdvanceMaxSeqNoOfUpdates( IndexMetadata followerIndexMetadata = IndexMetadata.builder(index.getName()).settings(followerSettings).build(); IndexSettings followerIndexSettings = new IndexSettings(followerIndexMetadata, Settings.EMPTY); try (Store followerStore = createStore(shardId, followerIndexSettings, newDirectory())) { - EngineConfig followerConfig = engineConfig(shardId, followerIndexSettings, threadPool, followerStore); + EngineConfig 
followerConfig = engineConfig( + shardId, + followerIndexSettings, + threadPool, + threadPoolMergeExecutorService, + followerStore + ); followerStore.createEmpty(); String translogUuid = Translog.createEmptyTranslog( followerConfig.getTranslogConfig().getTranslogPath(), @@ -613,7 +637,7 @@ private void runFollowTest(CheckedBiConsumer operationWithTerms = new HashMap<>(); @@ -882,7 +912,7 @@ public void testMaxSeqNoInCommitUserData() throws Exception { final IndexMetadata indexMetadata = IndexMetadata.builder(index.getName()).settings(settings).build(); final IndexSettings indexSettings = new IndexSettings(indexMetadata, settings); try (Store store = createStore(shardId, indexSettings, newDirectory())) { - final EngineConfig engineConfig = engineConfig(shardId, indexSettings, threadPool, store); + final EngineConfig engineConfig = engineConfig(shardId, indexSettings, threadPool, threadPoolMergeExecutorService, store); try (FollowingEngine engine = createEngine(store, engineConfig)) { AtomicBoolean running = new AtomicBoolean(true); Thread rollTranslog = new Thread(() -> { diff --git a/x-pack/plugin/searchable-snapshots/src/internalClusterTest/java/org/elasticsearch/xpack/searchablesnapshots/RetrySearchIntegTests.java b/x-pack/plugin/searchable-snapshots/src/internalClusterTest/java/org/elasticsearch/xpack/searchablesnapshots/RetrySearchIntegTests.java index c9a1a82b34118..ef7fd2c6b065d 100644 --- a/x-pack/plugin/searchable-snapshots/src/internalClusterTest/java/org/elasticsearch/xpack/searchablesnapshots/RetrySearchIntegTests.java +++ b/x-pack/plugin/searchable-snapshots/src/internalClusterTest/java/org/elasticsearch/xpack/searchablesnapshots/RetrySearchIntegTests.java @@ -57,7 +57,8 @@ public void testSearcherId() throws Exception { equalTo(0) ); refresh(indexName); - forceMerge(); + // force merge with expunge deletes is not merging down to one segment only + forceMerge(false); final String repositoryName = randomAlphaOfLength(10).toLowerCase(Locale.ROOT); 
createRepository(repositoryName, "fs"); @@ -125,7 +126,8 @@ public void testRetryPointInTime() throws Exception { equalTo(0) ); refresh(indexName); - forceMerge(); + // force merge with expunge deletes is not merging down to one segment only + forceMerge(false); final String repositoryName = randomAlphaOfLength(10).toLowerCase(Locale.ROOT); createRepository(repositoryName, "fs"); diff --git a/x-pack/plugin/snapshot-based-recoveries/src/internalClusterTest/java/org/elasticsearch/xpack/snapshotbasedrecoveries/recovery/SnapshotBasedIndexRecoveryIT.java b/x-pack/plugin/snapshot-based-recoveries/src/internalClusterTest/java/org/elasticsearch/xpack/snapshotbasedrecoveries/recovery/SnapshotBasedIndexRecoveryIT.java index 405a9926e2e5f..8e3bb1d3f27db 100644 --- a/x-pack/plugin/snapshot-based-recoveries/src/internalClusterTest/java/org/elasticsearch/xpack/snapshotbasedrecoveries/recovery/SnapshotBasedIndexRecoveryIT.java +++ b/x-pack/plugin/snapshot-based-recoveries/src/internalClusterTest/java/org/elasticsearch/xpack/snapshotbasedrecoveries/recovery/SnapshotBasedIndexRecoveryIT.java @@ -465,7 +465,7 @@ public void testPeerRecoveryTriesToUseMostOfTheDataFromAnAvailableSnapshot() thr int numDocs = randomIntBetween(300, 1000); indexDocs(indexName, 0, numDocs); - forceMerge(); + forceMerge(false); String repoName = "repo"; createRepo(repoName, TestRepositoryPlugin.INSTRUMENTED_TYPE);