Skip to content

Commit 3f5fb9f

Browse files
BewareMyPowerheesung-sn
authored andcommitted
[fix][broker] Fix unloadNamespaceBundlesGracefully can be stuck with extensible load manager (apache#23349)
(cherry picked from commit e91574a)
1 parent 10ca808 commit 3f5fb9f

16 files changed

+403
-125
lines changed

pulsar-broker/src/main/java/org/apache/pulsar/broker/PulsarService.java

+3-4
Original file line numberDiff line numberDiff line change
@@ -460,6 +460,9 @@ public CompletableFuture<Void> closeAsync() {
460460
return closeFuture;
461461
}
462462
LOG.info("Closing PulsarService");
463+
if (topicPoliciesService != null) {
464+
topicPoliciesService.close();
465+
}
463466
if (brokerService != null) {
464467
brokerService.unloadNamespaceBundlesGracefully();
465468
}
@@ -575,10 +578,6 @@ public CompletableFuture<Void> closeAsync() {
575578
transactionBufferClient.close();
576579
}
577580

578-
if (topicPoliciesService != null) {
579-
topicPoliciesService.close();
580-
topicPoliciesService = null;
581-
}
582581

583582
if (client != null) {
584583
client.close();

pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/extensions/ExtensibleLoadManagerImpl.java

+96-36
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,14 @@ public class ExtensibleLoadManagerImpl implements ExtensibleLoadManager, BrokerS
182182

183183
private SplitManager splitManager;
184184

185-
private volatile boolean started = false;
185+
enum State {
186+
INIT,
187+
RUNNING,
188+
// It's removing visibility of the current broker from other brokers. In this state, it cannot play as a leader
189+
// or follower.
190+
DISABLED,
191+
}
192+
private final AtomicReference<State> state = new AtomicReference<>(State.INIT);
186193

187194
private boolean configuredSystemTopics = false;
188195

@@ -210,7 +217,7 @@ public class ExtensibleLoadManagerImpl implements ExtensibleLoadManager, BrokerS
210217
* Get all the bundles that are owned by this broker.
211218
*/
212219
public CompletableFuture<Set<NamespaceBundle>> getOwnedServiceUnitsAsync() {
213-
if (!started) {
220+
if (state.get() == State.INIT) {
214221
log.warn("Failed to get owned service units, load manager is not started.");
215222
return CompletableFuture.completedFuture(Collections.emptySet());
216223
}
@@ -373,7 +380,7 @@ public static CompletableFuture<Optional<BrokerLookupData>> getAssignedBrokerLoo
373380

374381
@Override
375382
public void start() throws PulsarServerException {
376-
if (this.started) {
383+
if (state.get() != State.INIT) {
377384
return;
378385
}
379386
try {
@@ -471,7 +478,9 @@ public void start() throws PulsarServerException {
471478

472479
this.splitScheduler.start();
473480
this.initWaiter.complete(true);
474-
this.started = true;
481+
if (!state.compareAndSet(State.INIT, State.RUNNING)) {
482+
failForUnexpectedState("start");
483+
}
475484
log.info("Started load manager.");
476485
} catch (Throwable e) {
477486
failStarting(e);
@@ -643,21 +652,17 @@ public CompletableFuture<Optional<String>> selectAsync(ServiceUnitId bundle,
643652
filter.filterAsync(availableBrokerCandidates, bundle, context);
644653
futures.add(future);
645654
}
646-
CompletableFuture<Optional<String>> result = new CompletableFuture<>();
647-
FutureUtil.waitForAll(futures).whenComplete((__, ex) -> {
648-
if (ex != null) {
649-
// TODO: We may need to revisit this error case.
650-
log.error("Failed to filter out brokers when select bundle: {}", bundle, ex);
651-
}
655+
return FutureUtil.waitForAll(futures).exceptionally(e -> {
656+
// TODO: We may need to revisit this error case.
657+
log.error("Failed to filter out brokers when select bundle: {}", bundle, e);
658+
return null;
659+
}).thenApply(__ -> {
652660
if (availableBrokerCandidates.isEmpty()) {
653-
result.complete(Optional.empty());
654-
return;
661+
return Optional.empty();
655662
}
656663
Set<String> candidateBrokers = availableBrokerCandidates.keySet();
657-
658-
result.complete(getBrokerSelectionStrategy().select(candidateBrokers, bundle, context));
664+
return getBrokerSelectionStrategy().select(candidateBrokers, bundle, context);
659665
});
660-
return result;
661666
});
662667
}
663668

@@ -695,6 +700,9 @@ public CompletableFuture<Void> unloadNamespaceBundleAsync(ServiceUnitId bundle,
695700
boolean force,
696701
long timeout,
697702
TimeUnit timeoutUnit) {
703+
if (state.get() == State.INIT) {
704+
return CompletableFuture.completedFuture(null);
705+
}
698706
if (NamespaceService.isSLAOrHeartbeatNamespace(bundle.getNamespaceObject().toString())) {
699707
log.info("Skip unloading namespace bundle: {}.", bundle);
700708
return CompletableFuture.completedFuture(null);
@@ -783,28 +791,13 @@ private CompletableFuture<Void> splitAsync(SplitDecision decision,
783791

784792
@Override
785793
public void close() throws PulsarServerException {
786-
if (!this.started) {
794+
if (state.get() == State.INIT) {
787795
return;
788796
}
789797
try {
790-
if (brokerLoadDataReportTask != null) {
791-
brokerLoadDataReportTask.cancel(true);
792-
}
793-
794-
if (topBundlesLoadDataReportTask != null) {
795-
topBundlesLoadDataReportTask.cancel(true);
796-
}
797-
798-
if (monitorTask != null) {
799-
monitorTask.cancel(true);
800-
}
801-
802-
this.brokerLoadDataStore.close();
803-
this.topBundlesLoadDataStore.close();
798+
stopLoadDataReportTasks();
804799
this.unloadScheduler.close();
805800
this.splitScheduler.close();
806-
} catch (IOException ex) {
807-
throw new PulsarServerException(ex);
808801
} finally {
809802
try {
810803
this.brokerRegistry.close();
@@ -818,14 +811,36 @@ public void close() throws PulsarServerException {
818811
} catch (Exception e) {
819812
throw new PulsarServerException(e);
820813
} finally {
821-
this.started = false;
814+
state.set(State.INIT);
822815
}
823816
}
824817

825818
}
826819
}
827820
}
828821

822+
private void stopLoadDataReportTasks() {
823+
if (brokerLoadDataReportTask != null) {
824+
brokerLoadDataReportTask.cancel(true);
825+
}
826+
if (topBundlesLoadDataReportTask != null) {
827+
topBundlesLoadDataReportTask.cancel(true);
828+
}
829+
if (monitorTask != null) {
830+
monitorTask.cancel(true);
831+
}
832+
try {
833+
brokerLoadDataStore.shutdown();
834+
} catch (IOException e) {
835+
log.warn("Failed to shutdown brokerLoadDataStore", e);
836+
}
837+
try {
838+
topBundlesLoadDataStore.shutdown();
839+
} catch (IOException e) {
840+
log.warn("Failed to shutdown topBundlesLoadDataStore", e);
841+
}
842+
}
843+
829844
public static boolean isInternalTopic(String topic) {
830845
return INTERNAL_TOPICS.contains(topic)
831846
|| topic.startsWith(TOPIC)
@@ -841,13 +856,16 @@ synchronized void playLeader() {
841856
boolean becameFollower = false;
842857
while (!Thread.currentThread().isInterrupted()) {
843858
try {
844-
if (!initWaiter.get()) {
859+
if (!initWaiter.get() || disabled()) {
845860
return;
846861
}
847862
if (!serviceUnitStateChannel.isChannelOwner()) {
848863
becameFollower = true;
849864
break;
850865
}
866+
if (disabled()) {
867+
return;
868+
}
851869
// Confirm the system topics have been created or create them if they do not exist.
852870
// If the leader has changed, the new leader need to reset
853871
// the local brokerService.topics (by this topic creations).
@@ -859,6 +877,11 @@ synchronized void playLeader() {
859877
serviceUnitStateChannel.scheduleOwnershipMonitor();
860878
break;
861879
} catch (Throwable e) {
880+
if (disabled()) {
881+
log.warn("The broker:{} failed to set the role but exit because it's disabled",
882+
pulsar.getBrokerId(), e);
883+
return;
884+
}
862885
log.warn("The broker:{} failed to set the role. Retrying {} th ...",
863886
pulsar.getBrokerId(), ++retry, e);
864887
try {
@@ -870,6 +893,9 @@ synchronized void playLeader() {
870893
}
871894
}
872895
}
896+
if (disabled()) {
897+
return;
898+
}
873899

874900
if (becameFollower) {
875901
log.warn("The broker:{} became follower while initializing leader role.", pulsar.getBrokerId());
@@ -893,13 +919,16 @@ synchronized void playFollower() {
893919
boolean becameLeader = false;
894920
while (!Thread.currentThread().isInterrupted()) {
895921
try {
896-
if (!initWaiter.get()) {
922+
if (!initWaiter.get() || disabled()) {
897923
return;
898924
}
899925
if (serviceUnitStateChannel.isChannelOwner()) {
900926
becameLeader = true;
901927
break;
902928
}
929+
if (disabled()) {
930+
return;
931+
}
903932
unloadScheduler.close();
904933
serviceUnitStateChannel.cancelOwnershipMonitor();
905934
closeInternalTopics();
@@ -908,6 +937,11 @@ synchronized void playFollower() {
908937
topBundlesLoadDataStore.startProducer();
909938
break;
910939
} catch (Throwable e) {
940+
if (disabled()) {
941+
log.warn("The broker:{} failed to set the role but exit because it's disabled",
942+
pulsar.getBrokerId(), e);
943+
return;
944+
}
911945
log.warn("The broker:{} failed to set the role. Retrying {} th ...",
912946
pulsar.getBrokerId(), ++retry, e);
913947
try {
@@ -919,6 +953,9 @@ synchronized void playFollower() {
919953
}
920954
}
921955
}
956+
if (disabled()) {
957+
return;
958+
}
922959

923960
if (becameLeader) {
924961
log.warn("This broker:{} became leader while initializing follower role.", pulsar.getBrokerId());
@@ -997,9 +1034,20 @@ protected void monitor() {
9971034
}
9981035

9991036
public void disableBroker() throws Exception {
1037+
// TopicDoesNotExistException might be thrown and it's not recoverable. Enable this flag to exit playFollower()
1038+
// or playLeader() quickly.
1039+
if (!state.compareAndSet(State.RUNNING, State.DISABLED)) {
1040+
failForUnexpectedState("disableBroker");
1041+
}
1042+
stopLoadDataReportTasks();
10001043
serviceUnitStateChannel.cleanOwnerships();
1001-
leaderElectionService.close();
10021044
brokerRegistry.unregister();
1045+
leaderElectionService.close();
1046+
final var availableBrokers = brokerRegistry.getAvailableBrokersAsync()
1047+
.get(conf.getMetadataStoreOperationTimeoutSeconds(), TimeUnit.SECONDS);
1048+
if (availableBrokers.isEmpty()) {
1049+
close();
1050+
}
10031051
// Close the internal topics (if owned any) after giving up the possible leader role,
10041052
// so that the subsequent lookups could hit the next leader.
10051053
closeInternalTopics();
@@ -1033,4 +1081,16 @@ protected BrokerRegistry createBrokerRegistry(PulsarService pulsar) {
10331081
protected ServiceUnitStateChannel createServiceUnitStateChannel(PulsarService pulsar) {
10341082
return new ServiceUnitStateChannelImpl(pulsar);
10351083
}
1084+
1085+
private void failForUnexpectedState(String msg) {
1086+
throw new IllegalStateException("Failed to " + msg + ", state: " + state.get());
1087+
}
1088+
1089+
boolean running() {
1090+
return state.get() == State.RUNNING;
1091+
}
1092+
1093+
private boolean disabled() {
1094+
return state.get() == State.DISABLED;
1095+
}
10361096
}

pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/extensions/ExtensibleLoadManagerWrapper.java

+4
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@ public void start() throws PulsarServerException {
5050
loadManager.start();
5151
}
5252

53+
public boolean started() {
54+
return loadManager.running() && loadManager.getServiceUnitStateChannel().started();
55+
}
56+
5357
@Override
5458
public void initialize(PulsarService pulsar) {
5559
loadManager.initialize(pulsar);

pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/extensions/channel/ServiceUnitStateChannel.java

+5
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,11 @@ public interface ServiceUnitStateChannel extends Closeable {
4343
*/
4444
void start() throws PulsarServerException;
4545

46+
/**
47+
* Whether the channel started.
48+
*/
49+
boolean started();
50+
4651
/**
4752
* Closes the ServiceUnitStateChannel.
4853
* @throws PulsarServerException if it fails to close the channel.

0 commit comments

Comments
 (0)