Commit af01dce

Add option to the CMS to set the threshold of FAULTY pdisks per node (#16932)
If the number of FAULTY PDisks on a node — including those already FAULTY or about to be marked FAULTY — exceeds this threshold, no additional disks on the same node will be marked as FAULTY, except for those explicitly marked as FAULTY by the user via the replace devices command.
1 parent 2b67dee commit af01dce
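
For reference, the threshold is a regular field of the Sentinel section of the CMS config. A minimal sketch of setting it programmatically, mirroring how the unit tests in this commit build their config (the include path of the generated proto header is an assumption):

    #include <ydb/core/protos/cms.pb.h> // assumed path of the header generated from cms.proto

    // Allow at most 27 bad PDisks per node before the Sentinel stops auto-marking more FAULTY;
    // the default of 0 leaves the check disabled.
    NKikimrCms::TCmsConfig config;
    config.MutableSentinelConfig()->SetFaultyPDisksThresholdPerNode(27);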

6 files changed: +238 -9 lines

ydb/core/cms/config.h (+3)

@@ -32,6 +32,7 @@ struct TCmsSentinelConfig {
     ui32 DataCenterRatio;
     ui32 RoomRatio;
     ui32 RackRatio;
+    ui32 FaultyPDisksThresholdPerNode;

     TMaybeFail<EPDiskStatus> EvictVDisksStatus;

@@ -49,6 +50,7 @@ struct TCmsSentinelConfig {
         config.SetDataCenterRatio(DataCenterRatio);
         config.SetRoomRatio(RoomRatio);
         config.SetRackRatio(RackRatio);
+        config.SetFaultyPDisksThresholdPerNode(FaultyPDisksThresholdPerNode);

         SaveStateLimits(config);
         SaveEvictVDisksStatus(config);
@@ -68,6 +70,7 @@ struct TCmsSentinelConfig {
         DataCenterRatio = config.GetDataCenterRatio();
         RoomRatio = config.GetRoomRatio();
         RackRatio = config.GetRackRatio();
+        FaultyPDisksThresholdPerNode = config.GetFaultyPDisksThresholdPerNode();

         auto newStateLimits = LoadStateLimits(config);
         StateLimits.swap(newStateLimits);

ydb/core/cms/sentinel.cpp (+38, -6)

@@ -268,23 +268,28 @@ TClusterMap::TClusterMap(TSentinelState::TPtr state)
{
}

-void TClusterMap::AddPDisk(const TPDiskID& id) {
+void TClusterMap::AddPDisk(const TPDiskID& id, const bool inGoodState) {
    Y_ABORT_UNLESS(State->Nodes.contains(id.NodeId));
    const auto& location = State->Nodes[id.NodeId].Location;

    ByDataCenter[location.HasKey(TNodeLocation::TKeys::DataCenter) ? location.GetDataCenterId() : ""].insert(id);
    ByRoom[location.HasKey(TNodeLocation::TKeys::Module) ? location.GetModuleId() : ""].insert(id);
    ByRack[location.HasKey(TNodeLocation::TKeys::Rack) ? location.GetRackId() : ""].insert(id);
    NodeByRack[location.HasKey(TNodeLocation::TKeys::Rack) ? location.GetRackId() : ""].insert(id.NodeId);
+
+    if (!inGoodState) {
+        BadByNode[ToString(id.NodeId)].insert(id);
+    }
}

/// TGuardian

-TGuardian::TGuardian(TSentinelState::TPtr state, ui32 dataCenterRatio, ui32 roomRatio, ui32 rackRatio)
+TGuardian::TGuardian(TSentinelState::TPtr state, ui32 dataCenterRatio, ui32 roomRatio, ui32 rackRatio, ui32 faultyPDisksThresholdPerNode)
    : TClusterMap(state)
    , DataCenterRatio(dataCenterRatio)
    , RoomRatio(roomRatio)
    , RackRatio(rackRatio)
+    , FaultyPDisksThresholdPerNode(faultyPDisksThresholdPerNode)
{
}

@@ -347,6 +352,31 @@ TClusterMap::TPDiskIDSet TGuardian::GetAllowedPDisks(const TClusterMap& all, TSt
        }
    }

+    if (FaultyPDisksThresholdPerNode != 0) {
+        // If the number of FAULTY PDisks on a node — including those already FAULTY or about to be marked FAULTY —
+        // exceeds this threshold, no additional disks on the same node will be marked as FAULTY,
+        // except for those explicitly marked as FAULTY by the user via the replace devices command.
+        for (const auto& kv : BadByNode) {
+            if (kv.first) {
+                auto it = all.BadByNode.find(kv.first);
+                ui32 currentCount = it != all.BadByNode.end() ? it->second.size() : 0;
+
+                if (currentCount > FaultyPDisksThresholdPerNode) {
+                    for (const auto& pdisk : kv.second) {
+                        disallowed.emplace(pdisk, NKikimrCms::TPDiskInfo::TOO_MANY_FAULTY_PER_NODE);
+                        result.erase(pdisk);
+                    }
+                    auto disallowedPdisks = disallowed | std::views::keys;
+                    issuesBuilder
+                        << "Ignore state updates due to FaultyPDisksThresholdPerNode"
+                        << ": changed# " << kv.second.size()
+                        << ", total# " << currentCount
+                        << ", affected pdisks# " << JoinSeq(", ", disallowedPdisks) << Endl;
+                }
+            }
+        }
+    }
+
#undef LOG_IGNORED

    issues = issuesBuilder;
@@ -941,7 +971,7 @@ class TSentinel: public TActorBootstrapped<TSentinel> {
        }

        TClusterMap all(SentinelState);
-        TGuardian changed(SentinelState, Config.DataCenterRatio, Config.RoomRatio, Config.RackRatio);
+        TGuardian changed(SentinelState, Config.DataCenterRatio, Config.RoomRatio, Config.RackRatio, Config.FaultyPDisksThresholdPerNode);
        TClusterMap::TPDiskIDSet alwaysAllowed;

        for (auto& pdisk : SentinelState->PDisks) {
@@ -956,18 +986,20 @@ class TSentinel: public TActorBootstrapped<TSentinel> {
                continue;
            }

+            bool hasGoodState = NKikimrBlobStorage::TPDiskState::Normal == info.GetState();
+
            if (it->second.HasFaultyMarker() && Config.EvictVDisksStatus.Defined()) {
+                hasGoodState = false;
                info.SetForcedStatus(*Config.EvictVDisksStatus);
            } else {
                info.ResetForcedStatus();
            }
-
-            all.AddPDisk(id);
+            all.AddPDisk(id, hasGoodState);
            if (info.IsChanged()) {
                if (info.IsNewStatusGood() || info.HasForcedStatus()) {
                    alwaysAllowed.insert(id);
                } else {
-                    changed.AddPDisk(id);
+                    changed.AddPDisk(id, hasGoodState);
                }
            } else {
                info.AllowChanging();
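
To make the new guard's arithmetic concrete, here is a small standalone sketch (illustrative names, not the Sentinel code itself) of the decision that the block added to TGuardian::GetAllowedPDisks implements, using the same numbers as the GuardianFaultyPDisks test further down (28-disk shelves, threshold of disksPerShelf - 1):

    #include <cassert>
    #include <cstdint>

    // May the Sentinel still auto-mark disks FAULTY on a node, given the total number of
    // bad disks there (already FAULTY plus those about to be marked)?
    bool AllowAutoFaulty(uint32_t badOnNode, uint32_t threshold) {
        if (threshold == 0) {
            return true; // threshold 0 disables the per-node check
        }
        return badOnNode <= threshold;
    }

    int main() {
        const uint32_t disksPerShelf = 28;
        const uint32_t threshold = disksPerShelf - 1;           // 27, as in the unit test
        assert(AllowAutoFaulty(disksPerShelf / 2, threshold));  // half a shelf lost: still marked FAULTY
        assert(!AllowAutoFaulty(disksPerShelf, threshold));     // whole shelf lost: marking is suppressed
        assert(AllowAutoFaulty(disksPerShelf, 0));              // option unset: guard is off
    }

Disks explicitly replaced by the user bypass this check, as described in the commit message.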

ydb/core/cms/sentinel_impl.h (+4, -2)

@@ -154,10 +154,11 @@ class TClusterMap {
    TDistribution ByRoom;
    TDistribution ByRack;
    THashMap<TString, TNodeIDSet> NodeByRack;
+    TDistribution BadByNode;

    explicit TClusterMap(TSentinelState::TPtr state);

-    void AddPDisk(const TPDiskID& id);
+    void AddPDisk(const TPDiskID& id, const bool inGoodState = true);

}; // TClusterMap

@@ -171,14 +172,15 @@ class TGuardian : public TClusterMap {
    }

public:
-    explicit TGuardian(TSentinelState::TPtr state, ui32 dataCenterRatio = 100, ui32 roomRatio = 100, ui32 rackRatio = 100);
+    explicit TGuardian(TSentinelState::TPtr state, ui32 dataCenterRatio = 100, ui32 roomRatio = 100, ui32 rackRatio = 100, ui32 faultyPDisksThresholdPerNode = 0);

    TPDiskIDSet GetAllowedPDisks(const TClusterMap& all, TString& issues, TPDiskIgnoredMap& disallowed) const;

private:
    const ui32 DataCenterRatio;
    const ui32 RoomRatio;
    const ui32 RackRatio;
+    const ui32 FaultyPDisksThresholdPerNode;

}; // TGuardian

ydb/core/cms/sentinel_ut.cpp (+162)

@@ -375,6 +375,67 @@ Y_UNIT_TEST_SUITE(TSentinelBaseTests) {
        GuardianDataCenterRatio(1, {3, 4, 5}, true);
    }

+    void GuardianBadPDisksByNode(ui32 shelvesPerNode, ui32 disksPerShelf, ui32 badDisks) {
+        ui32 disksPerNode = shelvesPerNode * disksPerShelf;
+        ui32 maxFaultyDisksPerNode = disksPerShelf - 1;
+
+        auto [state, sentinelState] = MockCmsState(1, 8, 1, disksPerNode, true, false);
+        TClusterMap all(sentinelState);
+
+        TGuardian changed(sentinelState, 100, 100, 100, maxFaultyDisksPerNode);
+
+        const auto& nodes = state->ClusterInfo->AllNodes();
+
+        for (const auto& node : nodes) {
+            const ui64 nodeId = node.second->NodeId;
+
+            for (ui32 i = 0; i < disksPerNode; i++) {
+                const TPDiskID id(nodeId, i);
+
+                if (i < badDisks) {
+                    all.AddPDisk(id, false);
+                    changed.AddPDisk(id, false);
+                } else {
+                    all.AddPDisk(id);
+                }
+            }
+        }
+
+        TString issues;
+        TClusterMap::TPDiskIgnoredMap disallowed;
+
+        auto allowed = changed.GetAllowedPDisks(all, issues, disallowed);
+
+        THashMap<ui64, ui32> allowedDisksByNode;
+        THashMap<ui64, ui32> disallowedDisksByNode;
+
+        for (const auto& id : allowed) {
+            allowedDisksByNode[id.NodeId]++;
+        }
+
+        for (const auto& kv : disallowed) {
+            UNIT_ASSERT(kv.second == NKikimrCms::TPDiskInfo::TOO_MANY_FAULTY_PER_NODE);
+            disallowedDisksByNode[kv.first.NodeId]++;
+        }
+
+        for (const auto& node : nodes) {
+            const ui64 nodeId = node.second->NodeId;
+            if (badDisks <= maxFaultyDisksPerNode) {
+                UNIT_ASSERT_VALUES_EQUAL(allowedDisksByNode[nodeId], badDisks);
+                UNIT_ASSERT_VALUES_EQUAL(disallowedDisksByNode[nodeId], 0);
+            } else {
+                UNIT_ASSERT_VALUES_EQUAL(allowedDisksByNode[nodeId], 0);
+                UNIT_ASSERT_VALUES_EQUAL(disallowedDisksByNode[nodeId], badDisks);
+            }
+        }
+    }
+
+    Y_UNIT_TEST(GuardianFaultyPDisks) {
+        for (ui32 i = 0; i < 56; i++) {
+            GuardianBadPDisksByNode(2, 28, i);
+        }
+    }
+
    void GuardianRackRatio(ui16 numRacks, const TVector<ui16>& nodesPerRackVariants, ui16 numPDisks, bool anyRack) {
        for (ui16 nodesPerRack : nodesPerRackVariants) {
            auto [state, sentinelState] = MockCmsState(1, numRacks, nodesPerRack, numPDisks, false, anyRack);
@@ -532,6 +593,107 @@ Y_UNIT_TEST_SUITE(TSentinelTests) {
        }
    }

+    Y_UNIT_TEST(PDiskFaultyGuard) {
+        ui32 nodes = 2;
+        ui32 disksPerShelf = 5;
+        ui32 disksPerNode = 2 * disksPerShelf;
+
+        for (auto wholeShelfFailure : {true, false}) {
+            NKikimrCms::TCmsConfig config;
+
+            config.MutableSentinelConfig()->SetFaultyPDisksThresholdPerNode(disksPerShelf - 1);
+            TTestEnv env(nodes, disksPerNode, config);
+            env.SetLogPriority(NKikimrServices::CMS, NLog::PRI_ERROR);
+
+            for (ui32 nodeIdx = 0; nodeIdx < nodes; ++nodeIdx) {
+                ui32 badDisks = wholeShelfFailure ? disksPerShelf : disksPerShelf / 2;
+
+                for (ui32 pdiskIdx = 0; pdiskIdx < badDisks - 1; ++pdiskIdx) {
+                    const TPDiskID id = env.PDiskId(nodeIdx, pdiskIdx);
+
+                    env.SetPDiskState({id}, FaultyStates[0]);
+                }
+
+                // Next disk (last badDisk)
+                const TPDiskID id = env.PDiskId(nodeIdx, badDisks);
+
+                bool targetSeenFaulty = false;
+
+                auto observerHolder = env.AddObserver<TEvBlobStorage::TEvControllerConfigRequest>([&](TEvBlobStorage::TEvControllerConfigRequest::TPtr& event) {
+                    const auto& request = event->Get()->Record;
+                    for (const auto& command : request.GetRequest().GetCommand()) {
+                        if (command.HasUpdateDriveStatus()) {
+                            const auto& update = command.GetUpdateDriveStatus();
+                            ui32 nodeId = update.GetHostKey().GetNodeId();
+                            ui32 pdiskId = update.GetPDiskId();
+
+                            if (id.NodeId == nodeId && id.DiskId == pdiskId) {
+                                if (update.GetStatus() == NKikimrBlobStorage::EDriveStatus::FAULTY) {
+                                    targetSeenFaulty = true;
+                                }
+                            }
+                        }
+                    }
+                });
+
+                for (ui32 i = 1; i < DefaultErrorStateLimit + 1; ++i) { // More than DefaultErrorStateLimit just to be sure
+                    env.SetPDiskState({id}, FaultyStates[0]);
+                }
+
+                env.SimulateSleep(TDuration::Minutes(5));
+
+                observerHolder.Remove();
+
+                if (wholeShelfFailure) {
+                    UNIT_ASSERT_C(!targetSeenFaulty, "Faulty state should not have been sent to BS controller because whole shelf failed");
+                } else {
+                    UNIT_ASSERT_C(targetSeenFaulty, "Faulty state should have been sent to BS controller");
+                }
+            }
+        }
+    }
+
+    Y_UNIT_TEST(PDiskFaultyGuardWithForced) {
+        ui32 nodes = 2;
+        ui32 disksPerShelf = 5;
+        ui32 disksPerNode = 2 * disksPerShelf;
+
+        NKikimrCms::TCmsConfig config;
+
+        config.MutableSentinelConfig()->SetFaultyPDisksThresholdPerNode(disksPerShelf - 1);
+        TTestEnv env(nodes, disksPerNode, config);
+        env.SetLogPriority(NKikimrServices::CMS, NLog::PRI_ERROR);
+
+        std::map<ui32, std::set<ui32>> faultyDisks;
+
+        auto observerHolder = env.AddObserver<TEvBlobStorage::TEvControllerConfigRequest>([&](TEvBlobStorage::TEvControllerConfigRequest::TPtr& event) {
+            const auto& request = event->Get()->Record;
+            for (const auto& command : request.GetRequest().GetCommand()) {
+                if (command.HasUpdateDriveStatus()) {
+                    const auto& update = command.GetUpdateDriveStatus();
+                    ui32 nodeId = update.GetHostKey().GetNodeId();
+                    ui32 pdiskId = update.GetPDiskId();
+
+                    faultyDisks[nodeId].insert(pdiskId);
+                }
+            }
+        });
+
+        for (ui32 nodeIdx = 0; nodeIdx < nodes; ++nodeIdx) {
+            env.SetNodeFaulty(env.GetNodeId(nodeIdx), true);

+            env.SimulateSleep(TDuration::Minutes(5));
+        }
+
+        observerHolder.Remove();
+
+        for (ui32 nodeIdx = 0; nodeIdx < nodes; ++nodeIdx) {
+            ui32 nodeId = env.GetNodeId(nodeIdx);
+
+            UNIT_ASSERT_VALUES_EQUAL(faultyDisks[nodeId].size(), disksPerNode);
+        }
+    }
+
    Y_UNIT_TEST(BSControllerUnresponsive) {
        TTestEnv env(8, 4);
        env.EnableNoisyBSCPipe();

ydb/core/cms/sentinel_ut_helpers.h (+22, -1)

@@ -83,7 +83,7 @@ class TTestEnv: public TCmsTestEnv {
    }

public:
-    explicit TTestEnv(ui32 nodeCount, ui32 pdisks)
+    explicit TTestEnv(ui32 nodeCount, ui32 pdisks, const NKikimrCms::TCmsConfig &config = {})
        : TCmsTestEnv(nodeCount, pdisks)
    {
        SetLogPriority(NKikimrServices::CMS, NLog::PRI_DEBUG);
@@ -123,6 +123,7 @@
        });

        State = new TCmsState;
+        State->Config.Deserialize(config);
        MockClusterInfo(State->ClusterInfo);
        State->CmsActorId = GetSender();

@@ -167,6 +168,26 @@
        return info->PDisks;
    }

+    void SetNodeFaulty(ui32 nodeId, bool faulty) {
+        if (faulty) {
+            auto v = TVector<NCms::TEvSentinel::TEvUpdateHostMarkers::THostMarkers>();
+            v.push_back({
+                .NodeId = nodeId,
+                .Markers = {NKikimrCms::EMarker::MARKER_DISK_FAULTY},
+            });
+
+            Send(new IEventHandle(Sentinel, TActorId(), new TEvSentinel::TEvUpdateHostMarkers(std::move(v))));
+        } else {
+            auto v = TVector<NCms::TEvSentinel::TEvUpdateHostMarkers::THostMarkers>();
+            v.push_back({
+                .NodeId = nodeId,
+                .Markers = {},
+            });
+
+            Send(new IEventHandle(Sentinel, TActorId(), new TEvSentinel::TEvUpdateHostMarkers(std::move(v))));
+        }
+    }
+
    void SetPDiskState(const TSet<TPDiskID>& pdisks, EPDiskState state) {
        SetPDiskStateImpl(pdisks, state);

ydb/core/protos/cms.proto (+9)

@@ -452,6 +452,14 @@ message TCmsConfig {
        optional uint32 DataCenterRatio = 10 [default = 50];
        optional uint32 RoomRatio = 11 [default = 70];
        optional uint32 RackRatio = 12 [default = 90];
+       // Similar to *Ratio settings, but specified in absolute numbers and applied per storage node.
+       // This limit helps prevent cascading overreaction when many disks go offline on a single host
+       // (due to disk shelf disconnection).
+       // If the number of FAULTY PDisks on a node — including those already FAULTY or about to be marked FAULTY —
+       // exceeds this threshold, no additional disks on the same node will be marked as FAULTY,
+       // except for those explicitly marked as FAULTY by the user via the replace devices command.
+       // If set to 0, this check is disabled.
+       optional uint32 FaultyPDisksThresholdPerNode = 17 [default = 0];

        optional bool DryRun = 13;
        repeated TStateLimit StateLimits = 14;
@@ -646,6 +654,7 @@ message TPDiskInfo {
        RATIO_BY_DATACENTER = 3;
        RATIO_BY_ROOM = 4;
        RATIO_BY_RACK = 5;
+       TOO_MANY_FAULTY_PER_NODE = 6;
    }

    optional uint32 State = 1; // EPDiskState
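
As the comment above notes, the field defaults to 0, which keeps the guard off. A hedged sketch of what that means on the reading side (standard protobuf getters generated for TCmsConfig; the include path is an assumption):

    #include <ydb/core/protos/cms.pb.h> // assumed generated header for cms.proto

    NKikimrCms::TCmsConfig config; // FaultyPDisksThresholdPerNode left unset
    const auto threshold = config.GetSentinelConfig().GetFaultyPDisksThresholdPerNode();
    // threshold == 0 here, so TGuardian::GetAllowedPDisks skips the per-node BadByNode check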
