Commit af01dce

Add option to the CMS to set the threshold of FAULTY pdisks per node (#16932)
If the number of FAULTY PDisks on a node — including those already FAULTY or about to be marked FAULTY — exceeds this threshold, no additional disks on the same node will be marked as FAULTY, except for those explicitly marked as FAULTY by the user via the replace devices command.
1 parent 2b67dee commit af01dce
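
For reference, the threshold is a regular field of the Sentinel section of the CMS config. A minimal sketch of setting it programmatically, mirroring how the unit tests in this commit build their config (the include path of the generated proto header is an assumption):

    #include <ydb/core/protos/cms.pb.h> // assumed path of the header generated from cms.proto

    // Allow at most 27 bad PDisks per node before the Sentinel stops auto-marking more FAULTY;
    // the default of 0 leaves the check disabled.
    NKikimrCms::TCmsConfig config;
    config.MutableSentinelConfig()->SetFaultyPDisksThresholdPerNode(27);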

6 files changed: +238 -9 lines

ydb/core/cms/config.h (+3)

@@ -32,6 +32,7 @@ struct TCmsSentinelConfig {
     ui32 DataCenterRatio;
     ui32 RoomRatio;
     ui32 RackRatio;
+    ui32 FaultyPDisksThresholdPerNode;

     TMaybeFail<EPDiskStatus> EvictVDisksStatus;

@@ -49,6 +50,7 @@ struct TCmsSentinelConfig {
         config.SetDataCenterRatio(DataCenterRatio);
         config.SetRoomRatio(RoomRatio);
         config.SetRackRatio(RackRatio);
+        config.SetFaultyPDisksThresholdPerNode(FaultyPDisksThresholdPerNode);

         SaveStateLimits(config);
         SaveEvictVDisksStatus(config);
@@ -68,6 +70,7 @@ struct TCmsSentinelConfig {
         DataCenterRatio = config.GetDataCenterRatio();
         RoomRatio = config.GetRoomRatio();
         RackRatio = config.GetRackRatio();
+        FaultyPDisksThresholdPerNode = config.GetFaultyPDisksThresholdPerNode();

         auto newStateLimits = LoadStateLimits(config);
         StateLimits.swap(newStateLimits);

ydb/core/cms/sentinel.cpp (+38, -6)

@@ -268,23 +268,28 @@ TClusterMap::TClusterMap(TSentinelState::TPtr state)
{
}

-void TClusterMap::AddPDisk(const TPDiskID& id) {
+void TClusterMap::AddPDisk(const TPDiskID& id, const bool inGoodState) {
    Y_ABORT_UNLESS(State->Nodes.contains(id.NodeId));
    const auto& location = State->Nodes[id.NodeId].Location;

    ByDataCenter[location.HasKey(TNodeLocation::TKeys::DataCenter) ? location.GetDataCenterId() : ""].insert(id);
    ByRoom[location.HasKey(TNodeLocation::TKeys::Module) ? location.GetModuleId() : ""].insert(id);
    ByRack[location.HasKey(TNodeLocation::TKeys::Rack) ? location.GetRackId() : ""].insert(id);
    NodeByRack[location.HasKey(TNodeLocation::TKeys::Rack) ? location.GetRackId() : ""].insert(id.NodeId);
+
+    if (!inGoodState) {
+        BadByNode[ToString(id.NodeId)].insert(id);
+    }
}

/// TGuardian

-TGuardian::TGuardian(TSentinelState::TPtr state, ui32 dataCenterRatio, ui32 roomRatio, ui32 rackRatio)
+TGuardian::TGuardian(TSentinelState::TPtr state, ui32 dataCenterRatio, ui32 roomRatio, ui32 rackRatio, ui32 faultyPDisksThresholdPerNode)
    : TClusterMap(state)
    , DataCenterRatio(dataCenterRatio)
    , RoomRatio(roomRatio)
    , RackRatio(rackRatio)
+    , FaultyPDisksThresholdPerNode(faultyPDisksThresholdPerNode)
{
}

@@ -347,6 +352,31 @@ TClusterMap::TPDiskIDSet TGuardian::GetAllowedPDisks(const TClusterMap& all, TSt
        }
    }

+    if (FaultyPDisksThresholdPerNode != 0) {
+        // If the number of FAULTY PDisks on a node — including those already FAULTY or about to be marked FAULTY —
+        // exceeds this threshold, no additional disks on the same node will be marked as FAULTY,
+        // except for those explicitly marked as FAULTY by the user via the replace devices command.
+        for (const auto& kv : BadByNode) {
+            if (kv.first) {
+                auto it = all.BadByNode.find(kv.first);
+                ui32 currentCount = it != all.BadByNode.end() ? it->second.size() : 0;
+
+                if (currentCount > FaultyPDisksThresholdPerNode) {
+                    for (const auto& pdisk : kv.second) {
+                        disallowed.emplace(pdisk, NKikimrCms::TPDiskInfo::TOO_MANY_FAULTY_PER_NODE);
+                        result.erase(pdisk);
+                    }
+                    auto disallowedPdisks = disallowed | std::views::keys;
+                    issuesBuilder
+                        << "Ignore state updates due to FaultyPDisksThresholdPerNode"
+                        << ": changed# " << kv.second.size()
+                        << ", total# " << currentCount
+                        << ", affected pdisks# " << JoinSeq(", ", disallowedPdisks) << Endl;
+                }
+            }
+        }
+    }
+
#undef LOG_IGNORED

    issues = issuesBuilder;
@@ -941,7 +971,7 @@ class TSentinel: public TActorBootstrapped<TSentinel> {
        }

        TClusterMap all(SentinelState);
-        TGuardian changed(SentinelState, Config.DataCenterRatio, Config.RoomRatio, Config.RackRatio);
+        TGuardian changed(SentinelState, Config.DataCenterRatio, Config.RoomRatio, Config.RackRatio, Config.FaultyPDisksThresholdPerNode);
        TClusterMap::TPDiskIDSet alwaysAllowed;

        for (auto& pdisk : SentinelState->PDisks) {
@@ -956,18 +986,20 @@ class TSentinel: public TActorBootstrapped<TSentinel> {
                continue;
            }

+            bool hasGoodState = NKikimrBlobStorage::TPDiskState::Normal == info.GetState();
+
            if (it->second.HasFaultyMarker() && Config.EvictVDisksStatus.Defined()) {
+                hasGoodState = false;
                info.SetForcedStatus(*Config.EvictVDisksStatus);
            } else {
                info.ResetForcedStatus();
            }
-
-            all.AddPDisk(id);
+            all.AddPDisk(id, hasGoodState);
            if (info.IsChanged()) {
                if (info.IsNewStatusGood() || info.HasForcedStatus()) {
                    alwaysAllowed.insert(id);
                } else {
-                    changed.AddPDisk(id);
+                    changed.AddPDisk(id, hasGoodState);
                }
            } else {
                info.AllowChanging();
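
To make the new guard's arithmetic concrete, here is a small standalone sketch (illustrative names, not the Sentinel code itself) of the decision that the block added to TGuardian::GetAllowedPDisks implements, using the same numbers as the GuardianFaultyPDisks test further down (28-disk shelves, threshold of disksPerShelf - 1):

    #include <cassert>
    #include <cstdint>

    // May the Sentinel still auto-mark disks FAULTY on a node, given the total number of
    // bad disks there (already FAULTY plus those about to be marked)?
    bool AllowAutoFaulty(uint32_t badOnNode, uint32_t threshold) {
        if (threshold == 0) {
            return true; // threshold 0 disables the per-node check
        }
        return badOnNode <= threshold;
    }

    int main() {
        const uint32_t disksPerShelf = 28;
        const uint32_t threshold = disksPerShelf - 1;           // 27, as in the unit test
        assert(AllowAutoFaulty(disksPerShelf / 2, threshold));  // half a shelf lost: still marked FAULTY
        assert(!AllowAutoFaulty(disksPerShelf, threshold));     // whole shelf lost: marking is suppressed
        assert(AllowAutoFaulty(disksPerShelf, 0));              // option unset: guard is off
    }

Disks explicitly replaced by the user bypass this check, as described in the commit message.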

ydb/core/cms/sentinel_impl.h (+4, -2)

@@ -154,10 +154,11 @@ class TClusterMap {
    TDistribution ByRoom;
    TDistribution ByRack;
    THashMap<TString, TNodeIDSet> NodeByRack;
+    TDistribution BadByNode;

    explicit TClusterMap(TSentinelState::TPtr state);

-    void AddPDisk(const TPDiskID& id);
+    void AddPDisk(const TPDiskID& id, const bool inGoodState = true);

}; // TClusterMap

@@ -171,14 +172,15 @@ class TGuardian : public TClusterMap {
    }

public:
-    explicit TGuardian(TSentinelState::TPtr state, ui32 dataCenterRatio = 100, ui32 roomRatio = 100, ui32 rackRatio = 100);
+    explicit TGuardian(TSentinelState::TPtr state, ui32 dataCenterRatio = 100, ui32 roomRatio = 100, ui32 rackRatio = 100, ui32 faultyPDisksThresholdPerNode = 0);

    TPDiskIDSet GetAllowedPDisks(const TClusterMap& all, TString& issues, TPDiskIgnoredMap& disallowed) const;

private:
    const ui32 DataCenterRatio;
    const ui32 RoomRatio;
    const ui32 RackRatio;
+    const ui32 FaultyPDisksThresholdPerNode;

}; // TGuardian

ydb/core/cms/sentinel_ut.cpp (+162)

@@ -375,6 +375,67 @@ Y_UNIT_TEST_SUITE(TSentinelBaseTests) {
        GuardianDataCenterRatio(1, {3, 4, 5}, true);
    }

+    void GuardianBadPDisksByNode(ui32 shelvesPerNode, ui32 disksPerShelf, ui32 badDisks) {
+        ui32 disksPerNode = shelvesPerNode * disksPerShelf;
+        ui32 maxFaultyDisksPerNode = disksPerShelf - 1;
+
+        auto [state, sentinelState] = MockCmsState(1, 8, 1, disksPerNode, true, false);
+        TClusterMap all(sentinelState);
+
+        TGuardian changed(sentinelState, 100, 100, 100, maxFaultyDisksPerNode);
+
+        const auto& nodes = state->ClusterInfo->AllNodes();
+
+        for (const auto& node : nodes) {
+            const ui64 nodeId = node.second->NodeId;
+
+            for (ui32 i = 0; i < disksPerNode; i++) {
+                const TPDiskID id(nodeId, i);
+
+                if (i < badDisks) {
+                    all.AddPDisk(id, false);
+                    changed.AddPDisk(id, false);
+                } else {
+                    all.AddPDisk(id);
+                }
+            }
+        }
+
+        TString issues;
+        TClusterMap::TPDiskIgnoredMap disallowed;
+
+        auto allowed = changed.GetAllowedPDisks(all, issues, disallowed);
+
+        THashMap<ui64, ui32> allowedDisksByNode;
+        THashMap<ui64, ui32> disallowedDisksByNode;
+
+        for (const auto& id : allowed) {
+            allowedDisksByNode[id.NodeId]++;
+        }
+
+        for (const auto& kv : disallowed) {
+            UNIT_ASSERT(kv.second == NKikimrCms::TPDiskInfo::TOO_MANY_FAULTY_PER_NODE);
+            disallowedDisksByNode[kv.first.NodeId]++;
+        }
+
+        for (const auto& node : nodes) {
+            const ui64 nodeId = node.second->NodeId;
+            if (badDisks <= maxFaultyDisksPerNode) {
+                UNIT_ASSERT_VALUES_EQUAL(allowedDisksByNode[nodeId], badDisks);
+                UNIT_ASSERT_VALUES_EQUAL(disallowedDisksByNode[nodeId], 0);
+            } else {
+                UNIT_ASSERT_VALUES_EQUAL(allowedDisksByNode[nodeId], 0);
+                UNIT_ASSERT_VALUES_EQUAL(disallowedDisksByNode[nodeId], badDisks);
+            }
+        }
+    }
+
+    Y_UNIT_TEST(GuardianFaultyPDisks) {
+        for (ui32 i = 0; i < 56; i++) {
+            GuardianBadPDisksByNode(2, 28, i);
+        }
+    }
+
    void GuardianRackRatio(ui16 numRacks, const TVector<ui16>& nodesPerRackVariants, ui16 numPDisks, bool anyRack) {
        for (ui16 nodesPerRack : nodesPerRackVariants) {
            auto [state, sentinelState] = MockCmsState(1, numRacks, nodesPerRack, numPDisks, false, anyRack);
@@ -532,6 +593,107 @@ Y_UNIT_TEST_SUITE(TSentinelTests) {
        }
    }

+    Y_UNIT_TEST(PDiskFaultyGuard) {
+        ui32 nodes = 2;
+        ui32 disksPerShelf = 5;
+        ui32 disksPerNode = 2 * disksPerShelf;
+
+        for (auto wholeShelfFailure : {true, false}) {
+            NKikimrCms::TCmsConfig config;
+
+            config.MutableSentinelConfig()->SetFaultyPDisksThresholdPerNode(disksPerShelf - 1);
+            TTestEnv env(nodes, disksPerNode, config);
+            env.SetLogPriority(NKikimrServices::CMS, NLog::PRI_ERROR);
+
+            for (ui32 nodeIdx = 0; nodeIdx < nodes; ++nodeIdx) {
+                ui32 badDisks = wholeShelfFailure ? disksPerShelf : disksPerShelf / 2;
+
+                for (ui32 pdiskIdx = 0; pdiskIdx < badDisks - 1; ++pdiskIdx) {
+                    const TPDiskID id = env.PDiskId(nodeIdx, pdiskIdx);
+
+                    env.SetPDiskState({id}, FaultyStates[0]);
+                }
+
+                // Next disk (last badDisk)
+                const TPDiskID id = env.PDiskId(nodeIdx, badDisks);
+
+                bool targetSeenFaulty = false;
+
+                auto observerHolder = env.AddObserver<TEvBlobStorage::TEvControllerConfigRequest>([&](TEvBlobStorage::TEvControllerConfigRequest::TPtr& event) {
+                    const auto& request = event->Get()->Record;
+                    for (const auto& command : request.GetRequest().GetCommand()) {
+                        if (command.HasUpdateDriveStatus()) {
+                            const auto& update = command.GetUpdateDriveStatus();
+                            ui32 nodeId = update.GetHostKey().GetNodeId();
+                            ui32 pdiskId = update.GetPDiskId();
+
+                            if (id.NodeId == nodeId && id.DiskId == pdiskId) {
+                                if (update.GetStatus() == NKikimrBlobStorage::EDriveStatus::FAULTY) {
+                                    targetSeenFaulty = true;
+                                }
+                            }
+                        }
+                    }
+                });
+
+                for (ui32 i = 1; i < DefaultErrorStateLimit + 1; ++i) { // More than DefaultErrorStateLimit just to be sure
+                    env.SetPDiskState({id}, FaultyStates[0]);
+                }
+
+                env.SimulateSleep(TDuration::Minutes(5));
+
+                observerHolder.Remove();
+
+                if (wholeShelfFailure) {
+                    UNIT_ASSERT_C(!targetSeenFaulty, "Faulty state should not have been sent to BS controller because whole shelf failed");
+                } else {
+                    UNIT_ASSERT_C(targetSeenFaulty, "Faulty state should have been sent to BS controller");
+                }
+            }
+        }
+    }
+
+    Y_UNIT_TEST(PDiskFaultyGuardWithForced) {
+        ui32 nodes = 2;
+        ui32 disksPerShelf = 5;
+        ui32 disksPerNode = 2 * disksPerShelf;
+
+        NKikimrCms::TCmsConfig config;
+
+        config.MutableSentinelConfig()->SetFaultyPDisksThresholdPerNode(disksPerShelf - 1);
+        TTestEnv env(nodes, disksPerNode, config);
+        env.SetLogPriority(NKikimrServices::CMS, NLog::PRI_ERROR);
+
+        std::map<ui32, std::set<ui32>> faultyDisks;
+
+        auto observerHolder = env.AddObserver<TEvBlobStorage::TEvControllerConfigRequest>([&](TEvBlobStorage::TEvControllerConfigRequest::TPtr& event) {
+            const auto& request = event->Get()->Record;
+            for (const auto& command : request.GetRequest().GetCommand()) {
+                if (command.HasUpdateDriveStatus()) {
+                    const auto& update = command.GetUpdateDriveStatus();
+                    ui32 nodeId = update.GetHostKey().GetNodeId();
+                    ui32 pdiskId = update.GetPDiskId();
+
+                    faultyDisks[nodeId].insert(pdiskId);
+                }
+            }
+        });
+
+        for (ui32 nodeIdx = 0; nodeIdx < nodes; ++nodeIdx) {
+            env.SetNodeFaulty(env.GetNodeId(nodeIdx), true);

+            env.SimulateSleep(TDuration::Minutes(5));
+        }
+
+        observerHolder.Remove();
+
+        for (ui32 nodeIdx = 0; nodeIdx < nodes; ++nodeIdx) {
+            ui32 nodeId = env.GetNodeId(nodeIdx);
+
+            UNIT_ASSERT_VALUES_EQUAL(faultyDisks[nodeId].size(), disksPerNode);
+        }
+    }
+
    Y_UNIT_TEST(BSControllerUnresponsive) {
        TTestEnv env(8, 4);
        env.EnableNoisyBSCPipe();

ydb/core/cms/sentinel_ut_helpers.h (+22, -1)

@@ -83,7 +83,7 @@ class TTestEnv: public TCmsTestEnv {
    }

public:
-    explicit TTestEnv(ui32 nodeCount, ui32 pdisks)
+    explicit TTestEnv(ui32 nodeCount, ui32 pdisks, const NKikimrCms::TCmsConfig &config = {})
        : TCmsTestEnv(nodeCount, pdisks)
    {
        SetLogPriority(NKikimrServices::CMS, NLog::PRI_DEBUG);
@@ -123,6 +123,7 @@
        });

        State = new TCmsState;
+        State->Config.Deserialize(config);
        MockClusterInfo(State->ClusterInfo);
        State->CmsActorId = GetSender();

@@ -167,6 +168,26 @@
        return info->PDisks;
    }

+    void SetNodeFaulty(ui32 nodeId, bool faulty) {
+        if (faulty) {
+            auto v = TVector<NCms::TEvSentinel::TEvUpdateHostMarkers::THostMarkers>();
+            v.push_back({
+                .NodeId = nodeId,
+                .Markers = {NKikimrCms::EMarker::MARKER_DISK_FAULTY},
+            });
+
+            Send(new IEventHandle(Sentinel, TActorId(), new TEvSentinel::TEvUpdateHostMarkers(std::move(v))));
+        } else {
+            auto v = TVector<NCms::TEvSentinel::TEvUpdateHostMarkers::THostMarkers>();
+            v.push_back({
+                .NodeId = nodeId,
+                .Markers = {},
+            });
+
+            Send(new IEventHandle(Sentinel, TActorId(), new TEvSentinel::TEvUpdateHostMarkers(std::move(v))));
+        }
+    }
+
    void SetPDiskState(const TSet<TPDiskID>& pdisks, EPDiskState state) {
        SetPDiskStateImpl(pdisks, state);

ydb/core/protos/cms.proto (+9)

@@ -452,6 +452,14 @@ message TCmsConfig {
        optional uint32 DataCenterRatio = 10 [default = 50];
        optional uint32 RoomRatio = 11 [default = 70];
        optional uint32 RackRatio = 12 [default = 90];
+       // Similar to *Ratio settings, but specified in absolute numbers and applied per storage node.
+       // This limit helps prevent cascading overreaction when many disks go offline on a single host
+       // (due to disk shelf disconnection).
+       // If the number of FAULTY PDisks on a node — including those already FAULTY or about to be marked FAULTY —
+       // exceeds this threshold, no additional disks on the same node will be marked as FAULTY,
+       // except for those explicitly marked as FAULTY by the user via the replace devices command.
+       // If set to 0, this check is disabled.
+       optional uint32 FaultyPDisksThresholdPerNode = 17 [default = 0];

        optional bool DryRun = 13;
        repeated TStateLimit StateLimits = 14;
@@ -646,6 +654,7 @@ message TPDiskInfo {
        RATIO_BY_DATACENTER = 3;
        RATIO_BY_ROOM = 4;
        RATIO_BY_RACK = 5;
+       TOO_MANY_FAULTY_PER_NODE = 6;
    }

    optional uint32 State = 1; // EPDiskState
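
As the comment above notes, the field defaults to 0, which keeps the guard off. A hedged sketch of what that means on the reading side (standard protobuf getters generated for TCmsConfig; the include path is an assumption):

    #include <ydb/core/protos/cms.pb.h> // assumed generated header for cms.proto

    NKikimrCms::TCmsConfig config; // FaultyPDisksThresholdPerNode left unset
    const auto threshold = config.GetSentinelConfig().GetFaultyPDisksThresholdPerNode();
    // threshold == 0 here, so TGuardian::GetAllowedPDisks skips the per-node BadByNode check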
