Skip to content

implementing heartbeat critical tolerance for unhealthy detection #29

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions src/Models/AppConfig.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ public static string _SPACEFX_CONFIG_DIR {

public int HEARTBEAT_PULSE_TIMING_MS { get; set; }
public int HEARTBEAT_RECEIVED_TOLERANCE_MS { get; set; }
public int HEARTBEAT_RECEIVED_CRITICAL_TOLERANCE_MS { get; set; }
public bool RESOURCE_MONITOR_ENABLED { get; set; }
public int RESOURCE_MONITOR_TIMING_MS { get; set; }
public bool RESOURCE_SCAVENGER_ENABLED { get; set; }
Expand All @@ -40,6 +41,16 @@ public APP_CONFIG() {
HEARTBEAT_PULSE_TIMING_MS = 2000;
}

// Load the critical heartbeat tolerance (in milliseconds) from configuration.
// If retrieval or parsing fails for any reason, log the error and fall back to 60000.
try {
    HEARTBEAT_RECEIVED_CRITICAL_TOLERANCE_MS = int.Parse(GetConfigSetting("heartbeatreceivedcriticaltolerancems").Result);
} catch (Exception ex) {
    Console.WriteLine("Error retrieving heartbeatreceivedcriticaltolerancems: " + ex.Message);
    Console.WriteLine("Setting default value of '60000'");
    // The default must land on the critical-tolerance property itself. Assigning it to
    // HEARTBEAT_PULSE_TIMING_MS would overwrite the pulse timing loaded above and leave
    // the critical tolerance at its int default of 0.
    HEARTBEAT_RECEIVED_CRITICAL_TOLERANCE_MS = 60000;
}



try {
HEARTBEAT_RECEIVED_TOLERANCE_MS = int.Parse(GetConfigSetting("heartbeatreceivedtolerancems").Result);
} catch (Exception ex) {
Expand Down
14 changes: 6 additions & 8 deletions src/Services/HeartbeatService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ public class HeartbeatService : IHostedService, Core.IMonitorableService {
private readonly ILogger<HeartbeatService> _logger;
private readonly TimeSpan HeartBeatPulseTiming;
private readonly TimeSpan _heartBeatHeardTolerance;
private readonly TimeSpan _heartBeatHeardCriticalTolerance;
private readonly Core.Client _client;
private readonly IServiceProvider _serviceProvider;
private readonly IHostApplicationLifetime _appLifetime;
Expand All @@ -19,9 +20,11 @@ public class HeartbeatService : IHostedService, Core.IMonitorableService {
private readonly Core.APP_CONFIG _appConfig;
private readonly DateTime _appStartTime;
public bool IsHealthy() {
if (_heartbeatsHeard.IsEmpty && DateTime.UtcNow > _appStartTime.Add(_heartBeatHeardTolerance * 2)) {
DateTime heartbeatStaleTime = DateTime.UtcNow.Subtract(_heartBeatHeardTolerance);

if (_heartbeatsHeard.Values.Where(p => p.CurrentSystemTime.ToDateTime().ToUniversalTime() >= heartbeatStaleTime).Count() == 0 && DateTime.UtcNow > _appStartTime.Add(_heartBeatHeardCriticalTolerance)) {
// Log a critical error and return a false value to indicate an unhealthy state.
_logger.LogCritical("No heartbeats have been heard in the last {tolerance}. Returning unhealthy. ", _heartBeatHeardTolerance);
_logger.LogCritical($"No heartbeats have been heard in the last {_heartBeatHeardCriticalTolerance}. Returning unhealthy. ");
return false;
}

Expand All @@ -41,6 +44,7 @@ public HeartbeatService(ILogger<HeartbeatService> logger, IServiceProvider servi

HeartBeatPulseTiming = TimeSpan.FromMilliseconds(_appConfig.HEARTBEAT_PULSE_TIMING_MS);
_heartBeatHeardTolerance = TimeSpan.FromMilliseconds(_appConfig.HEARTBEAT_RECEIVED_TOLERANCE_MS);
_heartBeatHeardCriticalTolerance = TimeSpan.FromMilliseconds(_appConfig.HEARTBEAT_RECEIVED_CRITICAL_TOLERANCE_MS);

_logger.LogInformation("Services.{serviceName} Initialized. HeartBeatPulseTiming: {pulseTiming} HeartBeatHeardTolerance: {pulseHeardTolerance} ", nameof(HeartbeatService), HeartBeatPulseTiming, _heartBeatHeardTolerance);

Expand Down Expand Up @@ -142,12 +146,6 @@ internal void RemoveStaleHeartbeatsFromCache() {
// Log successful removal of stale heartbeats.
_logger.LogTrace("All stale heartbeats successfully removed.");

// Check if the cache is empty and the current time exceeds the app start time by the tolerance period.
if (_heartbeatsHeard.IsEmpty && DateTime.UtcNow > _appStartTime.Add(_heartBeatHeardTolerance * 2)) {
// Log a critical error and throw an exception to potentially trigger a service restart.
_logger.LogCritical("No heartbeats have been heard in the last {tolerance}. Triggering an exception to restart the pod.", _heartBeatHeardTolerance);
throw new ApplicationException($"No heartbeats have been heard in the last {_heartBeatHeardTolerance}. Triggering an exception to restart the pod.");
}
} catch (Exception ex) {
// Log any exceptions that occur during the process and rethrow to handle them accordingly.
_logger.LogError(ex, "Exception while removing stale heartbeats from cache.");
Expand Down
5 changes: 0 additions & 5 deletions src/Services/LivenessCheck.cs
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,6 @@ public Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context, Canc
List<IMonitorableService> monitorableServices = _serviceProvider.GetServices<IHostedService>().Where(service => service is IMonitorableService).Cast<IMonitorableService>().ToList();


// // Add
// monitorableServices.Append(_resourceUtilizationMonitor);
// monitorableServices.Append(_heartbeatService);
// monitorableServices.Append(_pluginLoader);

// Check the core services separately
coreServiceHealthy = _messageReceiver.IsHealthy();
if (!coreServiceHealthy) {
Expand Down
Loading