Skip to content

Commit 057e7e3

Browse files
adding service monitoring health check
1 parent 17d4700 commit 057e7e3

12 files changed

+124
-91
lines changed

.devcontainer/devcontainer.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
"workspaceFolder": "/workspaces/spacesdk-core",
1010
"workspaceMount": "source=${localWorkspaceFolder},target=/workspaces/spacesdk-core,type=bind,consistency=cached",
1111
"features": {
12-
"ghcr.io/microsoft/azure-orbital-space-sdk/spacefx-dev:0.11.0": {
12+
"ghcr.io/microsoft/azure-orbital-space-sdk/spacefx-dev:0.11.0_ryan_test": {
1313
"app_name": "spacesdk-core",
1414
"app_type": "spacesdk-core",
1515
"addl_debug_shim_suffixes": "client"

src/Core.cs

+29
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,11 @@ public interface IResponseHeaderMessage {
1212
MessageFormats.Common.ResponseHeader ResponseHeader { get; }
1313
}
1414

15+
/// <summary>
/// Waits for the service identified by <paramref name="app_id"/> to come online by delegating to the provisioned client.
/// </summary>
/// <param name="app_id">App id of the service to wait for.</param>
/// <param name="timeoutMS">Maximum time to wait, in milliseconds.</param>
/// <returns>The value returned by the client's <c>WaitForServiceToComeOnline</c> call.</returns>
/// <exception cref="InvalidOperationException">Thrown when the client has not been provisioned yet.</exception>
public static bool WaitForServiceToComeOnline(string app_id, int timeoutMS = 10000) {
    // Read the static field once so the null check and the call operate on the same instance
    var client = Client._client;
    if (client == null) {
        throw new InvalidOperationException("Client is not provisioned. Please deploy the client before trying to run this");
    }

    return client.WaitForServiceToComeOnline(app_id, timeoutMS);
}
19+
1520
/// <summary>
1621
/// Enables apps to check if the client has been provisioned yet
1722
/// </summary>
@@ -309,6 +314,30 @@ public Task<string> GetConfigSetting(string configFileName) {
309314
return _service.RetrieveServiceHeartbeats();
310315
}
311316

317+
/// <summary>
/// Blocks until a heartbeat from the requested app has been heard, or the time budget is used up.
/// </summary>
/// <param name="app_id">App id to look for among the heard heartbeats (compared case-insensitively).</param>
/// <param name="timeoutMS">Total time budget in milliseconds; it covers both resolving the Heartbeat Service and waiting for the app.</param>
/// <returns>True when a heartbeat with a matching app id is present at the final check; otherwise false.</returns>
/// <exception cref="TimeoutException">Thrown when the Heartbeat Service could not be resolved before the deadline.</exception>
public bool WaitForServiceToComeOnline(string app_id, int timeoutMS = 10000) {
    Services.HeartbeatService? heartbeatService = Client._client._serviceProvider.GetService<Services.HeartbeatService>();
    DateTime deadline = DateTime.UtcNow + TimeSpan.FromMilliseconds(timeoutMS);

    // True when any retrieved heartbeat carries the requested app id
    bool IsAppOnline(Services.HeartbeatService source) =>
        source.RetrieveServiceHeartbeats().Any(pulse => pulse.AppId.Equals(app_id, StringComparison.InvariantCultureIgnoreCase));

    // Phase 1: keep asking the service provider until the Heartbeat Service resolves
    for (; heartbeatService is null; heartbeatService = Client._client._serviceProvider.GetService<Services.HeartbeatService>()) {
        if (DateTime.UtcNow > deadline) {
            throw new TimeoutException("Timed out waiting for Heartbeat Service to come online.");
        }

        Task.Delay(ClientDelayMS).Wait();
    }

    // Phase 2: poll the heard heartbeats until the app shows up or the deadline passes
    while (!IsAppOnline(heartbeatService)) {
        if (DateTime.UtcNow > deadline) {
            break;
        }

        Task.Delay(ClientDelayMS).Wait();
    }

    // One last look so the result reflects the state at return time
    return IsAppOnline(heartbeatService);
}
340+
312341
/// <summary>
313342
/// Send a telemetry metric
314343
/// </summary>

src/Extensions/CoreServiceCollectionExtensions.cs

+1-2
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,10 @@ public static IServiceCollection AddAzureOrbitalFramework(this IServiceCollectio
66
services.AddOptions();
77
services.AddSingleton<Microsoft.Azure.SpaceFx.Core.Client>();
88
services.AddSingleton<Microsoft.Azure.SpaceFx.Core.Services.MessageReceiver>();
9-
services.AddSingleton<Microsoft.Azure.SpaceFx.Core.Services.HealthCheckService>();
109
services.AddSingleton<Microsoft.Azure.SpaceFx.Core.Services.HeartbeatService>();
1110
services.AddSingleton<Microsoft.Azure.SpaceFx.Core.Services.ResourceUtilizationMonitor>();
1211
services.AddSingleton<Microsoft.Azure.SpaceFx.Core.Services.PluginLoader>();
13-
12+
services.AddGrpcHealthChecks().AddCheck<Microsoft.Azure.SpaceFx.Core.Services.LivenessCheck>("SDKHealthCheck");
1413
services.AddHttpClient<HttpClient>().ConfigurePrimaryHttpMessageHandler(() => new HttpClientHandler {
1514
CheckCertificateRevocationList = true
1615
});

src/Services/HealthCheckService.cs

-74
This file was deleted.

src/Services/HeartbeatService.cs

+8-7
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,14 @@ public class HeartbeatService : IHostedService, Core.IMonitorableService {
1919
private readonly Core.APP_CONFIG _appConfig;
2020
private readonly DateTime _appStartTime;
2121
/// <summary>
/// Reports whether the heartbeat service considers itself healthy.
/// </summary>
/// <returns>
/// False when no heartbeats have been heard and more than twice the heartbeat tolerance has elapsed since the app start time; otherwise true.
/// </returns>
public bool IsHealthy() {
    // The start-up allowance (2x tolerance) keeps a freshly started app from being flagged before any heartbeat could arrive.
    // This must not be hard-coded to false: the liveness check stops the application whenever a service reports unhealthy.
    if (_heartbeatsHeard.IsEmpty && DateTime.UtcNow > _appStartTime.Add(_heartBeatHeardTolerance * 2)) {
        // Log a critical error and return a false value to indicate an unhealthy state.
        _logger.LogCritical("No heartbeats have been heard in the last {tolerance}. Returning unhealthy. ", _heartBeatHeardTolerance);
        return false;
    }

    return true;
}
3031

3132
public HeartbeatService(ILogger<HeartbeatService> logger, IServiceProvider serviceProvider, Core.Client client, IHostApplicationLifetime appLifetime) {

src/Services/LivenessCheck.cs

+65
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
2+
using Microsoft.Extensions.Diagnostics.HealthChecks;
3+
4+
namespace Microsoft.Azure.SpaceFx;
5+
6+
public partial class Core {
    public partial class Services {
        /// <summary>
        /// Health check that aggregates the health of the SDK's services.
        /// It polls every resolvable <see cref="IMonitorableService"/> plus the heartbeat service, message receiver
        /// and resource utilization monitor, and triggers an application stop when any of them reports unhealthy.
        /// </summary>
        public class LivenessCheck : IHealthCheck {
            private readonly ILogger<LivenessCheck> _logger;
            private readonly IServiceProvider _serviceProvider;
            private readonly IHostApplicationLifetime _appLifetime;
            private readonly Services.ResourceUtilizationMonitor _resourceUtilizationMonitor;
            private readonly Services.MessageReceiver _messageReceiver;
            private readonly Services.HeartbeatService _heartbeatService;

            /// <summary>
            /// Creates the liveness check with the services it inspects.
            /// </summary>
            /// <param name="logger">Logger used to report health results.</param>
            /// <param name="serviceProvider">Provider used to resolve registered <see cref="IMonitorableService"/> instances.</param>
            /// <param name="appLifetime">Application lifetime used to request an application stop on failure.</param>
            /// <param name="messageReceiver">Message receiver whose health is checked.</param>
            /// <param name="heartbeatService">Heartbeat service whose health is checked.</param>
            /// <param name="resourceUtilizationMonitor">Resource utilization monitor whose health is checked.</param>
            public LivenessCheck(ILogger<LivenessCheck> logger, IServiceProvider serviceProvider, IHostApplicationLifetime appLifetime, Services.MessageReceiver messageReceiver, Services.HeartbeatService heartbeatService, Services.ResourceUtilizationMonitor resourceUtilizationMonitor) {
                _logger = logger;
                _serviceProvider = serviceProvider;
                _appLifetime = appLifetime;
                _messageReceiver = messageReceiver;
                _resourceUtilizationMonitor = resourceUtilizationMonitor;
                _heartbeatService = heartbeatService;

                _logger.LogInformation("Services.{serviceName} Initialized.", nameof(LivenessCheck));
            }

            /// <summary>
            /// Checks every monitored service and reports the aggregate result.
            /// </summary>
            /// <param name="context">Health check context supplied by the health check framework (not used).</param>
            /// <param name="cancellationToken">Cancellation token supplied by the health check framework (not used; the check is synchronous).</param>
            /// <returns>
            /// A completed task holding an unhealthy result naming the failing services, or a healthy result when all services report healthy.
            /// </returns>
            public Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context, CancellationToken cancellationToken = default) {
                // Scan all the services utilizing the IMonitorableService interface and check if they are healthy
                List<string> unhealthyServices = _serviceProvider.GetServices<IMonitorableService>()
                    .Where(service => !service.IsHealthy())
                    .Select(service => service.GetType().Name)
                    .ToList();

                // These services are injected directly, so check them explicitly as well
                if (!_heartbeatService.IsHealthy()) {
                    unhealthyServices.Add(_heartbeatService.GetType().Name);
                }

                if (!_messageReceiver.IsHealthy()) {
                    unhealthyServices.Add(_messageReceiver.GetType().Name);
                }

                if (!_resourceUtilizationMonitor.IsHealthy()) {
                    unhealthyServices.Add(_resourceUtilizationMonitor.GetType().Name);
                }

                if (unhealthyServices.Count > 0) {
                    // A service found through both paths above must only be reported once
                    string unhealthyServicesOutput = string.Join(",", unhealthyServices.Distinct());

                    _logger.LogCritical("Unhealthy services detected. Services reporting unhealthy: {unhealthyServices}", unhealthyServicesOutput);
                    _logger.LogCritical("Triggering application stop.");
                    _appLifetime.StopApplication();

                    return Task.FromResult(HealthCheckResult.Unhealthy($"Unhealthy services detected. Services reporting unhealthy: {unhealthyServicesOutput}"));
                }

                _logger.LogDebug("All services report healthy.");
                return Task.FromResult(HealthCheckResult.Healthy("Health check passed. All services report healthy."));
            }
        }
    }
}

src/spacesdk-core.csproj

+3-1
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,11 @@
2121
</ItemGroup>
2222
<ItemGroup>
2323
<PackageReference Include="Google.Api.CommonProtos" Version="2.11.0" />
24-
<PackageReference Include="Google.Protobuf" Version="3.24.4" />
24+
<PackageReference Include="Google.Protobuf" Version="3.27.0" />
2525
<PackageReference Include="Grpc.AspNetCore" Version="2.57.0" />
26+
<PackageReference Include="Grpc.AspNetCore.HealthChecks" Version="2.65.0" />
2627
<PackageReference Include="Grpc.Net.Client" Version="2.61.0" />
28+
<PackageReference Include="Microsoft.AspNetCore.Diagnostics.HealthChecks" Version="2.2.0" />
2729
<PackageReference Include="Microsoft.Extensions.Http" Version="7.0.0" />
2830
<PackageReference Include="OpenTelemetry" Version="1.5.1" />
2931
<PackageReference Include="OpenTelemetry.Exporter.Console" Version="1.5.1" />

test/debugHost/Program.cs

+9-1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,14 @@ public static void Main(string[] args) {
1717
services.AddAzureOrbitalFramework();
1818
services.AddHostedService<Worker>();
1919
services.AddHostedService<WorkerBravo>();
20+
21+
services.AddSingleton<Worker>();
22+
services.AddHostedService<Worker>(p => p.GetRequiredService<Worker>());
23+
24+
services.AddSingleton<WorkerBravo>();
25+
services.AddHostedService<WorkerBravo>(p => p.GetRequiredService<WorkerBravo>());
26+
27+
2028
services.AddSingleton<Microsoft.Azure.SpaceFx.Core.IMessageHandler<Microsoft.Azure.SpaceFx.MessageFormats.Testing.SimpleMessage>, MessageHandler<Microsoft.Azure.SpaceFx.MessageFormats.Testing.SimpleMessage>>();
2129
services.AddSingleton(plugins);
2230
services.AddSingleton<Utils.PluginDelegates>();
@@ -31,10 +39,10 @@ public static void Main(string[] args) {
3139
app.UseRouting();
3240
app.UseEndpoints(endpoints => {
3341
endpoints.MapGrpcService<Microsoft.Azure.SpaceFx.Core.Services.MessageReceiver>();
34-
endpoints.MapGrpcService<Microsoft.Azure.SpaceFx.Core.Services.HealthCheckService>();
3542
endpoints.MapGet("/", async context => {
3643
await context.Response.WriteAsync("Communication with gRPC endpoints must be made through a gRPC client. To learn how to create a client, visit: https://go.microsoft.com/fwlink/?linkid=2086909");
3744
});
45+
endpoints.MapGrpcHealthChecksService();
3846
});
3947

4048
// Add a middleware to catch exceptions and stop the host gracefully

test/integrationTestHost/Program.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ public static void Main(string[] args) {
2121
app.UseRouting();
2222
app.UseEndpoints(endpoints => {
2323
endpoints.MapGrpcService<Core.Services.MessageReceiver>();
24-
endpoints.MapGrpcService<Core.Services.HealthCheckService>();
24+
endpoints.MapGrpcHealthChecksService();
2525
endpoints.MapGet("/", async context => {
2626
await context.Response.WriteAsync("Communication with gRPC endpoints must be made through a gRPC client. To learn how to create a client, visit: https://go.microsoft.com/fwlink/?linkid=2086909");
2727
});

test/integrationTests/TestSharedContext.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ public TestSharedContext() {
3939
_grpcHost.UseRouting();
4040
_grpcHost.UseEndpoints(endpoints => {
4141
endpoints.MapGrpcService<Core.Services.MessageReceiver>();
42-
endpoints.MapGrpcService<Core.Services.HealthCheckService>();
42+
endpoints.MapGrpcHealthChecksService();
4343
endpoints.MapGet("/", async context => {
4444
await context.Response.WriteAsync("Communication with gRPC endpoints must be made through a gRPC client. To learn how to create a client, visit: https://go.microsoft.com/fwlink/?linkid=2086909");
4545
});

test/integrationTests/appsettings.json

+4-2
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,10 @@
77
"System.Net.Http": "Error",
88
"Microsoft.Azure.SpaceFx": "Debug",
99
"Microsoft.Azure.SpaceFx.Core.Services.MessageReceiver": "Error",
10-
"Microsoft.Azure.SpaceFx.Core.Services.HeartbeatService": "Error",
11-
"Microsoft.Azure.SpaceFx.Core.Services.ResourceUtilizationMonitor": "Error"
10+
"Microsoft.Azure.SpaceFx.Core.Services.HeartbeatService": "Debug",
11+
"Microsoft.Azure.SpaceFx.Core.Services.ResourceUtilizationMonitor": "Error",
12+
"Microsoft.Azure.SpaceFx.Core.Services.PluginLoader": "Error",
13+
"Microsoft.Azure.SpaceFx.Core.Services.HealthCheckService": "Error"
1214
},
1315
"Console": {
1416
"TimestampFormat": "[yyyy-MM-dd HH:mm:ss] "

test/integrationTests/integrationTests.csproj

+2-1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
</PackageReference>
1616
<PackageReference Include="Grpc.AspNetCore" Version="2.57.0" />
1717
<PackageReference Include="JunitXml.TestLogger" Version="3.1.12" />
18+
<PackageReference Include="Microsoft.AspNetCore.Mvc.Testing" Version="6.0.0" />
1819
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.7.2" />
1920
<PackageReference Include="xunit" Version="2.5.1" />
2021
<PackageReference Include="xunit.runner.visualstudio" Version="2.5.1">
@@ -26,6 +27,6 @@
2627
<ProjectReference Include="/workspaces/spacesdk-core/src/spacesdk-core.csproj" />
2728
</ItemGroup>
2829
<ItemGroup>
29-
<Protobuf Include="/workspaces/spacesdk-core/test/protos/Testing.proto" GrpcServices="Both" Access="Public" ProtoCompile="True" CompileOutputs="True" ProtoRoot="/workspaces/spacesdk-core" OutputDir="obj/Debug/net6.0/" AdditionalImportDirs="/var/spacedev/protos/" ></Protobuf>
30+
<Protobuf Include="/workspaces/spacesdk-core/test/protos/Testing.proto" GrpcServices="Both" Access="Public" ProtoCompile="True" CompileOutputs="True" ProtoRoot="/workspaces/spacesdk-core" OutputDir="obj/Debug/net6.0/" AdditionalImportDirs="/var/spacedev/protos/"></Protobuf>
3031
</ItemGroup>
3132
</Project>

0 commit comments

Comments
 (0)