Skip to content

Commit edbfee7

Browse files
authored
EBS Metrics (#151)
* Start EBS * save work * Save Work * Fix Release Builds * Misc Improvements * Fix Naming
1 parent 1969922 commit edbfee7

File tree

9 files changed

+402
-3
lines changed

9 files changed

+402
-3
lines changed

AtlasAgent/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ target_link_libraries(atlas_system_agent
1616
nvml
1717
proc
1818
disk
19+
ebs
1920
ethtool
2021
ntp
2122
perfmetrics

AtlasAgent/src/atlas-agent.cpp

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include <lib/collectors/cpu_freq/src/cpufreq.h>
77
#include <lib/collectors/dcgm/src/dcgm_stats.h>
88
#include <lib/collectors/disk/src/disk.h>
9+
#include <lib/collectors/ebs/src/ebs.h>
910
#include <lib/collectors/ethtool/src/ethtool.h>
1011
#include <lib/collectors/nvml/src/gpumetrics.h>
1112
#include <lib/collectors/ntp/src/ntp.h>
@@ -262,7 +263,7 @@ void collect_system_metrics(TaggingRegistry* registry, std::unique_ptr<atlasagen
262263
gpuDCGM.emplace(registry);
263264
}
264265

265-
// TODO: DCGM & ServiceMonitor have Dynamic metric collection. During each iteration we have to
266+
// TODO: DCGM, EBS, and ServiceMonitor have Dynamic metric collection. During each iteration we have to
266267
// check if these optionals have a set value. lets improve how we handle this
267268

268269
// Create a ServiceMonitor object to monitor Systemd services if any configs are valid
@@ -275,6 +276,16 @@ void collect_system_metrics(TaggingRegistry* registry, std::unique_ptr<atlasagen
275276
Logger()->info("Service Monitoring is disabled.");
276277
}
277278

279+
// Create an EBS collector object to monitor EBS devices if any configs are valid
280+
std::optional<EBSCollector<TaggingRegistry> > ebsMetrics{};
281+
std::optional<std::unordered_set<std::string> > ebsConfig{parse_ebs_config_directory(EBSConstants::ConfigPath)};
282+
if (ebsConfig.has_value()) {
283+
ebsMetrics.emplace(registry, ebsConfig.value());
284+
}
285+
else{
286+
Logger()->info("EBS Monitoring is disabled.");
287+
}
288+
278289
if (gpuDCGM.has_value()) {
279290
std::string serviceStatus = atlasagent::is_service_running(DCGMConstants::ServiceName) ? "ON" : "OFF";
280291
Logger()->info("DCGMI binary present. Agent will collect DCGM metrics if service is ON. DCGM service state: {}.", serviceStatus);
@@ -316,6 +327,10 @@ void collect_system_metrics(TaggingRegistry* registry, std::unique_ptr<atlasagen
316327
Logger()->error("Failed to gather DCGM metrics");
317328
}
318329
}
330+
331+
if (ebsMetrics.has_value() && ebsMetrics.value().gather_metrics() == false) {
332+
Logger()->error("Failed to gather EBS metrics");
333+
}
319334

320335
if (serviceMetrics.has_value() && serviceMetrics.value().gather_metrics() == false) {
321336
Logger()->error("Failed to gather Service metrics");

lib/collectors/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ add_subdirectory(cgroup)
33
add_subdirectory(cpu_freq)
44
add_subdirectory(dcgm)
55
add_subdirectory(disk)
6+
add_subdirectory(ebs)
67
add_subdirectory(ethtool)
78
add_subdirectory(ntp)
89
add_subdirectory(nvml)

lib/collectors/ebs/CMakeLists.txt

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Library target containing the EBS collector sources.
add_library(ebs src/ebs.cpp src/ebs.h)

# Expose the repository root so consumers can include headers by their full <lib/...> path.
target_include_directories(ebs PUBLIC ${CMAKE_SOURCE_DIR})

# Libraries the EBS collector links against.
target_link_libraries(ebs fmt::fmt abseil::abseil spectator tagging)

lib/collectors/ebs/src/ebs.cpp

Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
#include "ebs.h"
2+
3+
#include <lib/util/src/util.h>
4+
5+
#include <fcntl.h>
6+
#include <filesystem>
7+
#include <regex>
8+
#include <sys/ioctl.h>
9+
#include <unistd.h>
10+
11+
/// String and numeric constants used by this file when publishing EBS statistics.
struct EBSMetricConstants {
  // Operation types
  static constexpr const char* ReadOp = "read";
  static constexpr const char* WriteOp = "write";

  // Device types
  static constexpr const char* Volume = "volume";
  static constexpr const char* Instance = "instance";

  // Metric names
  static constexpr const char* ebsOperations = "aws.ebs.totalOperations";
  static constexpr const char* ebsBytes = "aws.ebs.totalBytes";
  static constexpr const char* ebsTime = "aws.ebs.totalTime";
  static constexpr const char* ebsIOPS = "aws.ebs.perfExceededIOPS";
  static constexpr const char* ebsTP = "aws.ebs.perfExceededTput";
  static constexpr const char* ebsQueueLength = "aws.ebs.volumeQueueLength";
  static constexpr const char* ebsHistogram = "aws.ebs.ioLatencyHistogram";

  // Multiplier that turns a microsecond count into seconds
  static constexpr double ebsMicrosecondsToSeconds = 0.000001;
};
// Short alias used throughout this file
using EBSMC = EBSMetricConstants;
33+
34+
template class EBSCollector<atlasagent::TaggingRegistry>;
35+
36+
std::optional<std::vector<std::string>> ebs_parse_regex_config_file(const char* configFilePath) try {
37+
// Read the all the device paths in the config file
38+
std::optional<std::vector<std::string>> configContents = atlasagent::read_file(configFilePath);
39+
if (configContents.has_value() == false) {
40+
atlasagent::Logger()->error("Error reading config file {}", configFilePath);
41+
return std::nullopt;
42+
}
43+
44+
// Skip empty files
45+
if (configContents.value().empty()) {
46+
atlasagent::Logger()->debug("Empty config file {}", configFilePath);
47+
return std::nullopt;
48+
}
49+
50+
// Read all the device paths and assert the device exists
51+
std::vector<std::string> devicePaths{};
52+
for (const auto& device : configContents.value()) {
53+
if (std::filesystem::exists(device) == false) {
54+
atlasagent::Logger()->error("Device path: {} not valid in config file {}", device, configFilePath);
55+
return std::nullopt;
56+
}
57+
devicePaths.emplace_back(device);
58+
}
59+
return devicePaths;
60+
} catch (const std::exception& e) {
61+
atlasagent::Logger()->error("Exception: {} in parse_regex_config_file", e.what());
62+
return std::nullopt;
63+
}
64+
65+
std::optional<std::unordered_set<std::string>> parse_ebs_config_directory(const char* directoryPath) try {
66+
// Check if the directory exists and is a directory
67+
if (std::filesystem::exists(directoryPath) == false || std::filesystem::is_directory(directoryPath) == false) {
68+
atlasagent::Logger()->error("Invalid service ebs config directory {}", directoryPath);
69+
return std::nullopt;
70+
}
71+
72+
std::regex configFileExtPattern(EBSConstants::ConfigFileExtPattern);
73+
std::unordered_set<std::string> allDevices{};
74+
75+
// Iterate through all files in the config directory, but do not process them if they do not match the service
76+
// monitoring config regex pattern ".ebs-devices"
77+
for (const auto& file : std::filesystem::recursive_directory_iterator(directoryPath)) {
78+
if (std::regex_match(file.path().filename().string(), configFileExtPattern) == false) {
79+
continue;
80+
}
81+
82+
// Read all the devices listed in the config file
83+
auto devicePaths = ebs_parse_regex_config_file(file.path().c_str());
84+
if (devicePaths.has_value() == false) {
85+
atlasagent::Logger()->error("Could not add devices from config file {}", file.path().c_str());
86+
continue;
87+
}
88+
89+
// Insert each device path individually into the set
90+
for (const auto& devicePath : devicePaths.value()) {
91+
allDevices.insert(devicePath);
92+
}
93+
}
94+
95+
// If no devices are to be monitored, log the error and return nullopt
96+
if (allDevices.empty()) {
97+
atlasagent::Logger()->info("No ebs regex patterns found in directory {}", directoryPath);
98+
return std::nullopt;
99+
}
100+
101+
return allDevices;
102+
} catch (const std::exception& e) {
103+
atlasagent::Logger()->error("Exception: {} in parse_service_monitor_config_directory", e.what());
104+
return std::nullopt;
105+
}
106+
107+
template <typename Reg>
108+
EBSCollector<Reg>::EBSCollector(Reg* registry, const std::unordered_set<std::string>& config)
109+
: config{config},
110+
registry_{registry} {}
111+
112+
template <typename Reg>
113+
bool EBSCollector<Reg>::query_stats_from_device(const std::string& device, nvme_get_amzn_stats_logpage& stats) try {
114+
nvme_admin_command admin_cmd = {};
115+
admin_cmd.opcode = NVMeCommands::GetLogPage;
116+
admin_cmd.addr = (uint64_t)&stats;
117+
admin_cmd.alen = sizeof(stats);
118+
admin_cmd.nsid = 1;
119+
admin_cmd.cdw10 = NVMeCommands::StatsLogPageId | (1024 << 16);
120+
121+
int fd = open(device.c_str(), O_RDONLY);
122+
if (fd == -1) {
123+
std::error_code ec(errno, std::system_category());
124+
atlasagent::Logger()->error("Failed to open device {}: {}", device, ec.message());
125+
return false;
126+
}
127+
128+
int ret = ioctl(fd, NVMeCommands::AdminCommand, &admin_cmd);
129+
close(fd);
130+
131+
if (ret < 0) {
132+
std::error_code ec(errno, std::system_category());
133+
atlasagent::Logger()->error("Failed to call ioctl on device {}: {}", device, ec.message());
134+
return false;
135+
}
136+
137+
if (stats._magic != NVMeCommands::StatsMagic) {
138+
atlasagent::Logger()->error("Not an EBS device: {}", device);
139+
return false;
140+
}
141+
142+
return true;
143+
} catch (const std::exception& e) {
144+
atlasagent::Logger()->error("Exception: {} in parse_service_monitor_config_directory", e.what());
145+
return false;
146+
}
147+
148+
template <typename Reg>
149+
bool EBSCollector<Reg>::handle_histogram(const ebs_nvme_histogram& histogram, const std::string& devicePath, const std::string& type) {
150+
if (histogram.num_bins > AtlasNamingConvention.size()) {
151+
atlasagent::Logger()->error("Histogram has more bins than expected: {} > {}", histogram.num_bins, AtlasNamingConvention.size());
152+
return false;
153+
}
154+
for (uint64_t i = 0; i < histogram.num_bins; i++) {
155+
ebsHistogram(registry_, EBSMC::ebsHistogram, devicePath, type, AtlasNamingConvention.at(i))->Set(histogram.bins[i].count);
156+
}
157+
return true;
158+
}
159+
160+
template <class Reg>
161+
bool EBSCollector<Reg>::update_metrics(const std::string &devicePath, const nvme_get_amzn_stats_logpage &stats) {
162+
if (this->registry_ == nullptr) {
163+
return false;
164+
}
165+
166+
ebsMonocounter(registry_, EBSMC::ebsOperations, devicePath, EBSMC::ReadOp)->Set(stats.total_read_ops);
167+
ebsMonocounter(registry_, EBSMC::ebsOperations, devicePath, EBSMC::WriteOp)->Set(stats.total_write_ops);
168+
169+
ebsMonocounter(registry_, EBSMC::ebsBytes, devicePath, EBSMC::ReadOp)->Set(stats.total_read_bytes);
170+
ebsMonocounter(registry_, EBSMC::ebsBytes, devicePath, EBSMC::WriteOp)->Set(stats.total_write_bytes);
171+
172+
ebsMonocounter(registry_, EBSMC::ebsTime, devicePath, EBSMC::ReadOp)->Set(stats.total_read_time * EBSMC::ebsMicrosecondsToSeconds);
173+
ebsMonocounter(registry_, EBSMC::ebsTime, devicePath, EBSMC::WriteOp)->Set(stats.total_write_time * EBSMC::ebsMicrosecondsToSeconds);
174+
175+
ebsMonocounter(registry_, EBSMC::ebsIOPS, devicePath, EBSMC::Volume)->Set(stats.ebs_volume_performance_exceeded_iops * EBSMC::ebsMicrosecondsToSeconds);
176+
ebsMonocounter(registry_, EBSMC::ebsIOPS, devicePath, EBSMC::Instance)->Set(stats.ec2_instance_ebs_performance_exceeded_iops * EBSMC::ebsMicrosecondsToSeconds);
177+
178+
ebsMonocounter(registry_, EBSMC::ebsTP, devicePath, EBSMC::Volume)->Set(stats.ebs_volume_performance_exceeded_tp * EBSMC::ebsMicrosecondsToSeconds);
179+
ebsMonocounter(registry_, EBSMC::ebsTP, devicePath, EBSMC::Instance)->Set(stats.ec2_instance_ebs_performance_exceeded_tp * EBSMC::ebsMicrosecondsToSeconds);
180+
181+
ebsGauge(registry_, EBSMC::ebsQueueLength, devicePath)->Set(stats.volume_queue_length);
182+
183+
bool success {true};
184+
if (false == handle_histogram(stats.read_io_latency_histogram, devicePath, EBSMC::ReadOp)) {
185+
atlasagent::Logger()->error("Failed to handle read histogram for device {}", devicePath);
186+
success = false;
187+
}
188+
189+
if (false == handle_histogram(stats.write_io_latency_histogram, devicePath, EBSMC::WriteOp)) {
190+
atlasagent::Logger()->error("Failed to handle write histogram for device {}", devicePath);
191+
success = false;
192+
}
193+
194+
return success;
195+
}
196+
197+
template <typename Reg>
198+
bool EBSCollector<Reg>::gather_metrics() {
199+
bool success{true};
200+
// Iterate through all the devices in the config
201+
for (const auto& device : config) {
202+
// Gather statistics for each device
203+
nvme_get_amzn_stats_logpage stats {};
204+
if (false == query_stats_from_device(device, stats)) {
205+
atlasagent::Logger()->error("Failed to query stats from device {}", device);
206+
success = false;
207+
continue;
208+
}
209+
// Push the metrics to spectatorD
210+
if (update_metrics(device, stats) == false) {
211+
atlasagent::Logger()->error("Failed to update metrics for device {}", device);
212+
success = false;
213+
}
214+
}
215+
return success;
216+
}

0 commit comments

Comments
 (0)