1
+ #include " ebs.h"
2
+
3
+ #include < lib/util/src/util.h>
4
+
5
+ #include < fcntl.h>
6
+ #include < filesystem>
7
+ #include < regex>
8
+ #include < sys/ioctl.h>
9
+ #include < unistd.h>
10
+
11
/// Compile-time constants shared by the EBS collector in this file: the
/// operation/device strings and metric names handed to the metric helper
/// functions, plus the factor used to scale microsecond counters.
struct EBSMetricConstants {
  // Operation types
  static constexpr const char* ReadOp = " read";
  static constexpr const char* WriteOp = " write";

  // Device types
  static constexpr const char* Volume = " volume";
  static constexpr const char* Instance = " instance";

  // Metric names
  static constexpr const char* ebsOperations = " aws.ebs.totalOperations";
  static constexpr const char* ebsBytes = " aws.ebs.totalBytes";
  static constexpr const char* ebsTime = " aws.ebs.totalTime";
  static constexpr const char* ebsIOPS = " aws.ebs.perfExceededIOPS";
  static constexpr const char* ebsTP = " aws.ebs.perfExceededTput";
  static constexpr const char* ebsQueueLength = " aws.ebs.volumeQueueLength";
  static constexpr const char* ebsHistogram = " aws.ebs.ioLatencyHistogram";

  // Multiplying a microsecond quantity by this factor yields seconds
  static constexpr double ebsMicrosecondsToSeconds = 0.000001;
};

// Short alias used throughout the rest of this file
using EBSMC = EBSMetricConstants;
33
+
34
+ template class EBSCollector <atlasagent::TaggingRegistry>;
35
+
36
+ std::optional<std::vector<std::string>> ebs_parse_regex_config_file (const char * configFilePath) try {
37
+ // Read the all the device paths in the config file
38
+ std::optional<std::vector<std::string>> configContents = atlasagent::read_file (configFilePath);
39
+ if (configContents.has_value () == false ) {
40
+ atlasagent::Logger ()->error (" Error reading config file {}" , configFilePath);
41
+ return std::nullopt;
42
+ }
43
+
44
+ // Skip empty files
45
+ if (configContents.value ().empty ()) {
46
+ atlasagent::Logger ()->debug (" Empty config file {}" , configFilePath);
47
+ return std::nullopt;
48
+ }
49
+
50
+ // Read all the device paths and assert the device exists
51
+ std::vector<std::string> devicePaths{};
52
+ for (const auto & device : configContents.value ()) {
53
+ if (std::filesystem::exists (device) == false ) {
54
+ atlasagent::Logger ()->error (" Device path: {} not valid in config file {}" , device, configFilePath);
55
+ return std::nullopt;
56
+ }
57
+ devicePaths.emplace_back (device);
58
+ }
59
+ return devicePaths;
60
+ } catch (const std::exception & e) {
61
+ atlasagent::Logger ()->error (" Exception: {} in parse_regex_config_file" , e.what ());
62
+ return std::nullopt;
63
+ }
64
+
65
+ std::optional<std::unordered_set<std::string>> parse_ebs_config_directory (const char * directoryPath) try {
66
+ // Check if the directory exists and is a directory
67
+ if (std::filesystem::exists (directoryPath) == false || std::filesystem::is_directory (directoryPath) == false ) {
68
+ atlasagent::Logger ()->error (" Invalid service ebs config directory {}" , directoryPath);
69
+ return std::nullopt;
70
+ }
71
+
72
+ std::regex configFileExtPattern (EBSConstants::ConfigFileExtPattern);
73
+ std::unordered_set<std::string> allDevices{};
74
+
75
+ // Iterate through all files in the config directory, but do not process them if they do not match the service
76
+ // monitoring config regex pattern ".ebs-devices"
77
+ for (const auto & file : std::filesystem::recursive_directory_iterator (directoryPath)) {
78
+ if (std::regex_match (file.path ().filename ().string (), configFileExtPattern) == false ) {
79
+ continue ;
80
+ }
81
+
82
+ // Read all the devices listed in the config file
83
+ auto devicePaths = ebs_parse_regex_config_file (file.path ().c_str ());
84
+ if (devicePaths.has_value () == false ) {
85
+ atlasagent::Logger ()->error (" Could not add devices from config file {}" , file.path ().c_str ());
86
+ continue ;
87
+ }
88
+
89
+ // Insert each device path individually into the set
90
+ for (const auto & devicePath : devicePaths.value ()) {
91
+ allDevices.insert (devicePath);
92
+ }
93
+ }
94
+
95
+ // If no devices are to be monitored, log the error and return nullopt
96
+ if (allDevices.empty ()) {
97
+ atlasagent::Logger ()->info (" No ebs regex patterns found in directory {}" , directoryPath);
98
+ return std::nullopt;
99
+ }
100
+
101
+ return allDevices;
102
+ } catch (const std::exception & e) {
103
+ atlasagent::Logger ()->error (" Exception: {} in parse_service_monitor_config_directory" , e.what ());
104
+ return std::nullopt;
105
+ }
106
+
107
+ template <typename Reg>
108
+ EBSCollector<Reg>::EBSCollector(Reg* registry, const std::unordered_set<std::string>& config)
109
+ : config{config},
110
+ registry_{registry} {}
111
+
112
+ template <typename Reg>
113
+ bool EBSCollector<Reg>::query_stats_from_device(const std::string& device, nvme_get_amzn_stats_logpage& stats) try {
114
+ nvme_admin_command admin_cmd = {};
115
+ admin_cmd.opcode = NVMeCommands::GetLogPage;
116
+ admin_cmd.addr = (uint64_t )&stats;
117
+ admin_cmd.alen = sizeof (stats);
118
+ admin_cmd.nsid = 1 ;
119
+ admin_cmd.cdw10 = NVMeCommands::StatsLogPageId | (1024 << 16 );
120
+
121
+ int fd = open (device.c_str (), O_RDONLY);
122
+ if (fd == -1 ) {
123
+ std::error_code ec (errno, std::system_category ());
124
+ atlasagent::Logger ()->error (" Failed to open device {}: {}" , device, ec.message ());
125
+ return false ;
126
+ }
127
+
128
+ int ret = ioctl (fd, NVMeCommands::AdminCommand, &admin_cmd);
129
+ close (fd);
130
+
131
+ if (ret < 0 ) {
132
+ std::error_code ec (errno, std::system_category ());
133
+ atlasagent::Logger ()->error (" Failed to call ioctl on device {}: {}" , device, ec.message ());
134
+ return false ;
135
+ }
136
+
137
+ if (stats._magic != NVMeCommands::StatsMagic) {
138
+ atlasagent::Logger ()->error (" Not an EBS device: {}" , device);
139
+ return false ;
140
+ }
141
+
142
+ return true ;
143
+ } catch (const std::exception & e) {
144
+ atlasagent::Logger ()->error (" Exception: {} in parse_service_monitor_config_directory" , e.what ());
145
+ return false ;
146
+ }
147
+
148
+ template <typename Reg>
149
+ bool EBSCollector<Reg>::handle_histogram(const ebs_nvme_histogram& histogram, const std::string& devicePath, const std::string& type) {
150
+ if (histogram.num_bins > AtlasNamingConvention.size ()) {
151
+ atlasagent::Logger ()->error (" Histogram has more bins than expected: {} > {}" , histogram.num_bins , AtlasNamingConvention.size ());
152
+ return false ;
153
+ }
154
+ for (uint64_t i = 0 ; i < histogram.num_bins ; i++) {
155
+ ebsHistogram (registry_, EBSMC::ebsHistogram, devicePath, type, AtlasNamingConvention.at (i))->Set (histogram.bins [i].count );
156
+ }
157
+ return true ;
158
+ }
159
+
160
+ template <class Reg >
161
+ bool EBSCollector<Reg>::update_metrics(const std::string &devicePath, const nvme_get_amzn_stats_logpage &stats) {
162
+ if (this ->registry_ == nullptr ) {
163
+ return false ;
164
+ }
165
+
166
+ ebsMonocounter (registry_, EBSMC::ebsOperations, devicePath, EBSMC::ReadOp)->Set (stats.total_read_ops );
167
+ ebsMonocounter (registry_, EBSMC::ebsOperations, devicePath, EBSMC::WriteOp)->Set (stats.total_write_ops );
168
+
169
+ ebsMonocounter (registry_, EBSMC::ebsBytes, devicePath, EBSMC::ReadOp)->Set (stats.total_read_bytes );
170
+ ebsMonocounter (registry_, EBSMC::ebsBytes, devicePath, EBSMC::WriteOp)->Set (stats.total_write_bytes );
171
+
172
+ ebsMonocounter (registry_, EBSMC::ebsTime, devicePath, EBSMC::ReadOp)->Set (stats.total_read_time * EBSMC::ebsMicrosecondsToSeconds);
173
+ ebsMonocounter (registry_, EBSMC::ebsTime, devicePath, EBSMC::WriteOp)->Set (stats.total_write_time * EBSMC::ebsMicrosecondsToSeconds);
174
+
175
+ ebsMonocounter (registry_, EBSMC::ebsIOPS, devicePath, EBSMC::Volume)->Set (stats.ebs_volume_performance_exceeded_iops * EBSMC::ebsMicrosecondsToSeconds);
176
+ ebsMonocounter (registry_, EBSMC::ebsIOPS, devicePath, EBSMC::Instance)->Set (stats.ec2_instance_ebs_performance_exceeded_iops * EBSMC::ebsMicrosecondsToSeconds);
177
+
178
+ ebsMonocounter (registry_, EBSMC::ebsTP, devicePath, EBSMC::Volume)->Set (stats.ebs_volume_performance_exceeded_tp * EBSMC::ebsMicrosecondsToSeconds);
179
+ ebsMonocounter (registry_, EBSMC::ebsTP, devicePath, EBSMC::Instance)->Set (stats.ec2_instance_ebs_performance_exceeded_tp * EBSMC::ebsMicrosecondsToSeconds);
180
+
181
+ ebsGauge (registry_, EBSMC::ebsQueueLength, devicePath)->Set (stats.volume_queue_length );
182
+
183
+ bool success {true };
184
+ if (false == handle_histogram (stats.read_io_latency_histogram , devicePath, EBSMC::ReadOp)) {
185
+ atlasagent::Logger ()->error (" Failed to handle read histogram for device {}" , devicePath);
186
+ success = false ;
187
+ }
188
+
189
+ if (false == handle_histogram (stats.write_io_latency_histogram , devicePath, EBSMC::WriteOp)) {
190
+ atlasagent::Logger ()->error (" Failed to handle write histogram for device {}" , devicePath);
191
+ success = false ;
192
+ }
193
+
194
+ return success;
195
+ }
196
+
197
+ template <typename Reg>
198
+ bool EBSCollector<Reg>::gather_metrics() {
199
+ bool success{true };
200
+ // Iterate through all the devices in the config
201
+ for (const auto & device : config) {
202
+ // Gather statistics for each device
203
+ nvme_get_amzn_stats_logpage stats {};
204
+ if (false == query_stats_from_device (device, stats)) {
205
+ atlasagent::Logger ()->error (" Failed to query stats from device {}" , device);
206
+ success = false ;
207
+ continue ;
208
+ }
209
+ // Push the metrics to spectatorD
210
+ if (update_metrics (device, stats) == false ) {
211
+ atlasagent::Logger ()->error (" Failed to update metrics for device {}" , device);
212
+ success = false ;
213
+ }
214
+ }
215
+ return success;
216
+ }
0 commit comments