diff --git a/playbooks/roles/grafana/dashboards/lustre-and-hsm.json b/playbooks/roles/grafana/dashboards/lustre-and-hsm.json new file mode 100644 index 000000000..9668213d5 --- /dev/null +++ b/playbooks/roles/grafana/dashboards/lustre-and-hsm.json @@ -0,0 +1,1152 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 19, + "links": [], + "panels": [ + { + "datasource": "azhpc", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 80 + }, + { + "color": "red", + "value": 90 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 5, + "x": 0, + "y": 0 + }, + "id": 14, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "7.5.4", + "targets": [ + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "disk", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "used_percent" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + } + ] + ], + "tags": [ + { + "key": "path", + "operator": "=", + "value": "/lustre" + }, + { + "condition": "AND", + "key": "host", + "operator": "=", + "value": "ondemand" + } + ] + } + ], + "title": "Usage on Disk (%)", + "type": "gauge" + }, + { + "datasource": "azhpc", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + 
}, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 19, + "x": 5, + "y": 0 + }, + "id": 12, + "options": { + "graph": {}, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.4", + "targets": [ + { + "alias": "$tag_host", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT last(\"load1\") AS \"Load1\" FROM \"system\" WHERE (\"host\" =~ /lustre/) AND $timeFilter GROUP BY time($__interval), \"host\" fill(none)", + "queryType": "randomWalk", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "title": "Lustre Load", + "type": "timeseries" + }, + { + "cacheTimeout": null, + "datasource": "azhpc", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "rgb(200, 200, 200)", + "mode": "fixed" + }, + "decimals": 0, + "mappings": [ + { + "$$hashKey": "object:2781", + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": 
"decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 14, + "w": 5, + "x": 0, + "y": 7 + }, + "id": 10, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.4", + "targets": [ + { + "alias": "$tag_status", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "status" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "lhsm", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT last(\"load5\") FROM \"system\" WHERE (\"host\" =~ /^$Host$/) AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": false, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "volume" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + } + ] + ], + "tags": [] + } + ], + "title": "Volume", + "type": "stat" + }, + { + "datasource": "azhpc", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 19, + "x": 5, + "y": 7 + }, + "id": 5, + "options": { + "graph": {}, + "legend": { + "calcs": 
[ + "mean", + "min", + "max" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltipOptions": { + "mode": "multi" + } + }, + "pluginVersion": "7.5.4", + "targets": [ + { + "alias": "$tag_status", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "status" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "lhsm", + "orderByTime": "ASC", + "policy": "default", + "queryType": "randomWalk", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "volume" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Lustre HSM Status", + "type": "timeseries" + }, + { + "datasource": "azhpc", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 19, + "x": 5, + "y": 15 + }, + "id": 6, + "options": { + "graph": {}, + "legend": { + "calcs": [ + "mean", + "min", + "max" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltipOptions": { + "mode": "multi" + } + }, + "pluginVersion": "7.5.4", + "targets": [ + { + "alias": "$tag_user", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "user" + ], + "type": "tag" + }, + 
{ + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "lustre", + "orderByTime": "ASC", + "policy": "default", + "queryType": "randomWalk", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "volume" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Lustre Usage By User", + "type": "timeseries" + }, + { + "cacheTimeout": null, + "datasource": "azhpc", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "rgb(200, 200, 200)", + "mode": "fixed" + }, + "decimals": 0, + "mappings": [ + { + "$$hashKey": "object:2781", + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 14, + "w": 5, + "x": 0, + "y": 21 + }, + "id": 8, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "7.5.4", + "targets": [ + { + "alias": "$tag_status", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "status" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "lhsm", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT last(\"count\") FROM \"lhsm\" WHERE $timeFilter GROUP BY time($__interval), \"status\" fill(null)", + "rawQuery": false, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "count" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + } + ] + ], + "tags": [] + 
} + ], + "title": "Files", + "type": "stat" + }, + { + "datasource": "azhpc", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "graph": false, + "legend": false, + "tooltip": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 19, + "x": 5, + "y": 23 + }, + "id": 16, + "links": [], + "options": { + "graph": {}, + "legend": { + "calcs": [ + "mean", + "min", + "max" + ], + "displayMode": "table", + "placement": "right" + }, + "tooltipOptions": { + "mode": "single" + } + }, + "pluginVersion": "7.5.4", + "targets": [ + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "lustre2", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT difference(mean(\"unlink\")) AS \"unlink\", difference(mean(\"close\")) AS \"close\", difference(mean(\"getattr\")) AS \"getattr\", difference(mean(\"getxattr\")) AS \"getxattr\", difference(mean(\"link\")) AS \"link\", difference(mean(\"mkdir\")) AS \"mkdir\", difference(mean(\"setxattr\")) AS \"setxattr\", difference(mean(\"open\")) AS \"open\", difference(mean(\"rename\")) AS \"rename\", difference(mean(\"rmdir\")) AS \"rmdir\", difference(mean(\"mknod\")) AS \"mknod\", difference(mean(\"setattr\")) AS \"setattr\", difference(mean(\"statfs\")) AS \"statfs\", difference(mean(\"setxattr\")) AS \"setxattr\", difference(mean(\"sync\")) AS \"sync\" FROM 
\"lustre2\" WHERE (\"host\" = 'mds1') AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": false, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "unlink" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [], + "type": "difference" + }, + { + "params": [ + "unlink" + ], + "type": "alias" + } + ], + [ + { + "params": [ + "close" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [], + "type": "difference" + }, + { + "params": [ + "close" + ], + "type": "alias" + } + ], + [ + { + "params": [ + "getattr" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [], + "type": "difference" + }, + { + "params": [ + "getattr" + ], + "type": "alias" + } + ], + [ + { + "params": [ + "getxattr" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [], + "type": "difference" + }, + { + "params": [ + "getxattr" + ], + "type": "alias" + } + ], + [ + { + "params": [ + "link" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [], + "type": "difference" + }, + { + "params": [ + "link" + ], + "type": "alias" + } + ], + [ + { + "params": [ + "mkdir" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [], + "type": "difference" + }, + { + "params": [ + "mkdir" + ], + "type": "alias" + } + ], + [ + { + "params": [ + "setxattr" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [], + "type": "difference" + }, + { + "params": [ + "setxattr" + ], + "type": "alias" + } + ], + [ + { + "params": [ + "open" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [], + "type": "difference" + }, + { + "params": [ + "open" + ], + "type": "alias" + } + ], + [ + { + "params": [ + "rename" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [], + 
"type": "difference" + }, + { + "params": [ + "rename" + ], + "type": "alias" + } + ], + [ + { + "params": [ + "rmdir" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [], + "type": "difference" + }, + { + "params": [ + "rmdir" + ], + "type": "alias" + } + ], + [ + { + "params": [ + "mknod" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [], + "type": "difference" + }, + { + "params": [ + "mknod" + ], + "type": "alias" + } + ], + [ + { + "params": [ + "setattr" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [], + "type": "difference" + }, + { + "params": [ + "setattr" + ], + "type": "alias" + } + ], + [ + { + "params": [ + "statfs" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [], + "type": "difference" + }, + { + "params": [ + "statfs" + ], + "type": "alias" + } + ], + [ + { + "params": [ + "setxattr" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [], + "type": "difference" + }, + { + "params": [ + "setxattr" + ], + "type": "alias" + } + ], + [ + { + "params": [ + "sync" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + }, + { + "params": [], + "type": "difference" + }, + { + "params": [ + "sync" + ], + "type": "alias" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=", + "value": "lustre" + } + ] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Lustre Metadata", + "type": "timeseries" + } + ], + "refresh": false, + "schemaVersion": 27, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Lustre and HSM", + "uid": "IX3HAHg7z", + "version": 11 + } \ No newline at end of file diff --git a/playbooks/telegraf.yml b/playbooks/telegraf.yml index 5e7008b18..a75af13ca 100644 --- a/playbooks/telegraf.yml +++ 
b/playbooks/telegraf.yml @@ -22,3 +22,71 @@ telegraf_influxdb_urls: - "http://jumpbox:8086"
+# Play: publish rbh-report statistics from the robinhood host through a
+# Telegraf execd input that runs /opt/telegraf/scripts/rbh_metrics.sh.
+- name: Robinhood metrics
+  hosts: robinhood
+  become: true
+  vars_files:
+    - '{{global_config_file}}'
+
+  tasks:
+  - name: create telegraf plugin directory
+    file:
+      path: /opt/telegraf/scripts
+      state: directory
+  - name: create rbh_metrics script
+    copy:
+      dest: /opt/telegraf/scripts/rbh_metrics.sh
+      # Quoted so the mode is always read as an octal string, matching the
+      # other mode values in this play.
+      mode: '0755'
+      content: |
+        #!/bin/bash
+        # Each line read from stdin triggers one round of reports; the execd
+        # input that runs this script is configured with signal = "STDIN".
+        while IFS= read -r LINE; do
+          # One nanosecond timestamp shared by every point of this round.
+          timestamp=$(date +%s%N)
+          # Comma-separated rbh-report output: drop the first line and the
+          # last two lines, strip spaces, then emit one "lustre" point per
+          # row. The timestamp is handed to awk with -v so it is printed as
+          # the exact string instead of being parsed as a numeric literal.
+          rbh-report -u "*" -c 2>/dev/null \
+            | tail -n +2 \
+            | head -n -2 \
+            | sed 's/ *//g' \
+            | awk -F, -v ts="$timestamp" '{ print "lustre,user="$1",type="$2" count="$3",volume="$4",space_used="$5" " ts }'
+          # Same pipeline for the lhsm status report; an empty first column
+          # is replaced with "none" so the status tag is never empty.
+          rbh-report --status-info lhsm -c 2>/dev/null \
+            | tail -n +2 \
+            | head -n -2 \
+            | sed 's/ *//g;s/^,/none,/g' \
+            | awk -F, -v ts="$timestamp" '{ print "lhsm,status="$1",type="$2" count="$3",volume="$4",space_used="$5" " ts }'
+        done
+    notify: restart telegraf
+  # Group telegraf gets read access (0640) to the two robinhood files below;
+  # presumably rbh-report needs them when run by Telegraf - confirm.
+  - name: permissions for rbh lustre.conf
+    file:
+      path: /etc/robinhood.d/lustre.conf
+      group: telegraf
+      mode: '0640'
+  - name: permissions for rbh .dbpassword
+    file:
+      path: /etc/robinhood.d/.dbpassword
+      group: telegraf
+      mode: '0640'
+  - name: add rbh metrics to telegraf config
+    blockinfile:
+      path: /etc/telegraf/telegraf.conf
+      block: |
+        [[inputs.execd]]
+          command = ["/opt/telegraf/scripts/rbh_metrics.sh"]
+          signal = "STDIN"
+    notify: restart telegraf
+
+  handlers:
+  # Runs only when the script or the Telegraf config task reported a change.
+  - name: restart telegraf
+    service:
+      name: telegraf
+      state: restarted