Skip to content
Snippets Groups Projects

Feat: Add Handling of Invalid Metric Due to Prometheus Connection Issue

Merged Mike Hanby requested to merge feat-handle-invalid-metric-from-prometheus into main
1 file
+ 10
2
Compare changes
  • Side-by-side
  • Inline
+ 10
2
@@ -6,7 +6,6 @@
@@ -6,7 +6,6 @@
# Copyright and/or license information if different from upstream
# Copyright and/or license information if different from upstream
#
#
# Requires 'curl' and 'jq'
# Requires 'curl' and 'jq'
#
declare HW_CONTEXT_SWITCH_RATE=0
declare HW_CONTEXT_SWITCH_RATE=0
declare HW_CONTEXT_SWITCH_INTERVAL=0
declare HW_CONTEXT_SWITCH_INTERVAL=0
@@ -14,7 +13,7 @@ declare HW_CONTEXT_SWITCH_INTERVAL=0
@@ -14,7 +13,7 @@ declare HW_CONTEXT_SWITCH_INTERVAL=0
# Gather the metrics
# Gather the metrics
function uabrc_hw_gather_data() {
function uabrc_hw_gather_data() {
# Gather the nodes context switching rate from Prometheus
# Gather the nodes context switching rate from Prometheus
HW_CONTEXT_SWITCH_RATE=$(curl -fs --data-urlencode "query=irate(node_context_switches_total{job=\"compute-node\",name=\"$HOSTNAME_S\"}[$HW_CONTEXT_SWITCH_INTERVAL])" http://nagios.rc.uab.edu:9090/api/v1/query | jq -r '.data.result[] | .value[1]')
HW_CONTEXT_SWITCH_RATE=$(curl --connect-timeout 5 -fs --data-urlencode "query=irate(node_context_switches_total{job=\"compute-node\",name=\"$HOSTNAME_S\"}[$HW_CONTEXT_SWITCH_INTERVAL])" http://nagios.rc.uab.edu:9090/api/v1/query | jq -r '.data.result[] | .value[1]')
# Truncate to an integer (drops the fractional part instead of rounding; the difference is insignificant)
# Truncate to an integer (drops the fractional part instead of rounding; the difference is insignificant)
HW_CONTEXT_SWITCH_RATE=${HW_CONTEXT_SWITCH_RATE%.*}
HW_CONTEXT_SWITCH_RATE=${HW_CONTEXT_SWITCH_RATE%.*}
}
}
@@ -29,6 +28,15 @@ function uabrc_check_hw_context_switch_rate() {
@@ -29,6 +28,15 @@ function uabrc_check_hw_context_switch_rate() {
uabrc_hw_gather_data
uabrc_hw_gather_data
fi
fi
 
# Check again after gathering the data; if the rate is still 0, call nhcmain_finish so that no action is taken
 
# Possible causes:
 
# - node_exporter isn't running on the node, i.e. no metrics sent to Prometheus
 
# - Prometheus unreachable?
 
if [[ $HW_CONTEXT_SWITCH_RATE -eq 0 ]]; then
 
echo "$FUNCNAME: Unable to retrieve HW_CONTEXT_SWITCH_RATE from Prometheus, not changing state of node $HOSTNAME."
 
nhcmain_finish
 
fi
 
if [[ $((HW_CONTEXT_SWITCH_RATE)) -gt $CONTEXT_SWITCH_RATE_MAX ]]; then
if [[ $((HW_CONTEXT_SWITCH_RATE)) -gt $CONTEXT_SWITCH_RATE_MAX ]]; then
die 1 "$FUNCNAME: Total Context Switches ($HW_CONTEXT_SWITCH_RATE) greater than maximum allowed ($CONTEXT_SWITCH_RATE_MAX)."
die 1 "$FUNCNAME: Total Context Switches ($HW_CONTEXT_SWITCH_RATE) greater than maximum allowed ($CONTEXT_SWITCH_RATE_MAX)."
return 1
return 1
Loading