Skip to content
Snippets Groups Projects
Commit 30691eb0 authored by Mike Hanby
Browse files

Merge branch 'feat-handle-invalid-metric-from-prometheus' into 'main'

Feat Add Handling of Invalid Metric Due to Prometheus Con Issue

See merge request !11
parents 5f7593ba 6c40d217
No related branches found
No related tags found
1 merge request: !11 Feat Add Handling of Invalid Metric Due to Prometheus Con Issue
...@@ -6,7 +6,6 @@ ...@@ -6,7 +6,6 @@
# Copyright and/or license information if different from upstream # Copyright and/or license information if different from upstream
# #
# Requires 'curl' and 'jq' # Requires 'curl' and 'jq'
#
declare HW_CONTEXT_SWITCH_RATE=0 declare HW_CONTEXT_SWITCH_RATE=0
declare HW_CONTEXT_SWITCH_INTERVAL=0 declare HW_CONTEXT_SWITCH_INTERVAL=0
...@@ -14,7 +13,7 @@ declare HW_CONTEXT_SWITCH_INTERVAL=0 ...@@ -14,7 +13,7 @@ declare HW_CONTEXT_SWITCH_INTERVAL=0
# Gather the metrics # Gather the metrics
# NOTE(review): this is a side-by-side diff rendering; each line below holds
# the old version followed by the new version. The only difference between
# the two sides is that the new curl call adds `--connect-timeout 5`.
#
# uabrc_hw_gather_data: populate the global HW_CONTEXT_SWITCH_RATE.
#   Globals read:    HOSTNAME_S, HW_CONTEXT_SWITCH_INTERVAL
#   Globals written: HW_CONTEXT_SWITCH_RATE
#   Arguments:       none
#   Requires:        curl, jq
# Sends the PromQL expression
#   irate(node_context_switches_total{job="compute-node",name="$HOSTNAME_S"}[$HW_CONTEXT_SWITCH_INTERVAL])
# to the Prometheus query endpoint and keeps element [1] of each result's
# `value` array as printed by `jq -r`.
# NOTE(review): curl runs with -fs, so an HTTP or connection failure presumably
# yields no output and leaves HW_CONTEXT_SWITCH_RATE empty; nothing in this
# function checks the exit status of curl or jq — confirm callers handle that.
function uabrc_hw_gather_data() { function uabrc_hw_gather_data() {
# Gather the nodes context switching rate from Prometheus # Gather the nodes context switching rate from Prometheus
HW_CONTEXT_SWITCH_RATE=$(curl -fs --data-urlencode "query=irate(node_context_switches_total{job=\"compute-node\",name=\"$HOSTNAME_S\"}[$HW_CONTEXT_SWITCH_INTERVAL])" http://nagios.rc.uab.edu:9090/api/v1/query | jq -r '.data.result[] | .value[1]') HW_CONTEXT_SWITCH_RATE=$(curl --connect-timeout 5 -fs --data-urlencode "query=irate(node_context_switches_total{job=\"compute-node\",name=\"$HOSTNAME_S\"}[$HW_CONTEXT_SWITCH_INTERVAL])" http://nagios.rc.uab.edu:9090/api/v1/query | jq -r '.data.result[] | .value[1]')
# Convert to an integer (hacky as it doesn't round, but insignificant) # Convert to an integer (hacky as it doesn't round, but insignificant)
HW_CONTEXT_SWITCH_RATE=${HW_CONTEXT_SWITCH_RATE%.*} HW_CONTEXT_SWITCH_RATE=${HW_CONTEXT_SWITCH_RATE%.*}
} }
...@@ -29,6 +28,15 @@ function uabrc_check_hw_context_switch_rate() { ...@@ -29,6 +28,15 @@ function uabrc_check_hw_context_switch_rate() {
uabrc_hw_gather_data uabrc_hw_gather_data
fi fi
# Check again after data gather, if still 0, call nhcmain_finish to take no action
# Possible cause:
# - node_exporter isn't running on the node, i.e. no metrics sent to Prometheus
# - Prometheus unreachable?
if [[ $HW_CONTEXT_SWITCH_RATE -eq 0 ]]; then
echo "$FUNCNAME: Unable to retrieve HW_CONTEXT_SWITCH_RATE from Prometheus, not changing state of node $HOSTNAME."
nhcmain_finish
fi
if [[ $((HW_CONTEXT_SWITCH_RATE)) -gt $CONTEXT_SWITCH_RATE_MAX ]]; then if [[ $((HW_CONTEXT_SWITCH_RATE)) -gt $CONTEXT_SWITCH_RATE_MAX ]]; then
die 1 "$FUNCNAME: Total Context Switches ($HW_CONTEXT_SWITCH_RATE) greater than maximum allowed ($CONTEXT_SWITCH_RATE_MAX)." die 1 "$FUNCNAME: Total Context Switches ($HW_CONTEXT_SWITCH_RATE) greater than maximum allowed ($CONTEXT_SWITCH_RATE_MAX)."
return 1 return 1
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment