diff --git a/uabrc_hw.nhc b/uabrc_hw.nhc index 96740f6ff3fcb8096668dabd3c8eb1f24324af83..fc73bf8ca71fdcef0beb501ee7978d61a0b15b9b 100644 --- a/uabrc_hw.nhc +++ b/uabrc_hw.nhc @@ -6,7 +6,6 @@ # Copyright and/or license information if different from upstream # # Requires 'curl' and 'jq' -# declare HW_CONTEXT_SWITCH_RATE=0 declare HW_CONTEXT_SWITCH_INTERVAL=0 @@ -14,7 +13,7 @@ declare HW_CONTEXT_SWITCH_INTERVAL=0 # Gather the metrics function uabrc_hw_gather_data() { # Gather the nodes context switching rate from Prometheus - HW_CONTEXT_SWITCH_RATE=$(curl -fs --data-urlencode "query=irate(node_context_switches_total{job=\"compute-node\",name=\"$HOSTNAME_S\"}[$HW_CONTEXT_SWITCH_INTERVAL])" http://nagios.rc.uab.edu:9090/api/v1/query | jq -r '.data.result[] | .value[1]') + HW_CONTEXT_SWITCH_RATE=$(curl --connect-timeout 5 -fs --data-urlencode "query=irate(node_context_switches_total{job=\"compute-node\",name=\"$HOSTNAME_S\"}[$HW_CONTEXT_SWITCH_INTERVAL])" http://nagios.rc.uab.edu:9090/api/v1/query | jq -r '.data.result[] | .value[1]') # Convert to an integer (hacky as it doesn't round, but insignificant) HW_CONTEXT_SWITCH_RATE=${HW_CONTEXT_SWITCH_RATE%.*} } @@ -29,6 +28,15 @@ function uabrc_check_hw_context_switch_rate() { uabrc_hw_gather_data fi + # Check again after data gather, if still 0, call nhcmain_finish to take no action + # Possible cause: + # - node_exporter isn't running on the node, i.e. no metrics sent to Prometheus + # - Prometheus unreachable? + if [[ $HW_CONTEXT_SWITCH_RATE -eq 0 ]]; then + echo "$FUNCNAME: Unable to retrieve HW_CONTEXT_SWITCH_RATE from Prometheus, not changing state of node $HOSTNAME." + nhcmain_finish + fi + if [[ $((HW_CONTEXT_SWITCH_RATE)) -gt $CONTEXT_SWITCH_RATE_MAX ]]; then die 1 "$FUNCNAME: Total Context Switches ($HW_CONTEXT_SWITCH_RATE) greater than maximum allowed ($CONTEXT_SWITCH_RATE_MAX)." return 1