Newer
Older
# NHC -- UAB Research Computing - Hardware Checks
#
# Mike Hanby <mhanby@uab.edu>
# Date: 2023-08-07
#
# Copyright and/or license information if different from upstream
#
# Context switch rate for this node, filled in by uabrc_hw_gather_data().
# Starts at 0; the check code further down treats 0 as "not gathered yet /
# no data available".
declare HW_CONTEXT_SWITCH_RATE=0
#######################################
# Gather the metrics: query Prometheus for this node's context switch rate
# and store it, truncated to an integer, in HW_CONTEXT_SWITCH_RATE.
#
# Globals:
#   HOSTNAME_S                 (read)    value matched against the name= label
#   HW_CONTEXT_SWITCH_INTERVAL (read)    range selector passed to irate(); it is
#                                        not set in this function, so the caller
#                                        must define it beforehand
#   HW_CONTEXT_SWITCH_RATE     (written) integer rate, or 0 when no usable value
#                                        could be retrieved
# Arguments:
#   None
#######################################
function uabrc_hw_gather_data() {
    local query rate

    query="query=irate(node_context_switches_total{job=\"compute-node\",name=\"$HOSTNAME_S\"}[$HW_CONTEXT_SWITCH_INTERVAL])"

    # Gather the node's context switching rate from Prometheus.
    # curl -f turns HTTP errors into an empty body, so jq then prints nothing.
    rate=$(curl --connect-timeout 5 -fs --data-urlencode "$query" \
        http://nagios.rc.uab.edu:9090/api/v1/query \
        | jq -r '.data.result[] | .value[1]')

    # If more than one series matched, jq prints one value per line; keep only
    # the first so the result stays a single number.
    rate=${rate%%$'\n'*}

    # Convert to an integer (hacky as it doesn't round, but insignificant)
    rate=${rate%%.*}

    # Anything that is not a plain unsigned integer (empty output, "null",
    # "NaN", ...) is reported as 0, the "no data" value callers test for.
    if [[ ! $rate =~ ^[0-9]+$ ]]; then
        rate=0
    fi

    HW_CONTEXT_SWITCH_RATE=$rate
}
# Check that the node's context switch rate does not exceed the maximum allowed rate ($1)
# The rate is computed by Prometheus from the Node Exporter context switch counter
local CONTEXT_SWITCH_RATE_MAX="$1"
if [[ $HW_CONTEXT_SWITCH_RATE -eq 0 ]]; then
# Check again after data gather, if still 0, call nhcmain_finish to take no action
# Possible cause:
# - node_exporter isn't running on the node, i.e. no metrics sent to Prometheus
# - Prometheus unreachable?
if [[ $HW_CONTEXT_SWITCH_RATE -eq 0 ]]; then
echo "$FUNCNAME: Unable to retrieve HW_CONTEXT_SWITCH_RATE from Prometheus, not changing state of node $HOSTNAME."
nhcmain_finish
fi
if [[ $((HW_CONTEXT_SWITCH_RATE)) -gt $CONTEXT_SWITCH_RATE_MAX ]]; then
die 1 "$FUNCNAME: Total Context Switches ($HW_CONTEXT_SWITCH_RATE) greater than maximum allowed ($CONTEXT_SWITCH_RATE_MAX)."