# uabrc_hw.nhc
# NHC -- UAB Research Computing - Hardware Checks
#
# Mike Hanby <mhanby@uab.edu>
# Date: 2023-08-07
#
# Copyright and/or license information if different from upstream
#
# Requires 'curl' and 'jq'
# Integer context switch rate obtained from the Prometheus irate() query.
# 0 means "not gathered yet / unavailable"; the check gathers data only
# while this is still 0.
declare HW_CONTEXT_SWITCH_RATE=0
# Range selector inserted into the irate(...[<interval>]) query. It is
# overwritten by uabrc_check_hw_context_switch_rate from its second argument.
declare HW_CONTEXT_SWITCH_INTERVAL=0

# Gather the metrics
#
# Queries Prometheus for this node's context switch rate and stores the
# integer part of the result in the global HW_CONTEXT_SWITCH_RATE.
#
# Globals:
#   HOSTNAME_S                  (read)    matched against the "name" label
#   HW_CONTEXT_SWITCH_INTERVAL  (read)    range selector used inside irate()
#   HW_CONTEXT_SWITCH_RATE      (written) integer rate, or 0 when unavailable
# Requires: curl, jq
# Returns: always 0; "no data" is signalled by HW_CONTEXT_SWITCH_RATE being 0.
function uabrc_hw_gather_data() {
    local QUERY RESPONSE RATE

    # Start from the "unavailable" value so that every failure path below
    # leaves a well-defined 0 instead of an empty or partial string.
    HW_CONTEXT_SWITCH_RATE=0

    # Gather the nodes context switching rate from Prometheus
    QUERY="irate(node_context_switches_total{job=\"compute-node\",name=\"$HOSTNAME_S\"}[$HW_CONTEXT_SWITCH_INTERVAL])"

    # curl and jq are run separately (not piped) so that a failed request is
    # detected rather than hidden behind jq's exit status.
    RESPONSE=$(curl --connect-timeout 5 -fs --data-urlencode "query=$QUERY" http://nagios.rc.uab.edu:9090/api/v1/query) || return 0
    RATE=$(jq -r '.data.result[] | .value[1]' <<<"$RESPONSE") || return 0

    # More than one series may match the selector; jq then prints one value
    # per line. Only the first value is used.
    RATE=${RATE%%$'\n'*}

    # Convert to an integer (hacky as it doesn't round, but insignificant)
    RATE=${RATE%.*}

    # Accept plain digits only. Anything else (empty, "NaN", "+Inf", ...)
    # would break the caller's arithmetic comparisons, so it is treated as
    # "no data". 10# prevents leading zeros from being parsed as octal.
    if [[ $RATE =~ ^[0-9]+$ ]]; then
        HW_CONTEXT_SWITCH_RATE=$((10#$RATE))
    fi
    return 0
}

# Check that the context switch rate is at or below the maximum allowed ($1),
# using $2 as the range interval of the Prometheus query.
# The context switch rate is collected from Prometheus Node Exporter
function uabrc_check_hw_context_switch_rate() {
    local CONTEXT_SWITCH_RATE_MAX="$1"
Mike Hanby's avatar
Mike Hanby committed
    HW_CONTEXT_SWITCH_INTERVAL="$2"
    if [[ $HW_CONTEXT_SWITCH_RATE -eq 0 ]]; then
        uabrc_hw_gather_data
    fi

    # Check again after data gather, if still 0, call nhcmain_finish to take no action
    # Possible cause:
    # - node_exporter isn't running on the node, i.e. no metrics sent to Prometheus
    # - Prometheus unreachable?
    if [[ $HW_CONTEXT_SWITCH_RATE -eq 0 ]]; then
        echo "$FUNCNAME: Unable to retrieve HW_CONTEXT_SWITCH_RATE from Prometheus, not changing state of node $HOSTNAME."
        nhcmain_finish
    fi

    if [[ $((HW_CONTEXT_SWITCH_RATE)) -gt $CONTEXT_SWITCH_RATE_MAX ]]; then
        die 1 "$FUNCNAME:  Total Context Switches ($HW_CONTEXT_SWITCH_RATE) greater than maximum allowed ($CONTEXT_SWITCH_RATE_MAX)."