# uabrc_hw.nhc -- context switch rate check for NHC (Node Health Check).
#
# This is the post-patch content of uabrc_hw.nhc from the change that replaced
# the cumulative node_context_switches_total counter check with a rate check.
# The same change added this entry to nhc.conf:
#
#   # Check for high rate of context switching
#    * || uabrc_check_hw_context_switch_rate 300000 30m
#
# Sourcing this file must have no side effects: it only declares the cache
# variable and defines functions. Nothing here may query the network or run a
# check at load time.

# Cached context switch rate (switches per second, truncated to an integer).
# 0 means "not gathered yet" or "last gather attempt failed".
declare HW_CONTEXT_SWITCH_RATE=0

#######################################
# Query Prometheus for this node's context switch rate and cache it.
# Globals:
#   HW_CONTEXT_SWITCH_RATE  (written) integer rate, reset to 0 on failure
#   UABRC_PROMETHEUS_URL    (read, optional) instant-query endpoint; defaults
#                           to https://nagios.rc.uab.edu:9090/api/v1/query
#   HOSTNAME_S              (read, optional) short host name; falls back to
#                           `hostname -s` when unset or empty
# Arguments:
#   $1 - (optional) rate window as a Prometheus duration, default 30m
# Returns:
#   0 when a numeric sample was obtained, 1 otherwise
#######################################
function uabrc_hw_gather_data() {
    local window="${1:-30m}"
    local url="${UABRC_PROMETHEUS_URL:-https://nagios.rc.uab.edu:9090/api/v1/query}"
    local node="${HOSTNAME_S:-$(hostname -s)}"
    local window_re='^[0-9]+(ms|s|m|h|d|w|y)$'
    local number_re='^[0-9]+(\.[0-9]+)?([eE][+-]?[0-9]+)?$'
    local query raw

    HW_CONTEXT_SWITCH_RATE=0

    # The window is spliced into the PromQL expression, so only accept a plain
    # duration token.
    [[ "$window" =~ $window_re ]] || return 1

    # NOTE(review): assumes the "name" label carries the node's short host
    # name (the original query hard-coded name="c0159") -- confirm against the
    # Prometheus scrape configuration.
    query="rate(node_context_switches_total{job=\"compute-node\",name=\"${node}\"}[${window}])"

    # Take only the sample value of the first series. jq prints nothing when
    # the result set is empty or when curl produced no output.
    raw=$(curl -fs --max-time 10 --data-urlencode "query=${query}" "$url" \
        | jq -r '.data.result[0].value[1] // empty') || return 1

    # Rejects empty output as well as NaN / +Inf samples.
    [[ "$raw" =~ $number_re ]] || return 1

    # Prometheus returns the sample as a float string; the comparisons in the
    # check are integer-only, so truncate here.
    HW_CONTEXT_SWITCH_RATE=$(awk -v v="$raw" 'BEGIN { printf "%d\n", v }')
}

#######################################
# Check that the context switch rate is not above the allowed maximum.
# Globals:
#   HW_CONTEXT_SWITCH_RATE  (read; written via uabrc_hw_gather_data)
# Arguments:
#   $1 - maximum allowed rate in context switches per second (integer)
#   $2 - (optional) rate window as a Prometheus duration, default 30m
# Returns:
#   0 when the rate is within bounds, or when the metric could not be read
#     (a monitoring outage must not take compute nodes offline; a warning is
#     written to stderr instead)
#   1 (after calling die) on invalid arguments or when the rate exceeds $1
#######################################
function uabrc_check_hw_context_switch_rate() {
    local rate_max="$1"
    local window="${2:-30m}"
    local integer_re='^[0-9]+$'
    local window_re='^[0-9]+(ms|s|m|h|d|w|y)$'

    if [[ ! "$rate_max" =~ $integer_re ]]; then
        die 1 "$FUNCNAME: Maximum context switch rate must be a non-negative integer (got \"$rate_max\")."
        return 1
    fi
    if [[ ! "$window" =~ $window_re ]]; then
        die 1 "$FUNCNAME: Invalid rate window \"$window\" (expected a duration such as 30m)."
        return 1
    fi
    # Force base 10 so a zero-padded threshold is not read as octal.
    rate_max=$((10#$rate_max))

    # The cached value is reused as-is once gathered; it is not keyed by
    # window, so the first successful gather in a run wins.
    if [[ "${HW_CONTEXT_SWITCH_RATE:-0}" == 0 ]]; then
        if ! uabrc_hw_gather_data "$window"; then
            printf '%s: unable to read context switch rate (window %s); check skipped.\n' \
                "$FUNCNAME" "$window" >&2
            return 0
        fi
    fi

    if (( HW_CONTEXT_SWITCH_RATE > rate_max )); then
        die 1 "$FUNCNAME: Context switch rate ($HW_CONTEXT_SWITCH_RATE/s over $window) greater than maximum allowed ($rate_max/s)."
        return 1
    fi

    # Diagnostic output goes through the framework's debug channel when it is
    # available rather than to stdout on every run.
    if declare -F dbg >/dev/null; then
        dbg "$FUNCNAME: HW_CONTEXT_SWITCH_RATE=$HW_CONTEXT_SWITCH_RATE CONTEXT_SWITCH_RATE_MAX=$rate_max"
    fi
    return 0
}

# Backward-compatible alias for the mixed-case name this file originally
# defined; nhc.conf uses the lowercase name above.
function uabrc_check_HW_CONTEXT_SWITCH_RATE() {
    uabrc_check_hw_context_switch_rate "$@"
}