Skip to content
Snippets Groups Projects
Commit b7b596de authored by Mike Hanby's avatar Mike Hanby
Browse files

Update Context Switch Check to use Prometheus as Datasource

The previous method, which read the value directly from node_exporter, was invalid: node_exporter reports node_context_switches_total as a cumulative count since boot, not a rate.

The new method queries Prometheus to get the rate of change of the same metric:

```shell
HW_CONTEXT_SWITCH_RATE=$(curl -fs --data-urlencode "query=irate(node_context_switches_total{job=\"compute-node\",name=\"$NODENAME\"}[$HW_CONTEXT_SWITCH_INTERVAL])" http://nagios.rc.uab.edu:9090/api/v1/query | jq -r '.data.result[] | .value[1]')
```
parent 9d411ec3
No related branches found
No related tags found
1 merge request: !6 Update Context Switch Check to use Prometheus as Datasource
...@@ -212,3 +212,5 @@ ...@@ -212,3 +212,5 @@
# nVidia HealthMon GPU health checks (requires Tesla Development Kit) # nVidia HealthMon GPU health checks (requires Tesla Development Kit)
# * || check_nv_healthmon # * || check_nv_healthmon
# Check for high rate of context switching
* || uabrc_check_hw_context_switch_rate 300000 30m
...@@ -6,24 +6,30 @@ ...@@ -6,24 +6,30 @@
# Copyright and/or license information if different from upstream # Copyright and/or license information if different from upstream
# #
declare HW_CONTEXT_SWITCHES=0 declare HW_CONTEXT_SWITCH_RATE=0
# Gather the metrics # Gather the metrics
function uabrc_hw_gather_data() { function uabrc_hw_gather_data() {
HW_CONTEXT_SWITCHES=$(curl --silent http://localhost:9100/metrics | grep ^node_context_switches_total | awk '{print $2}' | awk '{printf("%d\n", $1*1e$2)}') # HW_CONTEXT_SWITCH_RATE=$(curl --silent http://localhost:9100/metrics | grep ^node_context_switches_total | awk '{print $2}' | awk '{printf("%d\n", $1*1e$2)}')
HW_CONTEXT_SWITCH_RATE=$(curl -fs --data-urlencode 'query=snode_context_switches_total{job="compute-node",name="c0159"}(30m)' https://nagios.rc.uab.edu:9090/api/v1/query | jq -r '.data.result[] | [.metric.container_name, .metric.namespace, .value[1] ] | @csv')
} }
curl -fs --data-urlencode 'query=snode_context_switches_total{job="compute-node",name="c0159"}(30m)' https://nagios.rc.uab.edu:9090/api/v1/query | jq -r '.data.result[] | [.metric.container_name, .metric.namespace, .value[1] ] | @csv'
# Check that total context switches are less than max context switches ($1) # Check that total context switches are less than max context switches ($1)
# The total context switches is collected from Prometheus Node Exporter # The total context switches is collected from Prometheus Node Exporter
function uabrc_check_hw_context_switches() { function uabrc_check_HW_CONTEXT_SWITCH_RATE() {
local CONTEXT_SWITCH_MAX="$1" local CONTEXT_SWITCH_RATE_MAX="$1"
if [[ $HW_CONTEXT_SWITCHES -eq 0 ]]; then if [[ $HW_CONTEXT_SWITCH_RATE -eq 0 ]]; then
uabrc_hw_gather_data uabrc_hw_gather_data
fi fi
if [[ $((HW_CONTEXT_SWITCHES)) -gt $CONTEXT_SWITCH_MAX ]]; then if [[ $((HW_CONTEXT_SWITCH_RATE)) -gt $CONTEXT_SWITCH_RATE_MAX ]]; then
die 1 "$FUNCNAME: Total Context Switches ($HW_CONTEXT_SWITCHES) greater than maximum allowed ($CONTEXT_SWITCH_MAX)." die 1 "$FUNCNAME: Total Context Switches ($HW_CONTEXT_SWITCH_RATE) greater than maximum allowed ($CONTEXT_SWITCH_RATE_MAX)."
return 1 return 1
fi fi
echo "HW_CONTEXT_SWITCH_RATE: $HW_CONTEXT_SWITCH_RATE"
echo "CONTEXT_SWITCH_RATE_MAX: $CONTEXT_SWITCH_RATE_MAX"
} }
uabrc_check_HW_CONTEXT_SWITCH_RATE $1
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment