From 6c40d2177e890c722b457e45be48e85ee2898882 Mon Sep 17 00:00:00 2001
From: Mike Hanby <mhanby@uab.edu>
Date: Thu, 17 Aug 2023 09:54:29 -0400
Subject: [PATCH] Feat Add Handling of Invalid Metric Due to Prometheus Con
 Issue
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

https://gitlab.rc.uab.edu/rc/rc-nhc/-/issues/4

The existing check will undrain an unhealthy drained node in the case where curl can't reach the Prometheus server / metrics are missing from the current interval (node_exporter not running?).

In this scenario, it will return 0 when it fails because the curl output is piped to jq which succeeds. This gets passed back to NHC as success, which ultimately results in the node being undrained in Slurm (assuming another check didn't fail)

This is a first pass at fixing this issue, as this method has flaws.

The current fix jumps to the core NHC function `nhcmain_finish`, bypassing the code that would undrain the node, thus leaving it in whatever drain/undrain state that it’s currently in.

The downside is that it short circuits any checks that were supposed to be run after `uabrc_check_hw_context_switch_rate`. This can be mitigated by placing this check at the end of the `nhc.conf` file when running NHC in serial mode.

NHC also has a mode where it can fork all of the checks, I don’t suspect it will work in that case.
---
 uabrc_hw.nhc | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/uabrc_hw.nhc b/uabrc_hw.nhc
index 96740f6..fc73bf8 100644
--- a/uabrc_hw.nhc
+++ b/uabrc_hw.nhc
@@ -6,7 +6,6 @@
 # Copyright and/or license information if different from upstream
 #
 # Requires 'curl' and 'jq'
-# 
 
 declare HW_CONTEXT_SWITCH_RATE=0
 declare HW_CONTEXT_SWITCH_INTERVAL=0
@@ -14,7 +13,7 @@ declare HW_CONTEXT_SWITCH_INTERVAL=0
 # Gather the metrics
 function uabrc_hw_gather_data() {
     # Gather the nodes context switching rate from Prometheus
-    HW_CONTEXT_SWITCH_RATE=$(curl -fs --data-urlencode "query=irate(node_context_switches_total{job=\"compute-node\",name=\"$HOSTNAME_S\"}[$HW_CONTEXT_SWITCH_INTERVAL])" http://nagios.rc.uab.edu:9090/api/v1/query | jq -r '.data.result[] | .value[1]')
+    HW_CONTEXT_SWITCH_RATE=$(curl --connect-timeout 5 -fs --data-urlencode "query=irate(node_context_switches_total{job=\"compute-node\",name=\"$HOSTNAME_S\"}[$HW_CONTEXT_SWITCH_INTERVAL])" http://nagios.rc.uab.edu:9090/api/v1/query | jq -r '.data.result[] | .value[1]')
     # Convert to an integer (hacky as it doesn't round, but insignificant)
     HW_CONTEXT_SWITCH_RATE=${HW_CONTEXT_SWITCH_RATE%.*}
 }
@@ -29,6 +28,15 @@ function uabrc_check_hw_context_switch_rate() {
         uabrc_hw_gather_data
     fi
 
+    # Check again after data gather, if still 0, call nhcmain_finish to take no action
+    # Possible cause:
+    # - node_exporter isn't running on the node, i.e. no metrics sent to Prometheus
+    # - Prometheus unreachable?
+    if [[ $HW_CONTEXT_SWITCH_RATE -eq 0 ]]; then
+        echo "$FUNCNAME: Unable to retrieve HW_CONTEXT_SWITCH_RATE from Prometheus, not changing state of node $HOSTNAME."
+        nhcmain_finish
+    fi
+
     if [[ $((HW_CONTEXT_SWITCH_RATE)) -gt $CONTEXT_SWITCH_RATE_MAX ]]; then
         die 1 "$FUNCNAME:  Total Context Switches ($HW_CONTEXT_SWITCH_RATE) greater than maximum allowed ($CONTEXT_SWITCH_RATE_MAX)."
         return 1
-- 
GitLab