diff --git a/.gitignore b/.gitignore
index 0624af8fca9a21c967cc45eb391edd55b33fbbd7..8c9bdde49eba15e420c2d1c340b3534563a98b28 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,3 @@
 scripts/lbnl_ps.nhc
+.history/
+.vscode/
diff --git a/node_context_switches.py b/node_context_switches.py
new file mode 100644
index 0000000000000000000000000000000000000000..078745cdf371c32cb6402f36a88db77ed2ebe904
--- /dev/null
+++ b/node_context_switches.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+"""Print the total context-switch count reported by node_exporter.
+
+Expects Prometheus node_exporter to be running on the local host:
+https://github.com/prometheus/node_exporter
+
+The script queries node_exporter and prints the value of the
+node_context_switches_total metric as an integer, ex: 240080031436
+
+Errors are reported on stderr and signalled with a non-zero exit status.
+"""
+import sys
+
+import requests
+
+URL = "http://localhost:9100/metrics"
+METRIC = "node_context_switches_total"
+# Bound the request so a wedged exporter cannot hang the caller forever.
+TIMEOUT_SECONDS = 5
+
+
+def parse_context_switches(text):
+    """Return the METRIC sample found in *text* as an int, or None."""
+    for line in text.splitlines():
+        columns = line.split()
+        # Matching the first column exactly skips "# HELP"/"# TYPE" comment
+        # lines and any other metric whose name merely contains METRIC.
+        if len(columns) >= 2 and columns[0] == METRIC:
+            # int(float(...)) also accepts exponent notation such as 2.4e+11.
+            return int(float(columns[1]))
+    return None
+
+
+def main():
+    """Fetch the metrics page and print the counter; return an exit status."""
+    try:
+        response = requests.get(URL, timeout=TIMEOUT_SECONDS)
+    except requests.RequestException as err:
+        print(f"Error: Failed to retrieve data from {URL}: {err}",
+              file=sys.stderr)
+        return 1
+
+    if response.status_code != 200:
+        print(
+            f"Error: Failed to retrieve data from {URL}. "
+            f"Status code: {response.status_code}",
+            file=sys.stderr,
+        )
+        return 1
+
+    value = parse_context_switches(response.text)
+    if value is None:
+        print(f"Error: {METRIC} not found at {URL}", file=sys.stderr)
+        return 1
+
+    print(value)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/uabrc_hw.nhc b/uabrc_hw.nhc
new file mode 100644
index 0000000000000000000000000000000000000000..f71a4d6413e9295e9b266d405a32f9f6d42835b8
--- /dev/null
+++ b/uabrc_hw.nhc
@@ -0,0 +1,44 @@
+# NHC -- UAB Research Computing - Hardware Checks
+#
+# Mike Hanby <mhanby@uab.edu>
+# Date: 2023-08-07
+#
+# Copyright and/or license information if different from upstream
+#
+
+# Cached node_context_switches_total value; 0 means "not gathered yet".
+declare HW_CONTEXT_SWITCHES=0
+
+# Gather the metrics
+# Sets HW_CONTEXT_SWITCHES from the local Prometheus Node Exporter, falling
+# back to 0 when the exporter cannot be reached or the metric is missing.
+function uabrc_hw_gather_data() {
+    # --max-time keeps an unresponsive exporter from stalling the check.
+    # "%.0f" prints the sample (which may be in exponent notation) as a plain
+    # integer without depending on how wide the awk implementation's "%d" is.
+    HW_CONTEXT_SWITCHES=$(curl --silent --max-time 5 http://localhost:9100/metrics \
+        | awk '$1 == "node_context_switches_total" { printf("%.0f\n", $2); exit }')
+    HW_CONTEXT_SWITCHES=${HW_CONTEXT_SWITCHES:-0}
+}
+
+# Check that total context switches are less than max context switches ($1)
+# The total context switches is collected from Prometheus Node Exporter
+function uabrc_check_hw_context_switches() {
+    local CONTEXT_SWITCH_MAX="$1"
+
+    # An empty limit would otherwise be compared as 0, so reject anything
+    # that is not a plain non-negative integer.
+    if [[ ! $CONTEXT_SWITCH_MAX =~ ^[0-9]+$ ]]; then
+        die 1 "$FUNCNAME: Syntax error: Maximum context switches must be a non-negative integer."
+        return 1
+    fi
+
+    if [[ $HW_CONTEXT_SWITCHES -eq 0 ]]; then
+        uabrc_hw_gather_data
+    fi
+
+    if [[ $((HW_CONTEXT_SWITCHES)) -gt $CONTEXT_SWITCH_MAX ]]; then
+        die 1 "$FUNCNAME: Total Context Switches ($HW_CONTEXT_SWITCHES) greater than maximum allowed ($CONTEXT_SWITCH_MAX)."
+        return 1
+    fi
+    return 0
+}