# NHC Configuration File (sample)
#
# Lines are in the form "<hostmask>||<check>"
# Hostmask is a glob, /regexp/, or {noderange}
# Comments begin with '#'
#
## Nice example:
## https://hpc-syspros-basics.github.io/HPC_Basics_menu/Node_Health_Check/Configuring_NHC.html
#######################################################################
###
### NHC Configuration Variables
###
# Moved all of these settings to /etc/sysconfig/nhc
#######################################################################
###
### Hardware checks
###
# Expected CPU layout per node group. Set these to your correct socket, core,
# and thread counts (the three arguments, in that order).
# NOTE(review): c0[150-201] has a check_hw_physmem rule in this file but no
# check_hw_cpuinfo rule here -- confirm whether that gap is intentional.
{c00[97-99],c0[100-114]} || check_hw_cpuinfo 2 28 28
{c0[115-135],c0[140-149]} || check_hw_cpuinfo 2 24 24
{c0[136-139]} || check_hw_cpuinfo 4 48 48
{c0[202-255]} || check_hw_cpuinfo 2 128 128
# Expected physical RAM per node group. Set these to the amount of physical
# RAM you have (leave the fudge factor, the trailing percentage, in place).
### 20241210 - The memory count on c0101 and c0102 has returned to 256G, so
### they are back under the original rule for the P100s.
{c00[97-99],c0[100-114]} || check_hw_physmem 256gb 256gb 5%
### Disabled: the two rules below date from when c0101 and c0102 (out of
### warranty) had lost a DIMM. Kept for reference only.
#{c00[97-99],c0[103-114]} || check_hw_physmem 256gb 256gb 5%
# {c0[101-102]} || check_hw_physmem 231gb 231gb 5%
{c0[115-135]} || check_hw_physmem 192gb 192gb 5%
{c0[136-139]} || check_hw_physmem 1510gb 1510gb 5%
{c0[140-201]} || check_hw_physmem 768gb 768gb 5%
{c0[202-255]} || check_hw_physmem 512gb 512gb 5%
# Set these to the amount of swap you have (leave the fudge factor).
# * || check_hw_swap 8g 8g 3%
# If you prefer to use this instead of the previous two, you can.
# * || check_hw_mem 40g 40g 5%
# Check specifically for free physical memory.
# * || check_hw_physmem_free 1MB
# Same, but for swap space.
# * || check_hw_swap_free 1MB
# Check for some sort of free memory of either type.
* || check_hw_mem_free 1g
# Checks that there's an IB interface at rate 100 that's ACTIVE and shows
# LinkUp. (The stock sample comment said "QDR"; NHC documents this argument
# as Gb/sec, and 100 Gb/s corresponds to EDR, not QDR.)
{c00[97-99],c0[100-255]} || check_hw_ib 100
# Checks for an active Myrinet interface named "myri0."
# * || check_hw_gm myri0
# Checks for active ethernet interfaces by name (all currently disabled).
##MJH## {c0[150-201]} || check_hw_eth eth0
##MJH## {c0[202-235]} || check_hw_eth eth2
##MJH## {c0[150-201]} || check_hw_eth eth0.2050
##MJH## {c0[202-235]} || check_hw_eth eth2.2050
##MJH## * || check_hw_eth ib0
# Make sure we're running the correct BIOS version on all nodes (currently disabled).
##MJH## {c0[150-201]} || check_dmi_data_match "BIOS Information: Version: 2.17.1"
##MJH## {c0[202-235]} || check_dmi_data_match "BIOS Information: Version: 2.10.2"
# Make sure our RAM is running at the correct bus rate.
# * || check_dmi_data_match -t "Memory Device" "*Speed: 1866 MHz"
# Check the mcelog daemon for any pending errors.
* || check_hw_mcelog
#######################################################################
###
### Filesystem checks
###
# All nodes should have their root filesystem mounted read/write.
# / FS: also require 10% free space and 1k free inodes.
* || check_fs_mount_rw -f /
* || check_fs_free / 10%
* || check_fs_ifree / 1k
# /local, /tmp and /var FS on the c0 nodes (c0097 through c0255):
# each must be mounted read/write and keep the listed percentage free.
{c00[97-99],c0[100-255]} || check_fs_mount_rw -f /local
{c00[97-99],c0[100-255]} || check_fs_free /local 10%
{c00[97-99],c0[100-255]} || check_fs_mount_rw -f /tmp
{c00[97-99],c0[100-255]} || check_fs_free /tmp 20%
{c00[97-99],c0[100-255]} || check_fs_mount_rw -f /var
{c00[97-99],c0[100-255]} || check_fs_free /var 10%
# /data FS: mounted as gpfs on every c0 node except c0[202-219] (see the nfs
# rule below). The sentinel file /data/.nhc-test is tested on all c0 nodes.
{c00[97-99],c0[100-201],c0[220-255]} || check_fs_mount_rw -t gpfs -f /data
{c00[97-99],c0[100-255]} || check_file_test -r -e -f /data/.nhc-test
# /data FS for GPFS5 nodes (mounted as nfs)
{c0[202-219]} || check_fs_mount_rw -t nfs -f /data
# /scratch FS: same split as /data -- gpfs everywhere except c0[202-219],
# with the sentinel file /scratch/.nhc-test tested on all c0 nodes.
{c00[97-99],c0[100-201],c0[220-255]} || check_fs_mount_rw -t gpfs -f /scratch
{c00[97-99],c0[100-255]} || check_file_test -r -e -f /scratch/.nhc-test
# /scratch FS for GPFS5 nodes (mounted as nfs)
{c0[202-219]} || check_fs_mount_rw -t nfs -f /scratch
# /gpfs FS for GPFS5 clients
{c0[202-219]} || check_fs_mount_rw -t gpfs -f /gpfs
# /rstore/share FS (ceph; currently disabled)
##MJH## * || check_fs_mount -s 192.168.200.19:6789,192.168.200.20:6789,192.168.200.21:6789:/share -t ceph -f /rstore/share
# /cm/shared (nfs, checked on every node)
* || check_fs_mount -s gpfs.rc.uab.edu:/data/cm/shared-8.2 -t nfs -f /cm/shared
# Virtual GPU Node Mounts (all nfs, served from gpfs.rc.uab.edu)
{v2gpu00[1-9]} || check_fs_mount -s gpfs.rc.uab.edu:/scratch -t nfs -f /scratch
{v2gpu00[1-9]} || check_fs_mount -s gpfs.rc.uab.edu:/data/rc/apps -t nfs -f /data/rc/apps
{v2gpu00[1-9]} || check_fs_mount -s gpfs.rc.uab.edu:/data/project -t nfs -f /data/project
{v2gpu00[1-9]} || check_fs_mount -s gpfs.rc.uab.edu:/data/user -t nfs -f /data/user
{v2gpu00[1-9]} || check_fs_mount -s gpfs.rc.uab.edu:/data/user/home -t nfs -f /home
# Virtual GPU Node Nvidia device files: two devices expected, tested with -e only.
{v2gpu00[1-9]} || check_file_test -e /dev/nvidia0 /dev/nvidia1
# Assert that /tmp is a mounted filesystem of type "tmpfs."
# * || check_fs_mount_rw -t tmpfs -f /tmp
# Controlling TTYs are a good thing!
# * || check_fs_mount_rw -t devpts -s '/(none|devpts)/' -f /dev/pts
# Make sure the root filesystem doesn't get too full.
# * || check_fs_free / 3%
# Free inodes are also important.
# * || check_fs_ifree / 1k
# The following illustrates how to assert an NFSv3 mount (or any other specific mount option).
# * || check_fs_mount -s bluearc0:/home -t nfs -o '/(^|,)vers=3(,|$)/' -f /home
#######################################################################
###
### File/metadata checks
###
# These should always be directories and always be read/write/execute and sticky.
* || check_file_test -r -w -x -d -k /tmp /var/tmp
# These should always be readable and should never be empty.
* || check_file_test -r -s /etc/passwd /etc/group
# Assert common properties for /dev/null and /dev/zero (/dev/null occasionally
# gets clobbered).
* || check_file_test -c -r -w /dev/null /dev/zero
# * || check_file_stat -m 0666 -u 0 -g 0 -t 1 -T 3 /dev/null
# Make sure there's relatively recent activity from the syslog.
# * || check_file_stat -n 7200 /var/log/messages
# Validate a couple important accounts in the passwd file.
# * || check_file_contents /etc/passwd "/^root:x:0:0:/" "sshd:*"
# Nvidia device files /dev/nvidia[0-9], tested with the same -c -r -w flags as
# /dev/null above. The first group (the hostmask the physmem rules call the
# P100s) expects four devices; c0[236-255] expects two.
{c00[97-99],c0[100-114]} || check_file_test -c -r -w /dev/nvidia0 /dev/nvidia1 /dev/nvidia2 /dev/nvidia3
{c0[236-255]} || check_file_test -c -r -w /dev/nvidia0 /dev/nvidia1
# Check that LDAP is resolving correctly on the node
# Runs `getent group atlab` on every node and expects return value 0 (-r 0);
# -t 5 is the timeout given to the check.
* || check_cmd_status -t 5 -r 0 getent group atlab
#######################################################################
###
### Process checks
###
# Everybody needs sshd running, right? But don't use -r (restart)!
* || check_ps_service -u root -S sshd
# slurmd should be running too (same options as the sshd rule).
* || check_ps_service -u root -S slurmd
## Disabled: this isn't working -- it succeeds even when the mount name is mistyped.
# * || check_ps_service -d mmfsd data.mount
# * || check_ps_service -d mmfsd scratch.mount
# Check for wulfd but don't manage it.
# * || check_ps_daemon wulfd root
# Make sure no users are SSH'd in, but don't kill them.
# * || check_ps_blacklist sshd '!root'
# Flag and kill any processes which are owned by unauthorized users.
# * || check_ps_unauth_users log syslog kill
# Flag any user processes not properly parented.
##MJH## ## Disabling for now as this check is false flagging processes(?) by users who
##MJH## ## have an active job on the node
##MJH## ## Need to investigate how this check works
##MJH## * || check_ps_userproc_lineage log syslog
# Most systems also need NFS locking services.
* || check_ps_service -d rpc.statd -r nfslock
# The audit daemon can sometimes disappear if things get hairy.
# * || check_ps_service -r auditd
# This is only valid for RHEL6 and similar/newer systems.
* || check_ps_service -d rsyslogd -r rsyslog
# In the case of MySQL, it's typically better to cycle.
# * || check_ps_service -c mysqld
# Double your core count is a good rule of thumb for load average max.
# * || check_ps_loadavg 24
# This should work if you place it after one of the check_hw_*() checks.
# * || check_ps_loadavg $((2*HW_CORES))
# Active rule: the limit here is 1x HW_CORES, not the doubled rule of thumb above.
* || check_ps_loadavg $((1*HW_CORES))
# Ensure that NTP is synchronized: the output of timedatectl must match
# 'NTP synchronized: yes'.
# NOTE(review): newer systemd releases appear to label this line
# "System clock synchronized:" instead -- confirm the match string against
# the timedatectl output on each OS image this file is deployed to.
* || check_cmd_output -t 2 -m 'NTP synchronized: yes' -e 'timedatectl'
#######################################################################
###
### Other checks
###
# Check to verify that SELinux is disabled. (Remove the "-r 1" to verify it's enabled.)
# * || check_cmd_status -t 1 -r 1 selinuxenabled
# Verify settings for an Ethernet interface.
# * || check_cmd_output -m '/addr:10\.0\.0\.1/' -m '/Bcast:10\.0\.0\.255/' -m '/Mask:255\.255\.255\.0/' -m '/^[[:space:]]*UP /' /sbin/ifconfig eth3
# nVidia HealthMon GPU health checks (requires Tesla Development Kit)
# * || check_nv_healthmon
# Check for high rate of context switching.
# NOTE(review): uabrc_check_hw_context_switch_rate is not defined in this
# file; presumably 300000 is the rate threshold and 5m the sampling window --
# confirm against the check's own definition.
* || uabrc_check_hw_context_switch_rate 300000 5m