diff --git a/nhc.conf b/nhc.conf index 61436e0a1adf74df01a9a1a96afac7757210578c..631d412a5c3e21cebc2bc11b99caf85b0fa8b65a 100644 --- a/nhc.conf +++ b/nhc.conf @@ -19,12 +19,16 @@ ### Hardware checks ### # Set these to your correct socket, core, and thread counts. + {c00[97-99],c0[100-114]} || check_hw_cpuinfo 2 28 48 + {c0[115-149]} || check_hw_cpuinfo 2 24 24 {c0[150-201]} || check_hw_cpuinfo 2 48 48 {c0[202-235]} || check_hw_cpuinfo 2 128 128 # Set these to the amount of physical RAM you have (leave the fudge factor). + {c00[97-99],c0[100-114]} || check_hw_physmem 256gb 256gb 5% + {c0[115-149]} || check_hw_physmem 192gb 192gb 5% {c0[150-201]} || check_hw_physmem 768gb 768gb 5% - {c0[202-235]} || check_hw_physmem 512gb 512gb 5% + {c0[202-255]} || check_hw_physmem 512gb 512gb 5% # Set these to the amount of swap you have (leave the fudge factor). # * || check_hw_swap 8g 8g 3% @@ -42,7 +46,7 @@ * || check_hw_mem_free 1g # Checks that there's a QDR IB interface that's ACTIVE and shows LinkUp. - {c0[150-235]} || check_hw_ib 100 + {c00[97-99],c0[100-255]} || check_hw_ib 100 # Checks for an active Myrinet interface named "myri0." # * || check_hw_gm myri0 @@ -75,20 +79,20 @@ * || check_fs_free / 10% * || check_fs_ifree / 1k # /local FS - {c0[150-235]} || check_fs_mount_rw -f /local - {c0[150-235]} || check_fs_free /local 10% + {c00[97-99],c0[100-255]} || check_fs_mount_rw -f /local + {c00[97-99],c0[100-255]} || check_fs_free /local 10% # /tmp FS - {c0[150-235]} || check_fs_mount_rw -f /tmp - {c0[150-235]} || check_fs_free /tmp 20% + {c00[97-99],c0[100-255]} || check_fs_mount_rw -f /tmp + {c00[97-99],c0[100-255]} || check_fs_free /tmp 20% # /var FS - {c0[150-235]} || check_fs_mount_rw -f /var - {c0[150-235]} || check_fs_free /var 10% + {c00[97-99],c0[100-255]} || check_fs_mount_rw -f /var + {c00[97-99],c0[100-255]} || check_fs_free /var 10% # /data FS - {c0[150-235]} || check_fs_mount_rw -t gpfs -f /data - {c0[150-235]} || check_file_test -r -e -f /data/.nhc-test + {c00[97-99],c0[100-255]} || check_fs_mount_rw -t gpfs -f /data + {c00[97-99],c0[100-255]} || check_file_test -r -e -f /data/.nhc-test # /scratch FS - {c0[150-235]} || check_fs_mount_rw -t gpfs -f /scratch - {c0[150-235]} || check_file_test -r -e -f /scratch/.nhc-test + {c00[97-99],c0[100-255]} || check_fs_mount_rw -t gpfs -f /scratch + {c00[97-99],c0[100-255]} || check_file_test -r -e -f /scratch/.nhc-test # /rstore/share FS ##MJH## * || check_fs_mount -s 192.168.200.19:6789,192.168.200.20:6789,192.168.200.21:6789:/share -t ceph -f /rstore/share # /cm/shared @@ -138,6 +142,9 @@ # Validate a couple important accounts in the passwd file. # * || check_file_contents /etc/passwd "/^root:x:0:0:/" "sshd:*" +# Nvidia /dev/nvidia[0-9] + {c00[97-99],c0[100-114]} || check_file_test -c -r -w /dev/nvidia0 /dev/nvidia1 /dev/nvidia2 /dev/nvidia3 + {c0[236-255]} || check_file_test -c -r -w /dev/nvidia0 /dev/nvidia1 # Check that LDAP is resolving correctly on the node * || check_cmd_status -t 5 -r 0 getent group atlab