Skip to content
Snippets Groups Projects
Commit 59c670b2 authored by Mike Hanby's avatar Mike Hanby
Browse files

Add A100 and other Missing Nodes to NHC

Add c0097 - c0201 and c0236 - c0255 to NHC

Add `/dev/nvidia*` checks
parent 19a265e1
No related branches found
No related tags found
1 merge request: !4 Add A100 and other Missing Nodes to NHC
......@@ -19,12 +19,16 @@
### Hardware checks
###
# Set these to your correct socket, core, and thread counts.
{c00[97-99],c0[100-114]} || check_hw_cpuinfo 2 28 48
{c0[115-149]} || check_hw_cpuinfo 2 24 24
{c0[150-201]} || check_hw_cpuinfo 2 48 48
{c0[202-235]} || check_hw_cpuinfo 2 128 128
# Set these to the amount of physical RAM you have (leave the fudge factor).
{c00[97-99],c0[100-114]} || check_hw_physmem 256gb 256gb 5%
{c0[115-149]} || check_hw_physmem 192gb 192gb 5%
{c0[150-201]} || check_hw_physmem 768gb 768gb 5%
{c0[202-235]} || check_hw_physmem 512gb 512gb 5%
{c0[202-255]} || check_hw_physmem 512gb 512gb 5%
# Set these to the amount of swap you have (leave the fudge factor).
# * || check_hw_swap 8g 8g 3%
......@@ -42,7 +46,7 @@
* || check_hw_mem_free 1g
# Checks that there's a 100 Gb/s IB interface that's ACTIVE and shows LinkUp.
{c0[150-235]} || check_hw_ib 100
{c00[97-99],c0[100-255]} || check_hw_ib 100
# Checks for an active Myrinet interface named "myri0."
# * || check_hw_gm myri0
......@@ -75,20 +79,20 @@
* || check_fs_free / 10%
* || check_fs_ifree / 1k
# /local FS
{c0[150-235]} || check_fs_mount_rw -f /local
{c0[150-235]} || check_fs_free /local 10%
{c00[97-99],c0[100-255]} || check_fs_mount_rw -f /local
{c00[97-99],c0[100-255]} || check_fs_free /local 10%
# /tmp FS
{c0[150-235]} || check_fs_mount_rw -f /tmp
{c0[150-235]} || check_fs_free /tmp 20%
{c00[97-99],c0[100-255]} || check_fs_mount_rw -f /tmp
{c00[97-99],c0[100-255]} || check_fs_free /tmp 20%
# /var FS
{c0[150-235]} || check_fs_mount_rw -f /var
{c0[150-235]} || check_fs_free /var 10%
{c00[97-99],c0[100-255]} || check_fs_mount_rw -f /var
{c00[97-99],c0[100-255]} || check_fs_free /var 10%
# /data FS
{c0[150-235]} || check_fs_mount_rw -t gpfs -f /data
{c0[150-235]} || check_file_test -r -e -f /data/.nhc-test
{c00[97-99],c0[100-255]} || check_fs_mount_rw -t gpfs -f /data
{c00[97-99],c0[100-255]} || check_file_test -r -e -f /data/.nhc-test
# /scratch FS
{c0[150-235]} || check_fs_mount_rw -t gpfs -f /scratch
{c0[150-235]} || check_file_test -r -e -f /scratch/.nhc-test
{c00[97-99],c0[100-255]} || check_fs_mount_rw -t gpfs -f /scratch
{c00[97-99],c0[100-255]} || check_file_test -r -e -f /scratch/.nhc-test
# /rstore/share FS
##MJH## * || check_fs_mount -s 192.168.200.19:6789,192.168.200.20:6789,192.168.200.21:6789:/share -t ceph -f /rstore/share
# /cm/shared
......@@ -138,6 +142,9 @@
# Validate a couple important accounts in the passwd file.
# * || check_file_contents /etc/passwd "/^root:x:0:0:/" "sshd:*"
# Check that the NVIDIA GPU device nodes (/dev/nvidia[0-9]) exist as character devices and are readable and writable.
{c00[97-99],c0[100-114]} || check_file_test -c -r -w /dev/nvidia0 /dev/nvidia1 /dev/nvidia2 /dev/nvidia3
{c0[236-255]} || check_file_test -c -r -w /dev/nvidia0 /dev/nvidia1
# Check that LDAP is resolving correctly on the node
* || check_cmd_status -t 5 -r 0 getent group atlab
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment