Skip to content
Snippets Groups Projects
Commit f78e856e authored by Matthew K Defenderfer's avatar Matthew K Defenderfer
Browse files

Merge branch 'add-submit-pol-wrapper' into 'main'

Add submit pol wrapper

Closes #12

See merge request !11
parents 122e4f84 cbb2a739
No related branches found
No related tags found
1 merge request!11Add submit pol wrapper
...@@ -4,3 +4,4 @@ slurm-* ...@@ -4,3 +4,4 @@ slurm-*
out/ out/
err/ err/
*.sif *.sif
__pycache__
\ No newline at end of file
# exceptions.py
import sys
class CustomException(Exception):
    """Base class for this tool's fatal, user-facing errors.

    Constructing an instance reports the message and terminates the
    interpreter (see ``handle_exception``), so ``raise CustomException(msg)``
    exits during construction and the exception itself never propagates.
    """

    def __init__(self, message):
        super().__init__(message)
        self.handle_exception(message)

    def handle_exception(self, message):
        """Print ``Error: <message>`` to stderr and exit with status 1."""
        # Diagnostics belong on stderr so they never mix with regular output
        # that a caller may be capturing from stdout.
        print(f"Error: {message}", file=sys.stderr)
        sys.exit(1)
class InvalidDeviceError(CustomException):
    """Signals that the device argument is not an acceptable fileset or path."""
class ValueError(CustomException):
    """Project-specific stand-in for the built-in ``ValueError``.

    The name intentionally shadows the built-in in any module that imports
    it. Because it derives from ``CustomException``, creating one follows
    that class's handling (report the message and exit) when an improper
    value is given.
    """
\ No newline at end of file
File moved
#!/bin/bash #!/bin/bash
set -euxo pipefail
# run an mmapply policy across the cluster via slurm # run an mmapply policy across the cluster via slurm
# gather info to map mmapplypolicy to runtime configuration # gather info to map mmapplypolicy to runtime configuration
...@@ -35,14 +37,6 @@ echo $cmd ...@@ -35,14 +37,6 @@ echo $cmd
# run policy command # run policy command
$cmd $cmd
rcode=$?
if [ $rcode -ne 0 ]
then
echo error: mmapplypoicy failed: code $rcode
exit $rcode
fi
# tag output file with run metadata # tag output file with run metadata
outfile=`ls -t $tmpglobal | head -1` outfile=`ls -t $tmpglobal | head -1`
if [[ "$outfile" != "" ]] if [[ "$outfile" != "" ]]
......
#!/bin/python3
import argparse
from pathlib import Path
import subprocess
import re
from exceptions import InvalidDeviceError, ValueError
description = """
Interface for non-privileged users to execute the run-mmpol.sh script with elevated permissions. Calls the
submit-pol-job wrapper. The default applied policy is ./policy-def/list-path-external but can be changed to
./policy-def/list-path-dirplus using the --with-dirs flag. No other policy is available via this script.
"""


def parse_args(argv=None):
    """Parse the command line and return the options as a plain dict.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, which is the original behavior.

    Returns:
        dict keyed by argparse destination: ``outdir``, ``outfile``,
        ``with_dirs``, ``nodes``, ``cores``, ``partition``, ``time``,
        ``mem_per_cpu`` and ``device``. ``outfile`` is ``None`` when the
        option is not supplied.
    """
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawTextHelpFormatter,
    )

    # NOTE: adjacent string literals are concatenated verbatim, so every
    # fragment below ends with an explicit space before the next one begins.
    parser.add_argument('-o', '--outdir', type=str,
                        help="Directory to store the policy output in",
                        default='/data/rc/gpfs-policy/data')
    parser.add_argument('-f', '--outfile', type=str,
                        help="Base name of the output file. Defaults to 'list-policy_[device-id]' where 'device-id' "
                             "is the device stem when device is a path or just device when it is a fileset. The final "
                             "file name will have the policy type, the job ID, and the run date tagged on the end")
    parser.add_argument('--with-dirs', action='store_true',
                        help="Include directories as entries in the policy output (Default: false)")

    sbatch = parser.add_argument_group('sbatch parameters')
    sbatch.add_argument('-N', '--nodes', type=int, default=1,
                        help='Number of nodes to run job across')
    sbatch.add_argument('-c', '--cores', type=int, default=16,
                        help='Number of cores to request')
    sbatch.add_argument('-p', '--partition', type=str, default='amd-hdr100,medium',
                        help='Partition to submit job to. Can be a comma-separated list of multiple partitions')
    sbatch.add_argument('-t', '--time', type=str, default='24:00:00',
                        help='Time limit for job formatted as [D-]HH:MM:SS')
    sbatch.add_argument('-m', '--mem-per-cpu', type=str, default='8G',
                        help='Amount of RAM to allocate per core')

    parser.add_argument('device', type=str,
                        help="GPFS fileset/directory to apply the policy to. Can be specified as either the name of the "
                             "fileset or the full path to the directory. (Examples: scratch, /data/user/[username])")

    return vars(parser.parse_args(argv))
# Validate that the string supplied to 'device' is either the name of a predefined fileset ('scratch' or 'home')
# or a path located under /data, /home, or /scratch. The full 'data' fileset itself is never accepted.
def validate_device(device):
    """Resolve ``device`` to an absolute path or reject it.

    Accepted inputs:
      * the bare fileset names ``scratch`` and ``home`` (mapped to ``/scratch``
        and ``/home``);
      * the top-level locations ``/scratch``, ``/home``, ``/data/user`` and
        ``/data/project``, in any spelling that resolves to them (for example
        with a trailing slash);
      * any existing path located below ``/data``, ``/home`` or ``/scratch``.

    Args:
        device: Fileset name or path as typed by the user.

    Returns:
        pathlib.Path: the resolved absolute path.

    Raises:
        InvalidDeviceError: if ``device`` refers to the whole ``data`` fileset,
            does not exist, or lies outside the permitted locations.
    """
    device = device.strip()

    if device in ('scratch', 'home'):
        return Path('/').joinpath(device).resolve()

    p = Path(device).resolve()  # resolve given path into absolute path

    # Compare the resolved path as well as the raw string so that spellings
    # such as '/data/' get the specific message rather than the generic one.
    if device == 'data' or p == Path('/data').resolve():
        raise InvalidDeviceError("A policy run cannot be performed on the full 'data' fileset. Choose a valid subdirectory such as '/data/user' or '/data/project'")

    # Top-level locations are accepted without an existence check; matching on
    # resolved paths keeps '/scratch' and '/scratch/' equivalent.
    top_levels = [Path(root).resolve() for root in ('/scratch', '/home', '/data/user', '/data/project')]
    if p in top_levels:
        return p

    # Anything else must exist and sit somewhere below a permitted parent.
    valid_parents = [Path(parent).resolve() for parent in ('/data', '/home', '/scratch')]
    if p.exists() and any(parent in p.parents for parent in valid_parents):
        return p

    raise InvalidDeviceError(f'The path or fileset {device} does not exist within /data, /home, or /scratch')
def validate_time(time):
    """Check that ``time`` is a wall-time string this tool accepts.

    Accepted forms are ``H:MM:SS`` with one to three hour digits, or
    ``D-HH:MM:SS`` with a single day digit from 0 to 6.

    Args:
        time: Time limit string supplied by the user.

    Raises:
        ValueError: if ``time`` does not have one of the accepted forms.
    """
    # fullmatch (rather than match + '$') rejects a trailing newline, and
    # re.ASCII limits \d to 0-9; the value is later placed on a command line.
    if not re.fullmatch(r'(?:[0-6]-\d{2}|\d{1,3}):\d{2}:\d{2}', time, flags=re.ASCII):
        raise ValueError("Time must have format [[H]H]H:MM:SS or D-HH:MM:SS")
def validate_mem(mem):
    """Ensure ``mem`` is a run of digits optionally suffixed with ``M`` or ``G``.

    Raises:
        ValueError: when ``mem`` does not follow that pattern.
    """
    well_formed = re.fullmatch(r'^[\d]+[GM]?$', mem) is not None
    if well_formed:
        return
    raise ValueError("Mem per CPU must be an integer. May be followed by M or G to denote units")
def validate_partition(partition):
    """Validate a partition specification and return it without whitespace.

    Args:
        partition: A partition name or a comma-separated list of names.

    Returns:
        str: ``partition`` with all whitespace removed.

    Raises:
        ValueError: if several whitespace-separated names are given without a
            comma, or if any name (including an empty one) is not listed in
            the output of ``sinfo -h -o "%R"``.
    """
    # str.split() with no argument splits on runs of whitespace; splitting on
    # the literal string '\s' would never detect a space-separated list.
    if len(partition.split()) > 1 and ',' not in partition:
        raise ValueError("Multiple partitions should be given as a comma-separated list")

    partition = re.sub(r'\s', r'', partition)
    partitions = partition.split(r',')

    cmd = r'sinfo -h -o "%R"'
    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE)
    avail_partitions = set(result.stdout.decode('utf-8').splitlines())

    # Test the list itself, not the truthiness of its members, so that an
    # empty name such as '' or the gap in 'a,,b' is reported as invalid too.
    incorrect = [p for p in partitions if p not in avail_partitions]
    if incorrect:
        raise ValueError(f"The following partition(s) do not exist: {', '.join(incorrect)}. To see a list of valid partitions, visit https://rc.uab.edu/")

    return partition
def validate_nodes(n):
    """Check that the node count is an integer from 1 to 4 inclusive.

    Args:
        n: Requested number of nodes.

    Raises:
        ValueError: if ``n`` is not an int or lies outside 1..4.
    """
    # The negation must cover the whole condition; 'not isinstance(...) and ...'
    # is always False for an int and therefore never enforced the range.
    if not (isinstance(n, int) and 1 <= n <= 4):
        raise ValueError('Nodes must be an integer between 1 and 4')
def validate_cores(n):
    """Check that the core count is an integer from 1 to 48 inclusive.

    Args:
        n: Requested number of cores.

    Raises:
        ValueError: if ``n`` is not an int or lies outside 1..48.
    """
    # The negation must cover the whole condition; 'not isinstance(...) and ...'
    # is always False for an int and therefore never enforced the range.
    if not (isinstance(n, int) and 1 <= n <= 48):
        raise ValueError('Cores must be an integer between 1 and 48')
# The output directory has to exist already; it is never created here.
def validate_output_directory(outdir):
    """Return the resolved form of ``outdir`` if it is an existing directory.

    Raises:
        ValueError: when the resolved path is not a directory.
    """
    resolved = Path(outdir).resolve()
    if resolved.is_dir():
        return resolved
    raise ValueError(f"{resolved} is not a valid output directory")
def create_default_outfile(device):
    """Build the default output base name for a device path.

    ``/data/user`` gets the fixed tag ``data_user``; every other path is
    tagged with its ``stem``.
    """
    tag = 'data_user' if device.match('/data/user') else device.stem
    return f'list-policy_{tag}'
def main():
    """Validate the command-line options and invoke ``./submit-pol-job``.

    Each option is checked by its ``validate_*`` helper, a default output
    file name is derived when none was given, and the policy file is chosen
    from the ``--with-dirs`` flag. The process then exits with the return
    code of ``./submit-pol-job``.
    """
    import shlex  # local import: only needed to render the command for display

    args = parse_args()

    args['device'] = validate_device(args['device'])
    validate_nodes(args['nodes'])
    validate_cores(args['cores'])
    validate_mem(args['mem_per_cpu'])
    validate_time(args['time'])
    args['partition'] = validate_partition(args['partition'])
    args['outdir'] = validate_output_directory(args['outdir'])

    if args['outfile'] is None:
        args['outfile'] = create_default_outfile(args['device'])

    if args['with_dirs']:
        args['policy'] = './policy-def/list-path-dirplus'
    else:
        args['policy'] = './policy-def/list-path-external'

    # Pass an argument vector rather than a shell string: outfile is free-form
    # user input and paths may contain shell metacharacters, so nothing here
    # may be interpreted by a shell.
    cmd = [
        './submit-pol-job',
        '-o', str(args['outdir']),
        '-f', str(args['outfile']),
        '-P', args['policy'],
        '-N', str(args['nodes']),
        '-c', str(args['cores']),
        '-p', args['partition'],
        '-t', args['time'],
        '-m', args['mem_per_cpu'],
        str(args['device']),
    ]

    print(f"Command: {' '.join(shlex.quote(part) for part in cmd)}")
    result = subprocess.run(cmd)

    # Propagate the wrapper's status instead of always exiting with 0.
    raise SystemExit(result.returncode)


if __name__ == '__main__':
    main()
\ No newline at end of file
#!/bin/bash
# Abort on errors, unset variables and failed pipeline stages; trace commands.
set -o errexit -o nounset -o xtrace -o pipefail

############################################################
# Default Values                                           #
############################################################
# Each value below can be overridden by the matching command-line option.
nodes='1'
cores='16'
mem_per_cpu='8G'
time='24:00:00'
partition='amd-hdr100,medium'
outdir='/data/rc/gpfs-policy/data'
policy='./policy-def/list-path-external'
outfile=''
############################################################
# Help #
############################################################
# Print a short synopsis to stderr and exit with status 1.
# The option list mirrors what the getopt call in this script accepts
# (-P | --policy; there is no --with-dirs option here).
usage()
{
>&2 cat << EOF
Usage: $0 [ -h ] [ -o | --outdir ] [ -f | --outfile ] [ -P | --policy ]
          [ -N | --nodes ] [ -c | --cores ] [ -p | --partition ]
          [ -t | --time ] [ -m | --mem-per-cpu ]
          device
EOF
exit 1
}
# Print the full help text to stderr and exit with status 0.
help()
{
>&2 cat << EOF
Wraps the run-mmpol.sh script for applying a policy file. Must be run directly
as root or via the run-submit-pol-job.py script. The default policy file is
./policy-def/list-path-external

Usage: $0 [ -h ] [ -o | --outdir ] [ -f | --outfile ] [ -P | --policy ]
          [ -N | --nodes ] [ -c | --cores ] [ -p | --partition ]
          [ -t | --time ] [ -m | --mem-per-cpu ]
          device

options:
    -h|--help           Print this Help.

  Required:
    device              GPFS fileset/directory to apply the policy to. Can be
                        specified as either the name of the fileset or the
                        full path to the directory
                        (Examples: scratch, /data/user/[username])

  Path:
    -o|--outdir         Parent directory to save policy output to
                        (default: /data/rc/gpfs-policy/data)
    -f|--outfile        Name of the policy output file

  Policy Options:
    -P|--policy         Path to policy file to apply to the given GPFS device
                        (default: ./policy-def/list-path-external)

  sbatch options:
    -N|--nodes          Number of nodes to run the job on (default: 1)
    -c|--cores          Number of cores (default: 16)
    -p|--partition      Partition to submit tasks to
                        (default: amd-hdr100,medium)
    -t|--time           Max walltime (default: 24:00:00)
    -m|--mem-per-cpu    RAM per core (default: 8G)
EOF
exit 0
}
# Normalize the command line with getopt(1). The call is tested directly in
# the 'if' because, with 'set -e' active, a bare failing assignment would
# abort the script before a separate '$?' check could run.
# The long option is declared as 'mem-per-cpu' so it matches the case arm
# below; getopt still accepts unambiguous abbreviations such as --mem.
if ! args=$(getopt -a -o ho:f:P:N:c:p:t:m: --long help,outdir:,outfile:,policy:,nodes:,cores:,partition:,time:,mem-per-cpu: -- "$@"); then
  usage
fi

# getopt emits shell-quoted words; quote the expansion so eval sees them
# exactly as emitted (no word splitting or globbing beforehand).
eval set -- "${args}"

while :
do
  case $1 in
    -h | --help)           help ;;
    -o | --outdir)         outdir=$2      ; shift 2 ;;
    -f | --outfile)        outfile=$2     ; shift 2 ;;
    -P | --policy)         policy=$2      ; shift 2 ;;
    -N | --nodes)          nodes=$2       ; shift 2 ;;
    -c | --cores)          cores=$2       ; shift 2 ;;
    -p | --partition)      partition=$2   ; shift 2 ;;
    -t | --time)           time=$2        ; shift 2 ;;
    -m | --mem-per-cpu)    mem_per_cpu=$2 ; shift 2 ;;
    --) shift; break ;;
    *) >&2 echo "Unsupported option: $1"
       usage ;;
  esac
done

# The device is the one required positional argument.
if [[ $# -eq 0 ]]; then
  usage
fi

device="$1"

# Ensure the device is not an empty string
if [[ -z "$device" ]]; then
  echo "Error: Specify either the name of a fileset or a directory path"
  usage
fi
# Slurm output file: %A is filled in by sbatch; the policy and device base
# names identify the run.
slurm_out="out/pol-%A-$(basename "${policy}")-$(basename "${device}").out"

# Hand the run configuration to run-mmpol.sh through the environment of the
# sbatch command. Every expansion is quoted so values containing spaces or
# glob characters reach sbatch as single, unmodified arguments.
DIR="${outdir}" POLICYFILE="${policy}" FILESYSTEM="${device}" OUTFILE="${outfile}" \
sbatch \
  -N "${nodes}" \
  -c "${cores}" \
  -t "${time}" \
  --mem-per-cpu="${mem_per_cpu}" \
  -p "${partition}" \
  -o "${slurm_out}" \
  ./run-mmpol.sh
#!/bin/bash
# Schedule a policy run on GPFS by submitting run-mmpol.sh through sbatch.
#
# Positional arguments:
#   $1  outdir      exported to the job as DIR
#   $2  policy      exported to the job as POLICYFILE
#   $3  nodes       sbatch -N
#   $4  cores       sbatch -c
#   $5  ram         sbatch --mem-per-cpu
#   $6  partition   sbatch -p
#   $7  filesystem  exported as FILESYSTEM (default: scratch)
#   $8  time        sbatch -t (default: 60)
#   $9  outfile     exported as OUTFILE (optional)

outdir=$1
policy=$2
nodes=$3
cores=$4
ram=$5
partition=$6
filesystem=${7:-scratch}
time=${8:-60}
outfile=${9}

# Expansions are quoted so values containing spaces or glob characters are
# passed through as single, unmodified arguments.
DIR="${outdir}" POLICYFILE="${policy}" FILESYSTEM="${filesystem}" OUTFILE="${outfile}" \
sbatch \
  -N "${nodes}" \
  -c "${cores}" \
  -t "${time}" \
  --mem-per-cpu="${ram}" \
  -p "${partition}" \
  ./run-mmpol.sh
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment