Merge branch 'add-submit-pol-wrapper' into 'main'

Add submit pol wrapper Closes #12 See merge request !11

Merge branch 'add-submit-pol-wrapper' into 'main'
Add submit pol wrapper Closes #12 See merge request !11
f78e856e · Matthew K Defenderfer · 122e4f84 · cbb2a739 · f78e856e · f78e856e
Commit f78e856e authored 8 months ago by Matthew K Defenderfer
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ slurm-*
 out/
 err/
 *.sif
+__pycache__
\ No newline at end of file
--- a/src/run-policy/exceptions.py
+++ b/src/run-policy/exceptions.py
+# exceptions.py
+import sys
+class CustomException(Exception):
+    """Base class for this tool's fatal, user-facing errors.
+
+    Constructing an instance immediately reports the message and terminates
+    the process (see :meth:`handle_exception`), so ``raise SomeSubclass(msg)``
+    ends the program with a one-line error instead of a traceback.
+    """
+
+    def __init__(self, message):
+        super().__init__(message)
+        self.handle_exception(message)
+
+    def handle_exception(self, message):
+        """Print ``Error: <message>`` to stderr and exit with status 1."""
+        # Diagnostics go to stderr so that stdout carries only normal output.
+        print(f"Error: {message}", file=sys.stderr)
+        sys.exit(1)
+class InvalidDeviceError(CustomException):
+    """Signals that the device argument is not an acceptable fileset or path."""
+class ValueError(CustomException):
+    """Exit-on-construction stand-in used when an argument has a bad value.
+
+    NOTE(review): importing this class by name replaces the built-in
+    ``ValueError`` in the importing module; it derives from CustomException,
+    not from the built-in, so ``except`` clauses for the built-in will not
+    catch it.
+    """
\ No newline at end of file
--- a/policy/list-30day-with-excludes
+++ b/policy/list-30day-with-excludes
--- a/policy/list-path
+++ b/policy/list-path
--- a/policy/list-path-dirplus
+++ b/policy/list-path-dirplus
--- a/policy/list-path-external
+++ b/policy/list-path-external
--- a/run-mmpol.sh
+++ b/run-mmpol.sh
 #!/bin/bash
+set -euxo pipefail
 # run an mmapply policy across the cluster via slurm
 # gather info to map mmapplypolicy to runtime configuration
@@ -35,14 +37,6 @@ echo $cmd
 # run policy command
 $cmd
-rcode=$?
-if [ $rcode -ne 0 ]
-then
-  echo error: mmapplypoicy failed: code $rcode
-  exit $rcode
-fi
 # tag output file with run metadata
 outfile=`ls -t $tmpglobal | head -1`
 if [[ "$outfile" != "" ]]

--- a/src/run-policy/run-submit-pol-job.py
+++ b/src/run-policy/run-submit-pol-job.py
+#!/bin/python3
+import argparse
+from pathlib import Path
+import subprocess
+import re
+from exceptions import InvalidDeviceError, ValueError
+description = """
+Interface for non-privileged users to execute the run-mmpol.sh script with elevated permissions. Calls the 
+submit-pol-job wrapper. The default applied policy is ./policy-def/list-path-external but can be changed to 
+./policy-def/list-path-dirplus using the --with-dirs flag. No other policy is available via this script.
+"""
+def parse_args():
+    """Parse the command line and return the options as a plain dict.
+
+    Keys are the argparse destinations: ``outdir``, ``outfile``, ``with_dirs``,
+    ``nodes``, ``cores``, ``partition``, ``time``, ``mem_per_cpu`` and
+    ``device``. ``outfile`` is ``None`` when not supplied.
+    """
+    parser = argparse.ArgumentParser(description=description,
+                                     formatter_class=argparse.RawTextHelpFormatter
+                                     )
+    parser.add_argument('-o','--outdir', type=str,
+                        help="Directory to store the policy output in",
+                        default='/data/rc/gpfs-policy/data')
+    # Adjacent string literals are joined with no separator, so each fragment
+    # below ends with an explicit space.
+    parser.add_argument('-f','--outfile', type=str,
+                        help="Base name of the output file. Defaults to 'list-policy_[device-id]' where 'device-id' "
+                             "is the device stem when device is a path or just device when it is a fileset. The final "
+                             "file name will have the policy type, the job ID, and the run date tagged on the end")
+    parser.add_argument('--with-dirs', action='store_true',
+                        help="Include directories as entries in the policy output (Default: false)")
+    sbatch = parser.add_argument_group('sbatch parameters')
+    sbatch.add_argument('-N','--nodes',type=int,default=1,
+                        help='Number of nodes to run job across')
+    sbatch.add_argument('-c','--cores',type=int,default=16,
+                        help='Number of cores to request')
+    sbatch.add_argument('-p','--partition',type=str,default='amd-hdr100,medium',
+                        help='Partition to submit job to. Can be a comma-separated list of multiple partitions')
+    sbatch.add_argument('-t','--time',type=str,default='24:00:00',
+                        help='Time limit for job formatted as [D-]HH:MM:SS')
+    sbatch.add_argument('-m','--mem-per-cpu',type=str,default='8G',
+                        help='Amount of RAM to allocate per core')
+    parser.add_argument('device',type=str,
+                        help="GPFS fileset/directory to apply the policy to. Can be specified as either the name of the "
+                             "fileset or the full path to the directory. (Examples: scratch, /data/user/[username])")
+    return vars(parser.parse_args())
+# Validate that the string supplied to 'device' is either the name of a predefined fileset ('scratch' or 'home'),
+# one of the top-level paths /scratch, /home, /data/user, or /data/project, or an existing path located under
+# /data, /home, or /scratch. 'data' and '/data' alone are rejected.
+def validate_device(device):
+    """Return the resolved absolute path for an allowed *device*.
+
+    Accepted values are the fileset names ``scratch`` and ``home``, the paths
+    ``/scratch``, ``/home``, ``/data/user`` and ``/data/project``, or any
+    existing path located under ``/data``, ``/home`` or ``/scratch``.
+
+    Raises InvalidDeviceError for ``data``/``/data`` themselves and for any
+    path that does not exist or lies outside the allowed roots.
+    """
+    device = device.strip()
+    if device in ('data', '/data'):
+        raise InvalidDeviceError("A policy run cannot be performed on the full 'data' fileset. Choose a valid subdirectory such as '/data/user' or '/data/project'")
+    if device in ('scratch', 'home'):
+        return Path('/').joinpath(device).resolve()
+    if device in ('/scratch', '/home', '/data/user', '/data/project'):
+        return Path(device).resolve()
+    p = Path(device).resolve()  # resolve given path into absolute path
+    # The allowed roots are resolved too, so the ancestry test compares like
+    # with like if a root is itself a symlink.
+    valid_parents = [Path(parent).resolve() for parent in ('/data', '/home', '/scratch')]
+    if p.exists() and any(parent in p.parents for parent in valid_parents):
+        return p
+    raise InvalidDeviceError(f'The path or fileset {device} does not exist within /data, /home, or /scratch')
+def validate_time(time):
+    """Raise ValueError unless *time* is H:MM:SS (1-3 hour digits) or D-HH:MM:SS with D in 0-6."""
+    time_pattern = r'^(?:[0-6]-\d{2}|\d{1,3}):\d{2}:\d{2}$'
+    if re.match(time_pattern, time) is None:
+        raise ValueError("Time must have format [[H]H]H:MM:SS or D-HH:MM:SS")
+def validate_mem(mem):
+    """Raise ValueError unless *mem* is a run of digits optionally followed by ``G`` or ``M``."""
+    is_valid = re.fullmatch(r'^[\d]+[GM]?$', mem) is not None
+    if is_valid:
+        return
+    raise ValueError("Mem per CPU must be an integer. May be followed by M or G to denote units")
+def validate_partition(partition):
+    """Validate a partition list and return it with all whitespace removed.
+
+    *partition* is one partition name or a comma-separated list of names.
+    Each name is checked against the output of ``sinfo -h -o "%R"``.
+
+    Raises ValueError when several names are separated only by whitespace,
+    or when any listed name (including an empty one, e.g. from a trailing
+    comma) is not reported by ``sinfo``.
+    """
+    # split() with no argument splits on runs of whitespace; passing '\s'
+    # would look for a literal backslash followed by 's'.
+    if len(partition.split()) > 1 and ',' not in partition:
+        raise ValueError("Multiple partitions should be given as a comma-separated list")
+    partition = re.sub(r'\s',r'',partition)
+    partitions = partition.split(',')
+    cmd=r'sinfo -h -o "%R"'
+    sinfo_out = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE).stdout.decode('utf-8')
+    avail_partitions = set(sinfo_out.splitlines())
+    incorrect = [p for p in partitions if p not in avail_partitions]
+    if incorrect:
+        raise ValueError(f"The following partition(s) do not exist: {', '.join(incorrect)}. To see a list of valid partitions, visit https://rc.uab.edu/")
+    return partition
+def validate_nodes(n):
+    """Raise ValueError unless *n* is an int in the inclusive range 1-4."""
+    # The whole "is a valid int in range" test must be negated as a unit;
+    # negating only the isinstance() call lets every int through.
+    if not (isinstance(n, int) and 1 <= n <= 4):
+        raise ValueError('Nodes must be an integer between 1 and 4')
+def validate_cores(n):
+    """Raise ValueError unless *n* is an int in the inclusive range 1-48."""
+    # The whole "is a valid int in range" test must be negated as a unit;
+    # negating only the isinstance() call lets every int through.
+    if not (isinstance(n, int) and 1 <= n <= 48):
+        raise ValueError('Cores must be an integer between 1 and 48')
+# Need to validate that the output directory exists. This will not create a directory that does not already exist.
+def validate_output_directory(outdir):
+    """Return *outdir* as a resolved Path; raise ValueError if it is not an existing directory."""
+    resolved = Path(outdir).resolve()
+    if resolved.is_dir():
+        return resolved
+    raise ValueError(f"{resolved} is not a valid output directory")
+def create_default_outfile(device):
+    """Build the default output base name ``list-policy_<id>`` for a device Path.
+
+    ``/data/user`` maps to the id ``data_user``; every other path uses its stem.
+    """
+    device_id = 'data_user' if device.match('/data/user') else device.stem
+    return f'list-policy_{device_id}'
+def main():
+    """Validate the command-line options and run ./submit-pol-job with them.
+
+    The policy file is ./policy-def/list-path-dirplus when --with-dirs is
+    given, otherwise ./policy-def/list-path-external. The process exits with
+    the exit status of submit-pol-job.
+    """
+    import shlex
+
+    args = parse_args()
+    args['device'] = validate_device(args['device'])
+    validate_nodes(args['nodes'])
+    validate_cores(args['cores'])
+    validate_mem(args['mem_per_cpu'])
+    validate_time(args['time'])
+    args['partition'] = validate_partition(args['partition'])
+    args['outdir'] = validate_output_directory(args['outdir'])
+    if args['outfile'] is None:
+        args['outfile'] = create_default_outfile(args['device'])
+    if args['with_dirs']:
+        args['policy'] = './policy-def/list-path-dirplus'
+    else:
+        args['policy'] = './policy-def/list-path-external'
+    # Pass an argument vector rather than a shell string: --outfile is free
+    # text from the user and must never be interpreted by a shell.
+    cmd = [
+        './submit-pol-job',
+        '-o', str(args['outdir']),
+        '-f', str(args['outfile']),
+        '-P', args['policy'],
+        '-N', str(args['nodes']),
+        '-c', str(args['cores']),
+        '-p', args['partition'],
+        '-t', args['time'],
+        '-m', args['mem_per_cpu'],
+        str(args['device']),
+    ]
+    print(f"Command: {' '.join(shlex.quote(part) for part in cmd)}")
+    result = subprocess.run(cmd)
+    # Propagate the wrapper's exit status so callers can detect a failed submission.
+    raise SystemExit(result.returncode)
+if __name__ == '__main__':
+    main()
\ No newline at end of file
--- a/src/run-policy/submit-pol-job
+++ b/src/run-policy/submit-pol-job
+#!/bin/bash
+set -euxo pipefail
+############################################################
+# Default Values                                           #
+############################################################
+# Each value below is used unless the matching command-line option overrides it.
+nodes=1
+cores=16
+mem_per_cpu="8G"
+time="24:00:00"
+partition="amd-hdr100,medium"
+outdir="/data/rc/gpfs-policy/data"
+policy="./policy-def/list-path-external"
+# An empty outfile is passed through unchanged as OUTFILE to run-mmpol.sh.
+outfile=""
+############################################################
+# Help                                                     #
+############################################################
+# Print a one-screen synopsis to stderr and exit with status 1.
+usage()
+{
+>&2 cat << EOF
+Usage: $0 [ -h ] [ -o | --outdir ] [ -f | --outfile ] [ -P | --policy ]
+          [ -N | --nodes ] [ -c | --cores ] [ -p | --partition ]
+          [ -t | --time ] [ -m | --mem-per-cpu ]
+          device
+EOF
+exit 1
+}
+# Print the full option reference to stderr and exit with status 0.
+help()
+{
+>&2 cat << EOF
+Wraps the run-mmpol.sh script for applying a policy file. Must be run directly
+as root or via the run-submit-pol-job.py script. The default policy file is
+./policy-def/list-path-external
+Usage: $0 [ -h ] [ -o | --outdir ] [ -f | --outfile ] [ -P | --policy ]
+          [ -N | --nodes ] [ -c | --cores ] [ -p | --partition ]
+          [ -t | --time ] [ -m | --mem-per-cpu ]
+          device
+options:
+    -h|--help           Print this Help.
+Required:
+    device              GPFS fileset/directory to apply the policy to. Can be
+                            specified as either the name of the fileset or the
+                            full path to the directory
+                            (Examples: scratch, /data/user/[username])
+Path:
+    -o|--outdir         Parent directory to save policy output to
+                            (default: /data/rc/gpfs-policy/data)
+    -f|--outfile        Name of the policy output file
+Policy Options:
+    -P|--policy         Path to policy file to apply to the given GPFS device
+                            (default: ./policy-def/list-path-external)
+sbatch options:
+    -N|--nodes          Number of nodes to run the job on (default: 1)
+    -c|--cores          Number of cores (default: 16)
+    -p|--partition      Partition to submit tasks to
+                            (default: amd-hdr100,medium)
+    -t|--time           Max walltime (default: 24:00:00)
+    -m|--mem-per-cpu    RAM per core (default: 8G)
+EOF
+exit 0
+}
+# Normalize the command line with getopt(1). The assignment is tested directly
+# in the 'if' so that, under 'set -e', a parse failure reaches usage instead of
+# aborting the script before the status can be inspected. The long option is
+# declared as mem-per-cpu so it matches the case arm below.
+if ! args=$(getopt -a -o ho:f:P:N:c:p:t:m: --long help,outdir:,outfile:,policy:,nodes:,cores:,partition:,time:,mem-per-cpu: -- "$@"); then
+  usage
+fi
+# Quote the expansion so getopt's own quoting is re-parsed by eval intact
+# (no word splitting or globbing of option values).
+eval set -- "${args}"
+while :
+do
+  case $1 in
+    -h | --help)        help            ;;
+    -o | --outdir)      outdir=$2       ; shift 2 ;;
+    -f | --outfile)     outfile=$2      ; shift 2 ;;
+    -P | --policy)      policy=$2       ; shift 2 ;;
+    -N | --nodes)       nodes=$2        ; shift 2 ;;
+    -c | --cores)       cores=$2        ; shift 2 ;;
+    -p | --partition)   partition=$2    ; shift 2 ;;
+    -t | --time)        time=$2         ; shift 2 ;;
+    -m | --mem-per-cpu) mem_per_cpu=$2  ; shift 2 ;;
+    --) shift; break ;;
+    *) >&2 echo "Unsupported option: $1"
+       usage ;;
+  esac
+done
+# The device is the single required positional argument.
+if [[ $# -eq 0 ]]; then
+  usage
+fi
+device="$1"
+# Reject an explicitly empty device argument (e.g. '').
+if [[ -z "$device" ]]; then
+    >&2 echo "Error: Specify either the name of a fileset or a directory path"
+    usage
+fi
+# Slurm log name; %A is left for sbatch to substitute (see sbatch filename patterns).
+slurm_out="out/pol-%A-$(basename "${policy}")-$(basename "${device}").out"
+# DIR, POLICYFILE, FILESYSTEM and OUTFILE are placed in sbatch's environment
+# for run-mmpol.sh. All expansions are quoted so values containing spaces or
+# glob characters reach sbatch as single arguments.
+DIR="$outdir" POLICYFILE="$policy" FILESYSTEM="$device" OUTFILE="$outfile" \
+sbatch \
+   -N "$nodes" \
+   -c "$cores" \
+   -t "$time" \
+   --mem-per-cpu="$mem_per_cpu" \
+   -p "$partition" \
+   -o "${slurm_out}" \
+   ./run-mmpol.sh
--- a/submit-pol-job
+++ b/submit-pol-job
-#!/bin/bash
-# schedule a policy run on gpfs
-outdir=$1
-policy=$2
-nodes=$3
-cores=$4
-ram=$5
-partition=$6
-filesystem=${7:-scratch}
-time=${8:-60}
-outfile=${9}
-DIR=$outdir POLICYFILE=$policy FILESYSTEM=${filesystem} OUTFILE=${outfile} && \
-DIR=$DIR POLICYFILE=$POLICYFILE FILESYSTEM=${FILESYSTEM} OUTFILE=${OUTFILE} \
-sbatch \
-   -N $nodes \
-   -c $cores \
-   -t $time \
-   --mem-per-cpu=$ram \
-   -p $partition \
-   ./run-mmpol.sh