Merge branch 'main' into 'prod'

Release v0.1.1-2. See merge request !24.

Merge branch 'main' into 'prod'
Release v0.1.1-2. See merge request !24.
b4079acc · Matthew K Defenderfer · bbf19903 · 18b49a7c · b4079acc · b4079acc
Commit b4079acc authored 6 months ago by Matthew K Defenderfer
--- a/src/run-policy/run-mmpol.sh
+++ b/src/run-policy/run-mmpol.sh
@@ -4,43 +4,143 @@ set -euxo pipefail

 # run an mmapply policy across the cluster via slurm

-# gather info to map mmapplypolicy to runtime configuration
-# arguments passed via job env and runtime context
+############################################################
+# Default Values                                           #
+############################################################

-filesystem=${FILESYSTEM:-scratch}
-policyfile=$POLICYFILE
-tmpglobal=$DIR/slurm-tmp-${SLURM_JOBID}
-tmpscratch=$DIR/slurm-tmp-${SLURM_JOBID}
-mkdir -p $tmpglobal
+outdir="/data/rc/gpfs-policy/data"
+policy_file="./policy-def/list-path-external"
+output_log_prefix=""
+dry_run=""

-nodes=`scontrol show hostnames "${SLURM_JOB_NODELIST}" | tr '\n' ',' | sed -e 's/,$//'`
+############################################################
+# Help                                                     #
+############################################################
+
+# Print a one-line usage summary to stderr and exit with status 1.
+usage() {
+>&2 cat << EOF
+Usage: $0 [ -h ] [ --dry-run ] [ -o | --outdir ] [ -f | --output-prefix ] [ -P | --policy-file ] device
+EOF
+exit 1
+}
+
+# Print the full help text to stderr and exit with status 0.
+help() {
+>&2 cat << EOF
+Runs mmapplypolicy on the specified device/fileset. The policy file dictates the actions performed including list, delete, add, etc. This is most often called by the submit-pol-job wrapper instead of invoked directly.
+
+Usage: $0 [ -h ] [ --dry-run ] [ -o | --outdir ] [ -f | --output-prefix ] [ -P | --policy-file ] device
+
+options:
+    -h|--help           Print this Help.
+    --dry-run           Do not run the policy command or any command that
+                            modifies files or directories, such as mkdir
+
+Required:
+    device              GPFS fileset/directory to apply the policy to. Can be
+                            specified as either the name of the fileset or the
+                            full path to the directory
+                            (Examples: scratch, /data/user/[username])
+
+Path:
+    -o|--outdir         Parent directory to save policy output to
+                            (default: /data/rc/gpfs-policy/data)
+    -f|--output-prefix  Prefix of the policy output file. Appended with a metadata string containing the policy name,
+                            job ID, and date
+
+Policy Options:
+    -P|--policy-file    Path to policy file to apply to the given GPFS device (default: ./policy-def/list-path-external)
+EOF
+exit 0
+}
+
+# Parse options with getopt. The assignment is tested directly because set -e
+# would otherwise abort on a getopt failure before a later $? check could run.
+if ! args=$(getopt -a -o ho:f:P: --long help,outdir:,output-prefix:,policy-file:,dry-run -- "$@"); then
+  usage
+fi
+
+eval set -- "${args}"
+
+while :
+do
+  case $1 in
+    -h | --help)            help                    ;;
+    -o | --outdir)          outdir=$2               ; shift 2 ;;
+    -f | --output-prefix)   output_log_prefix=$2    ; shift 2 ;;
+    -P | --policy-file)     policy_file=$2          ; shift 2 ;;
+    --dry-run)              dry_run=true            ; shift 1 ;;
+    --) shift; break ;;
+    *) >&2 echo Unsupported option: $1
+       usage ;;
+  esac
+done
+
+if [[ $# -eq 0 ]]; then
+  usage
+fi
+
+device="$1"
+
+# Ensure device is specified
+if [[ -z "${device}" ]]; then
+    echo "Error: Specify either the name of a fileset or a directory path"
+    usage
+fi
+
+# set default output_log_prefix if not specified in the arguments
+if [[ -z "${output_log_prefix}" ]]; then
+    modified_device=$(echo "${device}" | sed -e 's|^/||' -e 's|/$||' -e 's|/|-|g')
+    output_log_prefix="list-policy_${modified_device}"
+fi
+
+# create temporary working directory for list aggregation
+tmpglobal="${outdir}/slurm-tmp-${SLURM_JOBID}"
+tmpscratch="${outdir}/slurm-tmp-${SLURM_JOBID}"
+
+nodes=$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | tr '\n' ',' | sed -e 's/,$//')
 cores="${SLURM_CPUS_PER_TASK}"

-DATESTR=`date +'%Y-%m-%d-%H:%M:%S'`
+DATESTR=$(date +'%Y-%m-%dT%H:%M:%S')

-policy=`basename $policyfile`
+policy=$(basename ${policy_file})
 filetag="${policy}_slurm-${SLURM_JOBID}_${DATESTR}"

-cmd="mmapplypolicy ${filesystem} -I defer \
-  -P $policyfile \
-  -g $tmpglobal \
-  -s $tmpscratch \
-  -f ${DIR}/list-${SLURM_JOBID} \
-  -M FILEPATH=${filesystem} \
+cmd="mmapplypolicy ${device} -I defer \
+  -P ${policy_file} \
+  -g ${tmpglobal} \
+  -s ${tmpscratch} \
+  -f ${outdir}/list-${SLURM_JOBID} \
+  -M FILEPATH=${device} \
  -M JOBID=${SLURM_JOBID} \
-  -M LIST_OUTPUT_FILE=${OUTFILE:-/tmp/gpfs-list-policy}
+  -M LIST_OUTPUT_FILE=${output_log_prefix} \
  -N ${nodes} -n ${cores} -m ${cores}"

-# report final command in job log
-echo $cmd
+if [[ ! ${dry_run} ]]; then
+    mkdir -p ${tmpglobal}
+    
+    # run policy command
+    ${cmd}
+
+    log_name="${output_log_prefix}_${filetag}"
+    log_dir="${outdir}/${log_name}"

-# run policy command
-$cmd
+    mkdir -p ${log_dir}/raw
+    chmod 2770 ${log_dir}

-# tag output file with run metadata
-outfile=`ls -t $tmpglobal | head -1`
-if [[ "$outfile" != "" ]]
-then
-   mv -n $tmpglobal/$outfile $tmpglobal/../${outfile}_$filetag
+    # tag output file with run metadata; -print -quit stops find at the first match
+    raw_log_file=$(find "${outdir}" -maxdepth 1 -name "list-${SLURM_JOBID}*" -type f -print -quit)
+    if [[ -n "${raw_log_file}" ]]; then
+        mv -n -- "${raw_log_file}" "${log_dir}/raw/${log_name}"
+        gzip -- "${log_dir}/raw/${log_name}"
+
+        chmod 440 "${log_dir}/raw/${log_name}.gz"
+        chmod 1550 "${log_dir}/raw"
+    fi
+
+    chown -R "${USER}:atlab" "${log_dir}"
+    
+    rmdir "${tmpglobal}"
 fi
-rmdir $tmpglobal
+
--- a/src/run-policy/run-submit-pol-job.py
+++ b/src/run-policy/run-submit-pol-job.py
@@ -36,6 +36,10 @@ def parse_args():
                        help='Time limit for job formatted as [D-]HH:MM:SS')
    sbatch.add_argument('-m','--mem-per-cpu',type=str,default='8G',
                        help='Amount of RAM to allocate per core')
+
+    parser.add_argument('--dry-run', action='store_true',
+                        help="Do not submit any jobs, run any policies, or create or remove any files or directories. "
+                             "Used for testing")
    
    parser.add_argument('device',type=str,
                        help="GPFS fileset/directory apply the policy to. Can be specified as either the name of the"
@@ -106,11 +110,8 @@ def validate_output_directory(outdir):
    return p

 def create_default_log_prefix(device):
-    if device.match('/data/user'):
-        log_prefix = 'list-policy_data_user'
-    else:
-        log_prefix = f'list-policy_{device.stem}'
-    return log_prefix
+    mod_device = str(device).strip('/').replace('/','-')
+    return f"list-policy_{mod_device}"

 def main():
    args = parse_args()
@@ -135,7 +136,10 @@ def main():
    else:
        args['policy'] = './policy-def/list-path-external'

-    cmd = "./submit-pol-job -o {outdir} -f {log_prefix} -P {policy} -N {nodes} -c {cores} -p {partition} -t {time} -m {mem_per_cpu} {device}".format(**args)
+    # Build the command from one template; --dry-run goes ahead of the positional device only when requested
+    dry_run_flag = "--dry-run " if args['dry_run'] else ""
+    cmd = ("./submit-pol-job -o {outdir} -f {log_prefix} -P {policy} -N {nodes} -c {cores} -p {partition} "
+           "-t {time} -m {mem_per_cpu} {dry_run_flag}{device}").format(dry_run_flag=dry_run_flag, **args)
    
    print(f"Command: {cmd}")
    subprocess.run(cmd,shell=True)

--- a/src/run-policy/submit-pol-job
+++ b/src/run-policy/submit-pol-job
@@ -14,6 +14,7 @@ partition="amd-hdr100,medium"
 outdir="/data/rc/gpfs-policy/data"
 policy="./policy-def/list-path-external"
 outfile=""
+dry_run=""

 ############################################################
 # Help                                                     #
@@ -23,7 +24,7 @@ usage()
 >&2 cat << EOF
 Usage: $0 [ -h ] [ -o | --outdir ] [ -f | --outfile ] [ --with-dirs ] 
          [ -N | --nodes ] [ -c | --cores ] [ -p | --partition] 
-          [ -t | --time ] [ -m | --mem-per-cpu ]
+          [ -t | --time ] [ -m | --mem-per-cpu ] [ --dry-run ]
          device
 EOF
 exit 1
@@ -38,11 +39,13 @@ as root or via the run-submit-pol-job.py script. The default policy file is

 Usage: $0 [ -h ] [ -o | --outdir ] [ -f | --outfile ] [ -P | --policy ] 
          [ -N | --nodes ] [ -c | --cores ] [ -p | --partition ] 
-          [ -t | --time ] [ -m | --mem ]
+          [ -t | --time ] [ -m | --mem ] [ --dry-run ]
          device

 options:
-    -h|--help       Print this Help.
+    -h|--help           Print this Help.
+    --dry-run           Do not submit a Slurm job running the policy. Instead, pass --dry-run to run-mmpol.sh and call 
+                            it normally to just print the output to STDOUT

 Required:
    device              GPFS fileset/directory apply the policy to. Can be 
@@ -69,7 +72,8 @@ EOF
 exit 0
 }

-args=$(getopt -a -o ho:f:P:N:c:p:t:m: --long help,outdir:,outfile:,policy:,nodes:,cores:,partition:,time:,mem: -- "$@")
+args=$(getopt -a -o ho:f:P:N:c:p:t:m: \
+                 --long help,outdir:,outfile:,policy:,nodes:,cores:,partition:,time:,mem:,dry-run -- "$@")

 if [[ $? -gt 0 ]]; then
  usage
@@ -89,6 +93,7 @@ do
    -p | --partition)   partition=$2    ; shift 2 ;;
    -t | --time)        time=$2         ; shift 2 ;;
    -m | --mem-per-cpu) mem_per_cpu=$2  ; shift 2 ;;
+    --dry-run)          dry_run=true    ; shift 1 ;;
    --) shift; break ;;
    *) >&2 echo Unsupported option: $1
       usage ;;
@@ -108,15 +113,23 @@ if [[ -z "$device" ]]; then
 fi

 slurm_out="out/pol-%A-$(basename ${policy})-$(basename ${device}).out"
-mkdir -p out
-
-DIR=$outdir POLICYFILE=$policy FILESYSTEM=${device} OUTFILE=${outfile} && \
-DIR=$DIR POLICYFILE=$POLICYFILE FILESYSTEM=${FILESYSTEM} OUTFILE=${OUTFILE} \
-sbatch \
-   -N $nodes \
-   -c $cores \
-   -t $time \
-   --mem-per-cpu=$mem_per_cpu \
-   -p $partition \
-   -o ${slurm_out} \
-   ./run-mmpol.sh
+
+# Forward -f only when a prefix was given; an empty value would disappear
+# during word splitting and -f would consume "-P" as its argument instead
+run_mmpol_cmd_base="./run-mmpol.sh -o ${outdir} ${outfile:+-f ${outfile}} -P ${policy}"
+
+if [[ -z "${dry_run}" ]]; then
+    mkdir -p out
+    run_mmpol_cmd="${run_mmpol_cmd_base} ${device}"
+    sbatch \
+       -N "$nodes" \
+       -c "$cores" \
+       -t "$time" \
+       --mem-per-cpu="$mem_per_cpu" \
+       -p "$partition" \
+       -o "${slurm_out}" \
+       --wrap "${run_mmpol_cmd}"
+else
+    run_mmpol_cmd="${run_mmpol_cmd_base} --dry-run ${device}"
+    ${run_mmpol_cmd}
+fi