From cbc91e4cb4dd4cbfcbbf2ed287380ca648adbd8b Mon Sep 17 00:00:00 2001
From: Matthew K Defenderfer <mdefende@uab.edu>
Date: Thu, 15 Aug 2024 12:42:35 -0500
Subject: [PATCH] convert to running from SIF container instead of bare metal
 conda env. add option to specify container

---
 convert-to-parquet/run-convert-to-parquet.sh | 39 +++++++++++++-------
 1 file changed, 25 insertions(+), 14 deletions(-)

diff --git a/convert-to-parquet/run-convert-to-parquet.sh b/convert-to-parquet/run-convert-to-parquet.sh
index 58bf3d8..4055d10 100755
--- a/convert-to-parquet/run-convert-to-parquet.sh
+++ b/convert-to-parquet/run-convert-to-parquet.sh
@@ -11,6 +11,8 @@ mem="16G"
 time="02:00:00"
 partition="amd-hdr100"
 outdir=""
+sif=""  # path to the SIF image; set via -s|--sif, empty means fall back to ./dask.sif
+default_sif_image="daskdev/dask:2024.8.0-py3.12"  # docker image pulled as dask.sif when no SIF is given
 
 ############################################################
 # Help                                                     #
@@ -28,7 +30,7 @@ help()
 # Display Help
 echo "Submits an array job to convert parts of a GPFS log to parquet format"
 echo
-echo "Syntax: $0 [ -h ] [ -o | --outdir ] [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ] <gpfs_logdir>"
+echo "Syntax: $0 [ -h ] [ -s | --sif ] [ -o | --outdir ] [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ] <gpfs_logdir>"
 echo "options:"
 echo "-h|--help     Print this Help."
 echo 
@@ -36,6 +38,7 @@ echo "Required:"
 echo "   gpfs_log_dir           Directory containing GPFS log outputs"
 echo
 echo "Path:"
+echo "   -s|--sif         Path to SIF containing dask for processing"
 echo "   -o|--outdir      Directory to save parquet files to"
 echo 
 echo "sbatch options:"
@@ -47,7 +50,7 @@ echo
 exit 1
 }
 
-args=$(getopt -a -o ho:n:p:t:m: --long help,outdir:,ntasks:,partition:,time:,mem: -- "$@")
+args=$(getopt -a -o hs:o:n:p:t:m: --long help,sif:,outdir:,ntasks:,partition:,time:,mem: -- "$@")
 if [[ $? -gt 0 ]]; then
   usage
 fi
@@ -58,12 +61,12 @@ while :
 do
   case $1 in
     -h | --help)      help           ;;
+    -s | --sif)       sif=$2         ; shift 2 ;;
     -o | --outdir)    outdir=$2      ; shift 2 ;;
     -n | --ntasks)    ntasks=$2      ; shift 2 ;;
     -p | --partition) partition=$2   ; shift 2 ;;
     -t | --time)      time=$2        ; shift 2 ;;
     -m | --mem)       mem=$2         ; shift 2 ;;
-    # -- means the end of the arguments; drop this, and break out of the while loop
     --) shift; break ;;
     *) >&2 echo Unsupported option: $1
        usage ;;
@@ -82,15 +85,26 @@ if [[ -z "$gpfs_logdir" ]]; then
     exit 1
 fi
 
->&2 echo "output dir: ${outdir}"
->&2 echo "GPFS logs:  ${gpfs_logdir}"
->&2 echo "ntasks:     ${ntasks}"
->&2 echo "partition:  ${partition}"
->&2 echo "time:       ${time}"
->&2 echo "mem:        ${mem}"
+if [[ -z "${sif}" && ! -f "dask.sif" ]]; then  # no --sif given and no cached default image yet
+    echo "No SIF set, downloading ${default_sif_image} as dask.sif"
+    singularity pull dask.sif "docker://${default_sif_image}" || exit 1
+fi
+if [[ ! -f "${sif:=dask.sif}" ]]; then  # := makes sif default to dask.sif when --sif was omitted
+    >&2 echo "Singularity file ${sif} does not exist. Please pull the image first"
+    exit 1
+fi
 
 nlogs=$(ls ${gpfs_logdir}/list-* | wc -l)
 
+>&2 echo "sif:                 ${sif}"
+>&2 echo "output dir:          ${outdir}"
+>&2 echo "GPFS logs:           ${gpfs_logdir}"
+>&2 echo "ntasks:              ${ntasks}"
+>&2 echo "partition:           ${partition}"
+>&2 echo "time:                ${time}"
+>&2 echo "mem:                 ${mem}"
+>&2 echo "singularity command: singularity exec --bind /data ${sif} python3 convert-to-parquet.py -o ${outdir} -f \${log}"
+
 mkdir -p out
 mkdir -p err
 
@@ -98,7 +112,7 @@ mkdir -p err
 # Create Array Job Script                                  #
 ############################################################
 
-cat > convert-parquet-array.sh <<EOF
+sbatch << EOF
 #!/bin/bash
 #
 #SBATCH --job-name=parquet-list-%a
@@ -110,12 +124,9 @@ cat > convert-parquet-array.sh <<EOF
 #SBATCH --error=err/%A_%a.err
 #SBATCH --array=1-${nlogs}
 
-module load Anaconda3
-conda activate gpfs
-
 log=\$(ls ${gpfs_logdir}/list-* | awk "NR==\${SLURM_ARRAY_TASK_ID} { print \$1 }")
 
-python convert-to-parquet.py -o ${outdir} -f \${log}
+singularity exec --bind /data "${sif:-dask.sif}" python3 convert-to-parquet.py -o ${outdir} -f \${log}
 EOF
 
 exit 0
\ No newline at end of file
-- 
GitLab