From cbc91e4cb4dd4cbfcbbf2ed287380ca648adbd8b Mon Sep 17 00:00:00 2001
From: Matthew K Defenderfer <mdefende@uab.edu>
Date: Thu, 15 Aug 2024 12:42:35 -0500
Subject: [PATCH] convert to running from SIF container instead of bare metal
 conda env. add option to specify container

---
 convert-to-parquet/run-convert-to-parquet.sh | 39 +++++++++++++-------
 1 file changed, 25 insertions(+), 14 deletions(-)

diff --git a/convert-to-parquet/run-convert-to-parquet.sh b/convert-to-parquet/run-convert-to-parquet.sh
index 58bf3d8..4055d10 100755
--- a/convert-to-parquet/run-convert-to-parquet.sh
+++ b/convert-to-parquet/run-convert-to-parquet.sh
@@ -11,6 +11,8 @@ mem="16G"
 time="02:00:00"
 partition="amd-hdr100"
 outdir=""
+sif=""
+default_sif_image="daskdev/dask:2024.8.0-py3.12"
 ############################################################
 # Help                                                     #
 ############################################################
@@ -28,7 +30,7 @@ help()
 # Display Help
 echo "Submits an array job to convert parts of a GPFS log to parquet format"
 echo
-echo "Syntax: $0 [ -h ] [ -o | --outdir ] [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ] <gpfs_logdir>"
+echo "Syntax: $0 [ -h ] [ -s | --sif ] [ -o | --outdir ] [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ] <gpfs_logdir>"
 echo "options:"
 echo "-h|--help           Print this Help."
 echo
@@ -36,6 +38,7 @@ echo "Required:"
 echo "  gpfs_log_dir      Directory containing GPFS log outputs"
 echo
 echo "Path:"
+echo "  -s|--sif          Path to SIF containing dask for processing"
 echo "  -o|--outdir       Directory to save parquet files to"
 echo
 echo "sbatch options:"
@@ -47,7 +50,7 @@ echo
 exit 1
 }
 
-args=$(getopt -a -o ho:n:p:t:m: --long help,outdir:,ntasks:,partition:,time:,mem: -- "$@")
+args=$(getopt -a -o hs:o:n:p:t:m: --long help,sif:,outdir:,ntasks:,partition:,time:,mem: -- "$@")
 if [[ $? -gt 0 ]]; then
   usage
 fi
@@ -58,12 +61,12 @@ while :
 do
   case $1 in
     -h | --help)         help         ;;
+    -s | --sif)          sif=$2       ; shift 2 ;;
     -o | --outdir)       outdir=$2    ; shift 2 ;;
     -n | --ntasks)       ntasks=$2    ; shift 2 ;;
     -p | --partition)    partition=$2 ; shift 2 ;;
     -t | --time)         time=$2      ; shift 2 ;;
     -m | --mem)          mem=$2       ; shift 2 ;;
-    # -- means the end of the arguments; drop this, and break out of the while loop
     --) shift; break ;;
     *) >&2 echo Unsupported option: $1
        usage ;;
@@ -82,15 +85,26 @@ if [[ -z "$gpfs_logdir" ]]; then
     exit 1
 fi
 
->&2 echo "output dir: ${outdir}"
->&2 echo "GPFS logs: ${gpfs_logdir}"
->&2 echo "ntasks: ${ntasks}"
->&2 echo "partition: ${partition}"
->&2 echo "time: ${time}"
->&2 echo "mem: ${mem}"
+if [[ -z "${sif}" ]]; then
+    echo "No SIF set, using dask.sif (pulling ${default_sif_image} if needed)"
+    [[ -f "dask.sif" ]] || singularity pull dask.sif docker://${default_sif_image}
+    sif="dask.sif"
+elif [[ ! -f "${sif}" ]]; then
+    echo "Singularity file does not exist. Please pull the image first"
+    exit 1
+fi
 
 nlogs=$(ls ${gpfs_logdir}/list-* | wc -l)
 
+>&2 echo "sif: ${sif}"
+>&2 echo "output dir: ${outdir}"
+>&2 echo "GPFS logs: ${gpfs_logdir}"
+>&2 echo "ntasks: ${ntasks}"
+>&2 echo "partition: ${partition}"
+>&2 echo "time: ${time}"
+>&2 echo "mem: ${mem}"
+>&2 echo "singularity command: singularity exec --bind /data ${sif} python3 convert-to-parquet.py -o ${outdir} -f \${log}"
+
 mkdir -p out
 mkdir -p err
 
@@ -98,7 +112,7 @@ mkdir -p err
 ############################################################
 # Create Array Job Script                                  #
 ############################################################
-cat > convert-parquet-array.sh <<EOF
+{ cat | sbatch; } << EOF
 #!/bin/bash
 #
 #SBATCH --job-name=parquet-list-%a
@@ -110,12 +124,9 @@ cat > convert-parquet-array.sh <<EOF
 #SBATCH --partition=${partition}
 #SBATCH --ntasks=${ntasks}
 #SBATCH --time=${time}
 #SBATCH --mem=${mem}
 #SBATCH --output=out/%A_%a.out
 #SBATCH --error=err/%A_%a.err
 #SBATCH --array=1-${nlogs}
 
-module load Anaconda3
-conda activate gpfs
-
 log=\$(ls ${gpfs_logdir}/list-* | awk "NR==\${SLURM_ARRAY_TASK_ID} { print \$1 }")
 
-python convert-to-parquet.py -o ${outdir} -f \${log}
+singularity exec --bind /data ${sif} python3 convert-to-parquet.py -o ${outdir} -f \${log}
 EOF
 
 exit 0
\ No newline at end of file
--
GitLab
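
A hypothetical invocation of the patched script, for illustration only (the paths and resource values below are examples, not taken from the patch):

    ./run-convert-to-parquet.sh --sif /data/containers/dask.sif \
        --outdir /data/gpfs-parquet \
        --ntasks 1 --mem 16G --time 02:00:00 --partition amd-hdr100 \
        /data/gpfs-logs

With no -s/--sif argument the script falls back to dask.sif in the working directory, pulling docker://daskdev/dask:2024.8.0-py3.12 if that file is missing. It then submits one array task per list-* file found in the given GPFS log directory, each task running convert-to-parquet.py inside the container via singularity exec.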