Skip to content
Snippets Groups Projects
Commit cbc91e4c authored by Matthew K Defenderfer's avatar Matthew K Defenderfer
Browse files

convert to running from SIF container instead of bare metal conda env. add...

convert to running from SIF container instead of bare metal conda env. add option to specify container
parent 59515b7a
No related branches found
No related tags found
1 merge request!8Automate conversion of GPFS policy outputs to parquet without Jupyter
......@@ -11,6 +11,8 @@ mem="16G"
time="02:00:00"
partition="amd-hdr100"
outdir=""
sif=""
default_sif_image="daskdev/dask:2024.8.0-py3.12"
############################################################
# Help #
......@@ -28,7 +30,7 @@ help()
# Display Help
echo "Submits an array job to convert parts of a GPFS log to parquet format"
echo
echo "Syntax: $0 [ -h ] [ -o | --outdir ] [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ] <gpfs_logdir>"
echo "Syntax: $0 [ -h ] [ -s | --sif ] [ -o | --outdir ] [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ] <gpfs_logdir>"
echo "options:"
echo "-h|--help Print this Help."
echo
......@@ -36,6 +38,7 @@ echo "Required:"
echo " gpfs_log_dir Directory containing GPFS log outputs"
echo
echo "Path:"
echo " -s|--sif Path to SIF containing dask for processing"
echo " -o|--outdir Directory to save parquet files to"
echo
echo "sbatch options:"
......@@ -47,7 +50,7 @@ echo
exit 1
}
args=$(getopt -a -o ho:n:p:t:m: --long help,outdir:,ntasks:,partition:,time:,mem: -- "$@")
args=$(getopt -a -o hs:o:n:p:t:m: --long help,sif:,outdir:,ntasks:,partition:,time:,mem: -- "$@")
if [[ $? -gt 0 ]]; then
usage
fi
......@@ -58,12 +61,12 @@ while :
do
case $1 in
-h | --help) help ;;
-s | --sif) sif=$2 ; shift 2 ;;
-o | --outdir) outdir=$2 ; shift 2 ;;
-n | --ntasks) ntasks=$2 ; shift 2 ;;
-p | --partition) partition=$2 ; shift 2 ;;
-t | --time) time=$2 ; shift 2 ;;
-m | --mem) mem=$2 ; shift 2 ;;
# -- means the end of the arguments; drop this, and break out of the while loop
--) shift; break ;;
*) >&2 echo Unsupported option: $1
usage ;;
......@@ -82,15 +85,26 @@ if [[ -z "$gpfs_logdir" ]]; then
exit 1
fi
>&2 echo "output dir: ${outdir}"
>&2 echo "GPFS logs: ${gpfs_logdir}"
>&2 echo "ntasks: ${ntasks}"
>&2 echo "partition: ${partition}"
>&2 echo "time: ${time}"
>&2 echo "mem: ${mem}"
# Resolve which SIF image the array tasks will run in.
#   * no -s/--sif given: use ./dask.sif, pulling ${default_sif_image} from
#     Docker Hub first when that file is not present yet
#   * -s/--sif given: the file must already exist, otherwise abort
# After this block ${sif} is always a non-empty path to an existing file.
if [[ -z "${sif}" ]]; then
    sif="dask.sif"
    if [[ ! -f "${sif}" ]]; then
        echo "No SIF set, downloading ${default_sif_image} as dask.sif"
        # Abort instead of submitting jobs that would all fail on a missing image
        if ! singularity pull "${sif}" "docker://${default_sif_image}"; then
            >&2 echo "Failed to pull docker://${default_sif_image}"
            exit 1
        fi
    fi
elif [[ ! -f "${sif}" ]]; then
    >&2 echo "Singularity file does not exist. Please pull the image first"
    exit 1
fi
# Number of GPFS list-* files; sizes the Slurm job array (one task per file).
# The directory is quoted so paths containing spaces or glob characters are
# not word-split; the glob itself stays outside the quotes so it still expands.
nlogs=$(ls "${gpfs_logdir}"/list-* | wc -l)
>&2 echo "sif: ${sif}"
>&2 echo "output dir: ${outdir}"
>&2 echo "GPFS logs: ${gpfs_logdir}"
>&2 echo "ntasks: ${ntasks}"
>&2 echo "partition: ${partition}"
>&2 echo "time: ${time}"
>&2 echo "mem: ${mem}"
>&2 echo "singularity command: singularity exec --bind /data ${sif} python3 convert-to-parquet.py -o ${outdir} -f \${log}"
mkdir -p out
mkdir -p err
......@@ -98,7 +112,7 @@ mkdir -p err
# Create Array Job Script #
############################################################
cat > convert-parquet-array.sh <<EOF
{ cat | sbatch; } << EOF
#!/bin/bash
#
#SBATCH --job-name=parquet-list-%a
......@@ -110,12 +124,9 @@ cat > convert-parquet-array.sh <<EOF
#SBATCH --error=err/%A_%a.err
#SBATCH --array=1-${nlogs}
module load Anaconda3
conda activate gpfs
log=\$(ls ${gpfs_logdir}/list-* | awk "NR==\${SLURM_ARRAY_TASK_ID} { print \$1 }")
python convert-to-parquet.py -o ${outdir} -f \${log}
# Run the converter inside the container selected at submit time (falls back to ./dask.sif)
singularity exec --bind /data ${sif:-dask.sif} python3 convert-to-parquet.py -o ${outdir} -f \${log}
EOF
exit 0
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment