diff --git a/convert-to-parquet/run-convert-to-parquet.sh b/convert-to-parquet/run-convert-to-parquet.sh index 7ce57af5a8c05b2e00ad87740cdd347e89434217..d08a2db3c477a3d8514ff620a857ce876dd015bf 100755 --- a/convert-to-parquet/run-convert-to-parquet.sh +++ b/convert-to-parquet/run-convert-to-parquet.sh @@ -11,8 +11,7 @@ mem="16G" time="02:00:00" partition="amd-hdr100" outdir="" -sif="" -default_sif_image="daskdev/dask:2024.8.0-py3.12" +sif="gitlab.rc.uab.edu:4567/mdefende/gpfs-policy:latest" ############################################################ # Help # @@ -20,37 +19,36 @@ default_sif_image="daskdev/dask:2024.8.0-py3.12" usage() { >&2 cat << EOF -Usage: $0 [ -h ] [ -s | --sif ] [ -o | --outdir ] [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ] <gpfs_logdir>" +Usage: $0 [ -h ] [ -o | --outdir ] [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ] gpfs_logdir" EOF exit 1 } help() { -# Display Help -echo "Submits an array job to convert parts of a GPFS log to parquet format" -echo -echo "Syntax: $0 [ -h ] [ -s | --sif ] [ -o | --outdir ] [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ] <gpfs_logdir>" -echo "options:" -echo "-h|--help Print this Help." -echo -echo "Required:" -echo " gpfs_log_dir Directory containing GPFS log outputs" -echo -echo "Path:" -echo " -s|--sif Path to SIF containing dask for processing" -echo " -o|--outdir Directory to save parquet files to" -echo -echo "sbatch options:" -echo " -n|--ntasks Number of tasks for each array index (default: 1)" -echo " -p|--partition Partition to submit tasks to (default: amd-hdr100)" -echo " -t|--time Max walltime (default: 02:00:00)" -echo " -m|--mem Memory for each task (default: 16G)" -echo -exit 1 +>&2 cat << EOF +Submits an array job to convert parts of a GPFS log to parquet format +Syntax: $0 [ -h ] [ -o | --outdir ] [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ] gpfs_logdir + +options: + -h|--help Print this Help. + +Required: + gpfs_log_dir Directory containing GPFS log outputs + +Path: + -o|--outdir Directory to save parquet files to + +sbatch options: + -n|--ntasks Number of tasks for each array index (default: 1) + -p|--partition Partition to submit tasks to (default: amd-hdr100) + -t|--time Max walltime (default: 02:00:00) + -m|--mem Memory for each task (default: 16G) +EOF +exit 0 } -args=$(getopt -a -o hs:o:n:p:t:m: --long help,sif:,outdir:,ntasks:,partition:,time:,mem: -- "$@") +args=$(getopt -a -o ho:n:p:t:m: --long help,outdir:,ntasks:,partition:,time:,mem: -- "$@") if [[ $? -gt 0 ]]; then usage fi @@ -61,7 +59,6 @@ while : do case $1 in -h | --help) help ;; - -s | --sif) sif=$2 ; shift 2 ;; -o | --outdir) outdir=$2 ; shift 2 ;; -n | --ntasks) ntasks=$2 ; shift 2 ;; -p | --partition) partition=$2 ; shift 2 ;; @@ -85,25 +82,31 @@ if [[ -z "$gpfs_logdir" ]]; then exit 1 fi -if [[ -z "${sif}" && ! -f "dask.sif" ]]; then - echo "No SIF set, downloading ${default_sif_image} as dask.sif" - singularity pull dask.sif docker://${default_sif_image} - sif="dask.sif" -elif [[ -f "${sif}" ]]; then - echo "Singualrity file does not exist. Please pull the image first" - exit 1 +# If outdir not set, set to ${gpfs_logdir}/parquet +if [[ -z "$outdir" ]]; then + outdir="${gpfs_logdir}/parquet" fi +singularity pull --force gpfs.sif docker://${sif} + nlogs=$(ls ${gpfs_logdir}/list-* | wc -l) ->&2 echo "sif: ${sif}" ->&2 echo "output dir: ${outdir}" ->&2 echo "GPFS logs: ${gpfs_logdir}" ->&2 echo "ntasks: ${ntasks}" ->&2 echo "partition: ${partition}" ->&2 echo "time: ${time}" ->&2 echo "mem: ${mem}" ->&2 echo "singularity command: singularity exec --bind /data ${sif} python3 convert-to-parquet.py -o ${outdir} -f \${log}" +cmd="singularity exec --bind /data,/scratch gpfs.sif python3 convert-to-parquet.py -o ${outdir} -f \${log}" + +>&2 cat << EOF +-------------------------------------------------------------------------------- +sif: ${sif} +output dir: ${outdir} +GPFS logs: ${gpfs_logdir} + +ntasks: ${ntasks} +partition: ${partition} +time: ${time} +mem: ${mem} + +command: ${cmd} +-------------------------------------------------------------------------------- +EOF mkdir -p out mkdir -p err @@ -126,7 +129,7 @@ mkdir -p err log=\$(ls ${gpfs_logdir}/list-* | awk "NR==\${SLURM_ARRAY_TASK_ID} { print \$1 }") -singularity exec --bind /data dask.sif python3 convert-to-parquet.py -o ${outdir} -f \${log} +${cmd} EOF exit 0 \ No newline at end of file