Review notes (free text ahead of the first "diff --git" header):
- Line breaks were restored from a whitespace-collapsed copy of this patch. The
  placement of blank context lines and the indentation of removed lines were
  inferred from the hunk line counts -- verify them against the target files.
- 20-convert-to-hive.sh: the listcmd default is now assigned after device=...,
  because the default expands ${device} at the moment listcmd is assigned.
- Expansions in the added guard and command lines are double-quoted.
- New-side hunk line counts were updated to match; the post-image hashes on
  the index lines were NOT recomputed.

diff --git a/example-job-scripts/10-convert-logs.sh b/example-job-scripts/10-convert-logs.sh
old mode 100644
new mode 100755
index 7ce3c09f8e21cc67fd3c99121c2ca19977335830..89c0a0c8d0c5e562903a28d3cd150eb5dc089709
--- a/example-job-scripts/10-convert-logs.sh
+++ b/example-job-scripts/10-convert-logs.sh
@@ -13,7 +13,18 @@
 module load Anaconda3
 conda activate gpfs
 
-logs=($(find /data/rc/gpfs-policy/data -path "*/list-policy_data-project_list-path-external_slurm-*/chunks"))
+# listcmd env var sets the command to enumerate datasets to process
+# supports passing args during sbatch, e.g. listcmd="cat split-list" sbatch <thisscript>
+# note: maxdepth speeds execution of find by avoiding deep dirs
+# note: $listcmd is expanded unquoted on purpose so it splits into a command and its args
+listcmd=${listcmd:-find /data/rc/gpfs-policy/data -maxdepth 2 -path "*/list-policy_data-project_list-path-external_slurm-*/chunks"}
+
+logs=($($listcmd))
 log=${logs[${SLURM_ARRAY_TASK_ID}]}
 
-convert-to-parquet --batch --no-clobber --partition=amd-hdr100,express,intel-dcb ${log}
\ No newline at end of file
+
+# for lazy submit. only do work if there is work to do
+if [ "${SLURM_ARRAY_TASK_ID}" -lt "${#logs[@]}" ]
+then
+  convert-to-parquet --batch --no-clobber --partition=amd-hdr100,express,intel-dcb "${log}"
+fi
diff --git a/example-job-scripts/20-convert-to-hive.sh b/example-job-scripts/20-convert-to-hive.sh
old mode 100644
new mode 100755
index 1d6333ddcb39e7f9dc06745002f5d2a964d192da..a83ae39eb23bd76034722ca806b3647a31e43e46
--- a/example-job-scripts/20-convert-to-hive.sh
+++ b/example-job-scripts/20-convert-to-hive.sh
@@ -14,14 +14,25 @@
 module load Anaconda3
 conda activate gpfs
 
+set -x
 device="data-project" # data-project, data-user, or scratch
 
-parquets=($(find /data/rc/gpfs-policy/data -path "*/list-policy_${device}_list-path-external_slurm-*2025-01-21*/parquet"))
+# listcmd env var sets the command to enumerate datasets to process
+# supports passing args during sbatch, e.g. listcmd="cat split-list" sbatch <thisscript>
+# note: maxdepth speeds execution of find by avoiding deep dirs
+# note: the default is built after device is set because it expands ${device} right here
+listcmd=${listcmd:-find /data/rc/gpfs-policy/data -maxdepth 2 -path "*/list-policy_${device}_list-path-external_slurm-*2025-01-21*/parquet"}
+
+parquets=($($listcmd))
 pq=${parquets[${SLURM_ARRAY_TASK_ID}]}
 
-convert-to-hive --batch \
-    --reservation=rc-gpfs \
-    --partition=amperenodes-reserve \
-    --mem=120G \
-    ${pq} \
-    /data/rc/gpfs-policy/data/gpfs-hive/${device}
+# for lazy submit. only do work if there is work to do
+if [ "${SLURM_ARRAY_TASK_ID}" -lt "${#parquets[@]}" ]
+then
+  convert-to-hive --batch \
+    --reservation=rc-gpfs \
+    --partition=amperenodes-reserve \
+    --mem=120G \
+    "${pq}" \
+    "/data/rc/gpfs-policy/data/gpfs-hive/${device}"
+fi