From 266b0c3a38e45db6fcaff2deb99c7031e7c24500 Mon Sep 17 00:00:00 2001
From: John-Paul Robinson <jpr@uab.edu>
Date: Mon, 17 Feb 2025 13:05:22 -0600
Subject: [PATCH] Add caller defined worklist and lazy array invocation

Lets caller control the list of directories to operate on without
having to edit the script.

Added array task protections for lazy callers who don't care if array
size aligns with work tasks.
---
 example-job-scripts/10-convert-logs.sh    | 14 ++++++++++++--
 example-job-scripts/20-convert-to-hive.sh | 23 ++++++++++++++++-------
 2 files changed, 28 insertions(+), 9 deletions(-)
 mode change 100644 => 100755 example-job-scripts/10-convert-logs.sh
 mode change 100644 => 100755 example-job-scripts/20-convert-to-hive.sh

diff --git a/example-job-scripts/10-convert-logs.sh b/example-job-scripts/10-convert-logs.sh
old mode 100644
new mode 100755
index 7ce3c09..89c0a0c
--- a/example-job-scripts/10-convert-logs.sh
+++ b/example-job-scripts/10-convert-logs.sh
@@ -13,7 +13,17 @@
 
 module load Anaconda3
 conda activate gpfs
 
-logs=($(find /data/rc/gpfs-policy/data -path "*/list-policy_data-project_list-path-external_slurm-*/chunks"))
+# listcmd env var sets the command to enumerate datasets to process
+# supports passing args during sbatch, e.g. listcmd="cat split-list" sbatch <thisscript>
+# note: maxdepth speeds execution of find by avoiding deep dirs
+listcmd=${listcmd:-find /data/rc/gpfs-policy/data -maxdepth 2 -path "*/list-policy_data-project_list-path-external_slurm-*/chunks"}
+
+logs=($($listcmd))
 log=${logs[${SLURM_ARRAY_TASK_ID}]}
-convert-to-parquet --batch --no-clobber --partition=amd-hdr100,express,intel-dcb ${log}
\ No newline at end of file
+
+# for lazy submit. only do work if there is work to do
+if [ ${SLURM_ARRAY_TASK_ID} -lt ${#logs[@]} ]
+then
+    convert-to-parquet --batch --no-clobber --partition=amd-hdr100,express,intel-dcb ${log}
+fi
diff --git a/example-job-scripts/20-convert-to-hive.sh b/example-job-scripts/20-convert-to-hive.sh
old mode 100644
new mode 100755
index 1d6333d..a83ae39
--- a/example-job-scripts/20-convert-to-hive.sh
+++ b/example-job-scripts/20-convert-to-hive.sh
@@ -14,14 +14,23 @@
 
 module load Anaconda3
 conda activate gpfs
 
+set -x
 device="data-project" # data-project, data-user, or scratch
+# listcmd env var sets the command to enumerate datasets to process
+# supports passing args during sbatch, e.g. listcmd="cat split-list" sbatch <thisscript>
+# note: maxdepth speeds execution of find by avoiding deep dirs
+# note: listcmd default must be assigned AFTER device so ${device} expands non-empty
+listcmd=${listcmd:-find /data/rc/gpfs-policy/data -maxdepth 2 -path "*/list-policy_${device}_list-path-external_slurm-*2025-01-21*/parquet"}
-parquets=($(find /data/rc/gpfs-policy/data -path "*/list-policy_${device}_list-path-external_slurm-*2025-01-21*/parquet"))
+parquets=($($listcmd))
 pq=${parquets[${SLURM_ARRAY_TASK_ID}]}
 
-convert-to-hive --batch \
-    --reservation=rc-gpfs \
-    --partition=amperenodes-reserve \
-    --mem=120G \
-    ${pq} \
-    /data/rc/gpfs-policy/data/gpfs-hive/${device}
+# for lazy submit. only do work if there is work to do
+if [ ${SLURM_ARRAY_TASK_ID} -lt ${#parquets[@]} ]
+then
+    convert-to-hive --batch \
+        --reservation=rc-gpfs \
+        --partition=amperenodes-reserve \
+        --mem=120G \
+        ${pq} \
+        /data/rc/gpfs-policy/data/gpfs-hive/${device}
+fi
-- 
GitLab