From 266b0c3a38e45db6fcaff2deb99c7031e7c24500 Mon Sep 17 00:00:00 2001
From: John-Paul Robinson <jpr@uab.edu>
Date: Mon, 17 Feb 2025 13:05:22 -0600
Subject: [PATCH] Add caller defined worklist and lazy array invocation

Lets caller control the list of directories to operate on without
having to edit the script.

Added array task protections for lazy callers who don't care if array
size aligns with work tasks.
---
 example-job-scripts/10-convert-logs.sh    | 14 ++++++++++++--
 example-job-scripts/20-convert-to-hive.sh | 23 ++++++++++++++++-------
 2 files changed, 28 insertions(+), 9 deletions(-)
 mode change 100644 => 100755 example-job-scripts/10-convert-logs.sh
 mode change 100644 => 100755 example-job-scripts/20-convert-to-hive.sh

diff --git a/example-job-scripts/10-convert-logs.sh b/example-job-scripts/10-convert-logs.sh
old mode 100644
new mode 100755
index 7ce3c09..89c0a0c
--- a/example-job-scripts/10-convert-logs.sh
+++ b/example-job-scripts/10-convert-logs.sh
@@ -13,7 +13,17 @@
 
 module load Anaconda3
 conda activate gpfs
 
-logs=($(find /data/rc/gpfs-policy/data -path "*/list-policy_data-project_list-path-external_slurm-*/chunks"))
+# listcmd env var sets the command to enumerate datasets to process
+# supports passing args during sbatch, e.g. listcmd="cat split-list" sbatch <thisscript>
+# note: maxdepth speeds execution of find by avoiding deep dirs
+listcmd=${listcmd:-find /data/rc/gpfs-policy/data -maxdepth 2 -path "*/list-policy_data-project_list-path-external_slurm-*/chunks"}
+
+logs=($($listcmd))
 log=${logs[${SLURM_ARRAY_TASK_ID}]}
-convert-to-parquet --batch --no-clobber --partition=amd-hdr100,express,intel-dcb ${log}
\ No newline at end of file
+
+# for lazy submit. only do work if there is work to do
+if [ ${SLURM_ARRAY_TASK_ID} -lt ${#logs[@]} ]
+then
+    convert-to-parquet --batch --no-clobber --partition=amd-hdr100,express,intel-dcb ${log}
+fi
diff --git a/example-job-scripts/20-convert-to-hive.sh b/example-job-scripts/20-convert-to-hive.sh
old mode 100644
new mode 100755
index 1d6333d..a83ae39
--- a/example-job-scripts/20-convert-to-hive.sh
+++ b/example-job-scripts/20-convert-to-hive.sh
@@ -14,14 +14,23 @@
 
 module load Anaconda3
 conda activate gpfs
 
+set -x
 device="data-project" # data-project, data-user, or scratch
+# listcmd env var sets the command to enumerate datasets to process
+# supports passing args during sbatch, e.g. listcmd="cat split-list" sbatch <thisscript>
+# note: maxdepth speeds execution of find by avoiding deep dirs
+# note: listcmd default must be assigned AFTER device so ${device} expands non-empty
+listcmd=${listcmd:-find /data/rc/gpfs-policy/data -maxdepth 2 -path "*/list-policy_${device}_list-path-external_slurm-*2025-01-21*/parquet"}
-parquets=($(find /data/rc/gpfs-policy/data -path "*/list-policy_${device}_list-path-external_slurm-*2025-01-21*/parquet"))
+parquets=($($listcmd))
 pq=${parquets[${SLURM_ARRAY_TASK_ID}]}
 
-convert-to-hive --batch \
-    --reservation=rc-gpfs \
-    --partition=amperenodes-reserve \
-    --mem=120G \
-    ${pq} \
-    /data/rc/gpfs-policy/data/gpfs-hive/${device}
+# for lazy submit. only do work if there is work to do
+if [ ${SLURM_ARRAY_TASK_ID} -lt ${#parquets[@]} ]
+then
+    convert-to-hive --batch \
+        --reservation=rc-gpfs \
+        --partition=amperenodes-reserve \
+        --mem=120G \
+        ${pq} \
+        /data/rc/gpfs-policy/data/gpfs-hive/${device}
+fi
-- 
GitLab