From 517774cf92b80b781eaa8a7ba959b7b8a522dcef Mon Sep 17 00:00:00 2001
From: John-Paul Robinson <jpr@uab.edu>
Date: Mon, 17 Feb 2025 12:15:18 -0600
Subject: [PATCH] Add listcmd param to split script

This lets the caller control the list of directories to split without
having to edit the script.

Added array task protections for lazy callers who don't care whether
the array size matches the number of work tasks.
---
 example-job-scripts/00-split-logs.sh | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)
 mode change 100644 => 100755 example-job-scripts/00-split-logs.sh

diff --git a/example-job-scripts/00-split-logs.sh b/example-job-scripts/00-split-logs.sh
old mode 100644
new mode 100755
index 07c4daf..f152f1a
--- a/example-job-scripts/00-split-logs.sh
+++ b/example-job-scripts/00-split-logs.sh
@@ -13,7 +13,17 @@
 module load Anaconda3
 conda activate gpfs
 
-logs=($(find /data/rc/gpfs-policy/data -path "*/list-policy_data-user_list-path-external_slurm-31[35]*/raw/*.gz"))
+# listcmd env var sets the command to enumerate datasets to process
+# supports passing args during sbatch, e.g. listcmd="cat split-list" sbatch <thisscript>
+# note: maxdepth speeds execution of find by avoiding deep dirs
+listcmd=${listcmd:-find /data/rc/gpfs-policy/data -maxdepth 3 -path "*/list-policy_data-user_list-path-external_slurm-31[35]*/raw/*.gz"}
+
+logs=($($listcmd))
 log=${logs[${SLURM_ARRAY_TASK_ID}]}
 
-split-log --no-clobber ${log}
\ No newline at end of file
+
+# For lazy submits: the array may be larger than the number of logs, so only
+# run split-log when this task's index maps to an entry in logs.
+if [ "${SLURM_ARRAY_TASK_ID}" -lt "${#logs[@]}" ]; then
+    split-log --no-clobber "${log}"
+fi
-- 
GitLab