Skip to content
Snippets Groups Projects
Commit 266b0c3a authored by John-Paul Robinson's avatar John-Paul Robinson
Browse files

Add caller defined worklist and lazy array invocation

Lets caller control the list of directories to operate on without
having to edit the script.

Added array task protections for lazy callers who don't care if
array size aligns with work tasks.
parent 6397cc32
No related branches found
No related tags found
1 merge request: !52 Draft: Improvements to post processing workflow to make ad hoc use of scripts easier
......@@ -13,7 +13,17 @@
# NOTE(review): this hunk appears to show the commit's removed and added lines
# together without +/- markers, so some statements occur twice — confirm
# against the real script before running any of it.
# Load the Anaconda3 environment module and activate the "gpfs" conda environment.
module load Anaconda3
conda activate gpfs
# Fill the logs array with the word-split output of a find over the data tree.
logs=($(find /data/rc/gpfs-policy/data -path "*/list-policy_data-project_list-path-external_slurm-*/chunks"))
# listcmd env var sets the command to enumerate datasets to process
# supports passing args during sbatch, e.g. listcmd="cat split-list" sbatch <thisscript>
# note: maxdepth speeds execution of find by avoiding deep dirs
listcmd=${listcmd:-find /data/rc/gpfs-policy/data -maxdepth 2 -path "*/list-policy_data-project_list-path-external_slurm-*/chunks"}
# Run listcmd (expanded unquoted, so it is word-split into a command and its
# arguments) and store its word-split output in logs, replacing any earlier value.
logs=($($listcmd))
# Pick the entry whose index equals this job's SLURM array task id.
log=${logs[${SLURM_ARRAY_TASK_ID}]}
convert-to-parquet --batch --no-clobber --partition=amd-hdr100,express,intel-dcb ${log}
\ No newline at end of file
# for lazy submit. only do work if there is work to do
# i.e. skip the conversion when the task id is not a valid index into logs
if [ ${SLURM_ARRAY_TASK_ID} -lt ${#logs[@]} ]
then
convert-to-parquet --batch --no-clobber --partition=amd-hdr100,express,intel-dcb ${log}
fi
......@@ -14,14 +14,23 @@
# Load the Anaconda3 environment module and activate the "gpfs" conda environment.
module load Anaconda3
conda activate gpfs
# Trace every command to stderr as it executes.
set -x
# device selects which dataset family to process. It must be assigned before
# the listcmd default below: that default expands ${device} at assignment
# time, so defining device afterwards leaves an empty name in the find pattern
# ("list-policy__list-path-...") and the default listing matches nothing.
device="data-project" # data-project, data-user, or scratch
# listcmd env var sets the command to enumerate datasets to process
# supports passing args during sbatch, e.g. listcmd="cat split-list" sbatch <thisscript>
# note: maxdepth speeds execution of find by avoiding deep dirs
listcmd=${listcmd:-find /data/rc/gpfs-policy/data -maxdepth 2 -path "*/list-policy_${device}_list-path-external_slurm-*2025-01-21*/parquet"}
# NOTE(review): this hunk appears to show the commit's removed and added lines
# together without +/- markers, so some statements occur twice — confirm
# against the real script before running any of it.
# Fill the parquets array with the word-split output of a find over the data
# tree, using ${device} inside the path pattern.
parquets=($(find /data/rc/gpfs-policy/data -path "*/list-policy_${device}_list-path-external_slurm-*2025-01-21*/parquet"))
# Run listcmd (expanded unquoted, so it is word-split into a command and its
# arguments) and store its word-split output in parquets, replacing any earlier value.
parquets=($($listcmd))
# Pick the entry whose index equals this job's SLURM array task id.
pq=${parquets[${SLURM_ARRAY_TASK_ID}]}
# Invoke convert-to-hive on the selected entry; the second path is presumably
# the per-device output location — confirm against convert-to-hive usage.
convert-to-hive --batch \
--reservation=rc-gpfs \
--partition=amperenodes-reserve \
--mem=120G \
${pq} \
/data/rc/gpfs-policy/data/gpfs-hive/${device}
# for lazy submit. only do work if there is work to do
# i.e. skip the conversion when the task id is not a valid index into parquets
if [ ${SLURM_ARRAY_TASK_ID} -lt ${#parquets[@]} ]
then
convert-to-hive --batch \
--reservation=rc-gpfs \
--partition=amperenodes-reserve \
--mem=120G \
${pq} \
/data/rc/gpfs-policy/data/gpfs-hive/${device}
fi
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment