10-convert-logs.sh

#! /bin/bash
#
#SBATCH --job-name=convert
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --mem=8G
#SBATCH --partition=amd-hdr100,intel-dcb,express
#SBATCH --time=02:00:00
#SBATCH --output=out/convert-%A-%a.out
#SBATCH --error=out/convert-%A-%a.err
#SBATCH --array=0-49

# Command that enumerates the datasets to process; array task N handles the
# N-th entry (0-based) of its output.
# Override it at submit time through the environment, e.g.
#   listcmd="cat split-list" sbatch <thisscript>
# The command string is split on whitespace to form the command line, and its
# output is split on whitespace to form the dataset list.
# Note: -maxdepth speeds up find by keeping it out of deep directories.
default_listcmd='find /data/rc/gpfs-policy/data -maxdepth 2 -path */list-policy_data-project_list-path-external_slurm-*/chunks'
listcmd=${listcmd:-$default_listcmd}

#######################################
# Convert the dataset whose position in the listcmd output equals this
# array task's index.
# Globals:   listcmd (read), SLURM_ARRAY_TASK_ID (read)
# Arguments: none
# Outputs:   diagnostics on stderr
# Returns:   1 if SLURM_ARRAY_TASK_ID is not a non-negative integer,
#            0 if there is no dataset at this task's index,
#            otherwise the exit status of convert-to-parquet
#######################################
main() {
    local task_id=${SLURM_ARRAY_TASK_ID:-}
    local -a logs=()

    # Fail with a clear message instead of a "bad array subscript" /
    # "unary operator expected" error when run outside a job array.
    if [[ ! "$task_id" =~ ^[0-9]+$ ]]; then
        printf 'error: SLURM_ARRAY_TASK_ID must be a non-negative integer (got "%s")\n' \
            "$task_id" >&2
        return 1
    fi
    # Force base 10 so an index with leading zeros is not parsed as octal.
    task_id=$((10#$task_id))

    module load Anaconda3
    conda activate gpfs

    # listcmd is a command string, so it has to be expanded unquoted to be
    # split into words. Globbing is switched off around that expansion so the
    # wildcard pattern handed to find (and any wildcard characters in the
    # command's output) are not expanded against the current directory.
    # NOTE(review): every array task re-runs the list command, so the
    # index-to-dataset mapping relies on it printing the same order each time.
    set -f
    # shellcheck disable=SC2086,SC2207
    logs=($($listcmd))
    set +f

    # for lazy submit. only do work if there is work to do
    if ((task_id >= ${#logs[@]})); then
        printf 'task %d: no dataset at this index (%d listed); nothing to do\n' \
            "$task_id" "${#logs[@]}" >&2
        return 0
    fi

    convert-to-parquet --batch --no-clobber --partition=amd-hdr100,express,intel-dcb "${logs[task_id]}"
}

main "$@"