Skip to content
Snippets Groups Projects

Automate conversion of GPFS policy outputs to parquet without Jupyter

Merged Matthew K Defenderfer requested to merge convert-to-parquet into main
1 file
+ 45
42
Compare changes
  • Side-by-side
  • Inline
@@ -11,8 +11,7 @@ mem="16G"
time="02:00:00"
partition="amd-hdr100"
outdir=""
sif=""
default_sif_image="daskdev/dask:2024.8.0-py3.12"
sif="gitlab.rc.uab.edu:4567/mdefende/gpfs-policy:latest"
############################################################
# Help #
@@ -20,37 +19,36 @@ default_sif_image="daskdev/dask:2024.8.0-py3.12"
# Print a one-line synopsis to stderr and exit with status 1.
#
# Called when option parsing fails. Only options accepted by the getopt call
# that actually takes effect are listed; -s|--sif is not among them, so it is
# not advertised here.
#
# Outputs: synopsis on stderr (nothing on stdout)
# Exits:   1 (never returns to the caller)
usage()
{
  # Heredoc body and terminator stay at column 0: plain << does not strip
  # leading whitespace.
  >&2 cat << EOF
Usage: $0 [ -h ] [ -o | --outdir ] [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ] gpfs_logdir
EOF
  exit 1
}
# Print the full help text and exit with status 0.
#
# Invoked for -h|--help. Help that the user asked for is not an error, so it
# goes to stdout and the script exits 0. The text documents only the options
# the script parses (-h, -o, -n, -p, -t, -m) plus the required positional
# argument.
#
# Outputs: help text on stdout
# Exits:   0 (never returns to the caller)
help()
{
  # Heredoc body and terminator stay at column 0: plain << does not strip
  # leading whitespace.
  cat << EOF
Submits an array job to convert parts of a GPFS log to parquet format

Syntax: $0 [ -h ] [ -o | --outdir ] [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ] gpfs_logdir

options:
  -h|--help         Print this Help.

Required:
  gpfs_logdir       Directory containing GPFS log outputs

Path:
  -o|--outdir       Directory to save parquet files to

sbatch options:
  -n|--ntasks       Number of tasks for each array index (default: 1)
  -p|--partition    Partition to submit tasks to (default: amd-hdr100)
  -t|--time         Max walltime (default: 02:00:00)
  -m|--mem          Memory for each task (default: 16G)
EOF
  exit 0
}
# Normalize the command line with getopt (enhanced/util-linux syntax).
#   -a       also accept long options introduced by a single dash
#   -o ...   short options; a trailing ':' means the option takes a value
#   --long   the matching long option names
# getopt is run exactly once so that a parse error is reported a single time,
# and its exit status is tested directly instead of through $?.
# On failure, usage is called (it prints the synopsis and exits).
if ! args=$(getopt -a -o ho:n:p:t:m: --long help,outdir:,ntasks:,partition:,time:,mem: -- "$@"); then
  usage
fi
@@ -61,7 +59,6 @@ while :
do
case $1 in
-h | --help) help ;;
-s | --sif) sif=$2 ; shift 2 ;;
-o | --outdir) outdir=$2 ; shift 2 ;;
-n | --ntasks) ntasks=$2 ; shift 2 ;;
-p | --partition) partition=$2 ; shift 2 ;;
@@ -85,25 +82,31 @@ if [[ -z "$gpfs_logdir" ]]; then
exit 1
fi
if [[ -z "${sif}" && ! -f "dask.sif" ]]; then
echo "No SIF set, downloading ${default_sif_image} as dask.sif"
singularity pull dask.sif docker://${default_sif_image}
sif="dask.sif"
elif [[ -f "${sif}" ]]; then
echo "Singualrity file does not exist. Please pull the image first"
exit 1
# If outdir not set, set to ${gpfs_logdir}/parquet
if [[ -z "$outdir" ]]; then
outdir="${gpfs_logdir}/parquet"
fi
singularity pull --force gpfs.sif docker://${sif}
nlogs=$(ls ${gpfs_logdir}/list-* | wc -l)
>&2 echo "sif: ${sif}"
>&2 echo "output dir: ${outdir}"
>&2 echo "GPFS logs: ${gpfs_logdir}"
>&2 echo "ntasks: ${ntasks}"
>&2 echo "partition: ${partition}"
>&2 echo "time: ${time}"
>&2 echo "mem: ${mem}"
>&2 echo "singularity command: singularity exec --bind /data ${sif} python3 convert-to-parquet.py -o ${outdir} -f \${log}"
cmd="singularity exec --bind /data,/scratch gpfs.sif python3 convert-to-parquet.py -o ${outdir} -f \${log}"
>&2 cat << EOF
--------------------------------------------------------------------------------
sif: ${sif}
output dir: ${outdir}
GPFS logs: ${gpfs_logdir}
ntasks: ${ntasks}
partition: ${partition}
time: ${time}
mem: ${mem}
command: ${cmd}
--------------------------------------------------------------------------------
EOF
mkdir -p out
mkdir -p err
@@ -126,7 +129,7 @@ mkdir -p err
log=\$(ls ${gpfs_logdir}/list-* | awk "NR==\${SLURM_ARRAY_TASK_ID} { print \$1 }")
singularity exec --bind /data dask.sif python3 convert-to-parquet.py -o ${outdir} -f \${log}
${cmd}
EOF
exit 0
\ No newline at end of file
Loading