Skip to content
Snippets Groups Projects
Commit c764d3af authored by Matthew K Defenderfer's avatar Matthew K Defenderfer
Browse files

create runscript for dataset partitioning and transfer

parent 1a970090
No related branches found
No related tags found
1 merge request!9Draft: Partition parquet dataset for sync with s5cmd
#!/bin/bash
set -euo pipefail
############################################################
# Default Values #
############################################################
# Slurm resources requested for each array task.
ntasks=8
mem="16G"
time="12:00:00"
partition="amd-hdr100"
# Number of files per s5cmd partition and where partition files are stored.
split_count=10000
part_dir='./part'
# Container image reference, derived from the CI_REGISTRY_IMAGE environment
# variable. The ':+' form leaves sif empty when that variable is unset or
# empty instead of aborting right here under 'set -u', so the script can
# still start (for example to print its help text).
sif="${CI_REGISTRY_IMAGE:+${CI_REGISTRY_IMAGE}/s5cmd_dask:latest}"
############################################################
# Help #
############################################################
#######################################
# Print the short usage synopsis and abort the script.
# Globals:   none
# Arguments: none
# Outputs:   usage synopsis on stderr
# Returns:   does not return; exits with status 1 so that callers cannot
#            fall through into the rest of the script after a usage error
#######################################
usage()
{
>&2 cat << EOF
Usage: $0 [ -h ] [ -s | --sif ] [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ]
[ -c | --split-count ] [ -d | --part-dir ] filter input_parquet destination
EOF
exit 1
}
#######################################
# Print the full help text and terminate successfully.
# Globals:   none
# Arguments: none
# Outputs:   help text on stderr
# Returns:   does not return; exits with status 0
#
# The defaults quoted in the text must be kept in sync with the values in
# the "Default Values" section at the top of the script.
# NOTE(review): --dry-run is described below, but no option parsing for it
# is visible in this script - confirm whether it is implemented.
#######################################
help()
{
>&2 cat << EOF
Submits an array job to transfer files listed in a GPFS dataset to a bucket on LTS using s5cmd
Syntax: $0 [ -h ] [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ]
[ -c | --split-count ] [ -d | --part-dir ] filter input_parquet destination
General:
-h|--help Print this Help.
--dry-run Only print
Required:
filter Parent path to transfer. For example, /scratch/user1 will transfer all files in the GPFS log
that begin with /scratch/user1. Object prefixes will retain all subdirectory listings directly
underneath the filter. For example, a file with absolute path /scratch/user1/subdir1/file.txt
will be synced to an LTS bucket with prefix /bucket/subdir1.
input_parquet Path to the GPFS parquet dataset to read from
destination URI to sync data to. Only LTS buckets for now. Should be specified as 's3://bucket[/prefix]' where
any additional prefix is optional.
File Partition:
-c|--split-count Number of files to sync in each s5cmd partition (default: 10000)
-d|--part-dir Location to store the partition files (default: ./part)
sbatch:
-n|--ntasks Number of tasks for each array index (default: 8)
-p|--partition Partition to submit tasks to (default: amd-hdr100)
-t|--time Max walltime (default: 12:00:00)
-m|--mem Memory for each task (default: 16G)
EOF
exit 0
}
# Parse command-line options with enhanced getopt. The call is tested
# directly in the 'if' because, under 'set -e', a failing plain assignment
# would abort the script before a separate '$?' check could ever run.
if ! args=$(getopt -a -o hs:n:p:t:m:c:d: \
  --long help,sif:,ntasks:,partition:,time:,mem:,split-count:,part-dir: \
  -- "$@"); then
  usage
  exit 1
fi
# getopt emits a shell-quoted string; it must be passed to eval as a single
# quoted word so arguments containing whitespace survive intact.
eval set -- "${args}"
while :; do
  case "$1" in
    -h | --help) help ;;
    -s | --sif) sif=$2; shift 2 ;;
    -n | --ntasks) ntasks=$2; shift 2 ;;
    -p | --partition) partition=$2; shift 2 ;;
    -t | --time) time=$2; shift 2 ;;
    -m | --mem) mem=$2; shift 2 ;;
    -c | --split-count) split_count=$2; shift 2 ;;
    -d | --part-dir) part_dir=$2; shift 2 ;;
    --) shift; break ;;
    *)
      >&2 echo "Unsupported option: $1"
      usage
      # Exit explicitly: without it this arm would never shift and the
      # loop would spin forever on the same argument.
      exit 1
      ;;
  esac
done
# All three positionals are required. Check the count before touching
# $1..$3 so a short command line yields the usage text rather than an
# "unbound variable" abort from 'set -u'.
if [[ $# -lt 3 ]]; then
  >&2 echo "Missing positional argument"
  usage
  exit 1
fi
filter="$1"
input_parquet="$2"
destination="$3"
# Reject positionals that were supplied but are empty strings.
if [[ -z "${filter}" || -z "${input_parquet}" || -z "${destination}" ]]; then
  >&2 echo "Missing positional argument"
  usage
  exit 1
fi
# Fail early with a clear message when no container image reference is
# available, rather than letting the pull fail on a malformed URI.
if [[ -z "${sif}" ]]; then
  >&2 echo "No container image specified (is CI_REGISTRY_IMAGE set?)"
  exit 1
fi
# Fetch the container image into ./gpfs.sif.
# NOTE(review): singularity pull may refuse to overwrite an existing
# gpfs.sif on re-runs - confirm whether --force (or a skip) is wanted.
singularity pull gpfs.sif "docker://${sif}"
# Command that splits the parquet dataset into partition files. Kept as a
# flat string because it is both printed and later executed by word
# splitting; values containing whitespace are therefore not supported.
split_cmd="singularity exec --bind /data,/scratch \
gpfs.sif python3 fpart-db.py \
-c ${split_count} \
-p ${part_dir} \
-f ${filter} \
-i ${input_parquet} \
-d ${destination}"
# Command each array task runs. The '$' of SLURM_ARRAY_TASK_ID is escaped
# so the variable is expanded inside the running job, not here, where it
# is unset and would abort the script under 'set -u'.
# s5cmd's global worker-pool flag is --numworkers.
transfer_cmd="singularity exec --bind /data,/scratch \
gpfs.sif s5cmd \
--numworkers 8 \
--endpoint-url https://s3.lts.rc.uab.edu \
run ${part_dir}/part_\${SLURM_ARRAY_TASK_ID}.txt"
# Echo the effective configuration to stderr, one "label: value" per line,
# so it can be checked without mixing into stdout.
{
  printf '%s\n' \
    "filter: ${filter}" \
    "input parquet: ${input_parquet}" \
    "destination: ${destination}" \
    "sif: ${sif}" \
    "split count: ${split_count}" \
    "partition dir: ${part_dir}" \
    "ntasks: ${ntasks}" \
    "partition: ${partition}" \
    "time: ${time}" \
    "mem: ${mem}" \
    "split dataset command: ${split_cmd}" \
    "transfer command: ${transfer_cmd}"
} >&2
# Directories that receive the per-task Slurm stdout/stderr logs.
for log_dir in out err; do
  mkdir -p "${log_dir}"
done
############################################################
# Split Dataset #
############################################################
# Run the partitioning step. split_cmd is a flat command string, so it is
# intentionally left unquoted to be word-split into a command and its
# arguments.
# shellcheck disable=SC2086
$split_cmd
# Count the partition files with a glob instead of parsing 'ls' output:
# this is safe for unusual file names and does not descend into matching
# directories. nullglob makes an unmatched pattern expand to nothing.
shopt -s nullglob
part_files=("${part_dir}"/part*)
shopt -u nullglob
nparts=${#part_files[@]}
# An empty partition set would yield an invalid array range (1-0).
if (( nparts == 0 )); then
  >&2 echo "No partition files found in ${part_dir}"
  exit 1
fi
############################################################
# Create Array Job Script #
############################################################
#######################################
# Write the Slurm array job script to stdout.
# Globals:   ntasks, partition, time, mem, nparts, transfer_cmd (read)
# Arguments: none
# Outputs:   the generated batch script on stdout
#
# The job name carries no '%a': sbatch replacement symbols such as %A/%a
# are only substituted in file name options (--output/--error), so in
# --job-name the text would be taken literally.
# NOTE(review): the script is only printed here, not passed to sbatch -
# confirm whether submission is meant to happen in this step.
#######################################
emit_job_script() {
cat << EOF
#!/bin/bash
#
#SBATCH --job-name=s5-array
#SBATCH --ntasks=${ntasks}
#SBATCH --partition=${partition}
#SBATCH --time=${time}
#SBATCH --mem=${mem}
#SBATCH --output=out/%A_%a.out
#SBATCH --error=err/%A_%a.err
#SBATCH --array=1-${nparts}
${transfer_cmd}
EOF
}
emit_job_script
# Every preceding step completed (any failure would already have aborted
# the script under 'set -e'), so report success explicitly.
exit 0
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment