#!/bin/bash
#
# run-fpart-db.sh
#
# Splits the file listing held in a GPFS parquet dataset into partition files
# (via fpart-db.py inside a Singularity container) and then prints, on stdout,
# an sbatch array job script that runs s5cmd once per partition file.
#
# stdout carries ONLY the generated job script; every diagnostic is written to
# stderr so that the output can be redirected to a file or piped onwards.
#
# Environment:
#   CI_REGISTRY_IMAGE  Registry path used to build the default container image
#                      name. Required unless -s|--sif is given.

set -euo pipefail

############################################################
# Default Values                                           #
############################################################

readonly DEFAULT_NTASKS=8
readonly DEFAULT_MEM="16G"
readonly DEFAULT_TIME="12:00:00"
readonly DEFAULT_PARTITION="amd-hdr100"
readonly DEFAULT_SPLIT_COUNT=10000
readonly DEFAULT_PART_DIR='./part'

ntasks=${DEFAULT_NTASKS}
mem=${DEFAULT_MEM}
time=${DEFAULT_TIME}
partition=${DEFAULT_PARTITION}
split_count=${DEFAULT_SPLIT_COUNT}
part_dir=${DEFAULT_PART_DIR}
# Resolved after option parsing so that -h works without CI_REGISTRY_IMAGE set.
sif=""
dry_run=0

############################################################
# Helpers                                                  #
############################################################

# Print an error message to stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

############################################################
# Help                                                     #
############################################################

# Print the one-paragraph synopsis to stderr. Does not exit; callers decide.
usage() {
  cat >&2 << EOF
Usage: $0 [ -h ] [ --dry-run ] [ -s | --sif ] [ -n | --ntasks ] [ -p | --partition ] [ -t | --time ] [ -m | --mem ]
          [ -c | --split-count ] [ -d | --part-dir ] filter input_parquet destination
EOF
}

# Print the full help text to stderr and exit successfully.
help() {
  cat >&2 << EOF
Prepares an array job to transfer files listed in a GPFS dataset to a bucket on LTS using s5cmd.
The generated sbatch script is written to stdout.
Syntax: $0 [ -h ] [ --dry-run ] [ -s | --sif ] [ -n | --ntasks ] [ -p | --partition ] [ -t | --time ] [ -m | --mem ]
          [ -c | --split-count ] [ -d | --part-dir ] filter input_parquet destination

General:
    -h|--help           Print this Help.
    --dry-run           Only print the resolved settings and commands, then exit

Required:
    filter              Parent path to transfer. For example, /scratch/user1 will transfer all files in the GPFS log
                            that begin with /scratch/user1. Object prefixes will retain all subdirectory listings directly
                            underneath the filter. For example, a file with absolute path /scratch/user1/subdir1/file.txt
                            will be synced to an LTS bucket with prefix /bucket/subdir1.
    input_parquet       Path to the GPFS parquet dataset to read from
    destination         URI to sync data to. Only LTS buckets for now. Should be specified as 's3://bucket[/prefix]' where
                            any additional prefix is optional.

Container:
    -s|--sif            Container image to pull (default: \${CI_REGISTRY_IMAGE}/s5cmd_dask:latest)

File Partition:
    -c|--split-count    Number of files to sync in each s5cmd partition (default: ${DEFAULT_SPLIT_COUNT})
    -d|--part-dir       Location to store the partition files (default: ${DEFAULT_PART_DIR})

sbatch:
    -n|--ntasks         Number of tasks for each array index (default: ${DEFAULT_NTASKS})
    -p|--partition      Partition to submit tasks to (default: ${DEFAULT_PARTITION})
    -t|--time           Max walltime (default: ${DEFAULT_TIME})
    -m|--mem            Memory for each task (default: ${DEFAULT_MEM})
EOF
  exit 0
}

############################################################
# Argument Parsing                                         #
############################################################

# Test getopt inside the `if` so a parse failure reaches usage instead of
# tripping `set -e` on the assignment.
if ! args=$(getopt -a -o hs:n:p:t:m:c:d: \
  --long help,dry-run,sif:,ntasks:,partition:,time:,mem:,split-count:,part-dir: \
  -- "$@"); then
  usage
  exit 1
fi

# getopt emits shell-quoted words; the expansion must be quoted so that eval
# sees them intact (arguments containing whitespace survive).
eval set -- "${args}"

while :; do
  case "$1" in
    -h | --help) help ;;
    --dry-run)
      dry_run=1
      shift
      ;;
    -s | --sif)
      sif=$2
      shift 2
      ;;
    -n | --ntasks)
      ntasks=$2
      shift 2
      ;;
    -p | --partition)
      partition=$2
      shift 2
      ;;
    -t | --time)
      time=$2
      shift 2
      ;;
    -m | --mem)
      mem=$2
      shift 2
      ;;
    -c | --split-count)
      split_count=$2
      shift 2
      ;;
    -d | --part-dir)
      part_dir=$2
      shift 2
      ;;
    --)
      shift
      break
      ;;
    *)
      printf 'Unsupported option: %s\n' "$1" >&2
      usage
      exit 1
      ;;
  esac
done

# All three positionals are required; checking the count first avoids an
# "unbound variable" abort from `set -u` when fewer are supplied.
if [[ $# -lt 3 ]]; then
  printf '%s\n' "Missing positional argument" >&2
  usage
  exit 1
fi

filter="$1"
input_parquet="$2"
destination="$3"

# Ensure positional arguments are not empty strings
if [[ -z "${filter}" || -z "${input_parquet}" || -z "${destination}" ]]; then
  printf '%s\n' "Missing positional argument" >&2
  usage
  exit 1
fi

# Fall back to the registry-derived image only when --sif was not supplied.
if [[ -z "${sif}" ]]; then
  [[ -n "${CI_REGISTRY_IMAGE:-}" ]] \
    || die "CI_REGISTRY_IMAGE is not set; pass -s|--sif to choose the container image"
  sif="${CI_REGISTRY_IMAGE}/s5cmd_dask:latest"
fi

############################################################
# Build Commands                                           #
############################################################

# Kept as an array so that paths containing whitespace are passed intact.
split_cmd=(
  singularity exec --bind /data,/scratch
  gpfs.sif python3 fpart-db.py
  -c "${split_count}"
  -p "${part_dir}"
  -f "${filter}"
  -i "${input_parquet}"
  -d "${destination}"
)

# The transfer command is embedded as text in the generated job script, so the
# partition directory is shell-quoted here. SLURM_ARRAY_TASK_ID is escaped on
# purpose: it must stay literal and be expanded by each array task at run
# time, not while this generator is running.
printf -v part_dir_quoted '%q' "${part_dir}"
transfer_cmd="singularity exec --bind /data,/scratch"
transfer_cmd+=" gpfs.sif s5cmd"
transfer_cmd+=" --nworkers 8"
transfer_cmd+=" --endpoint-url https://s3.lts.rc.uab.edu"
transfer_cmd+=" run ${part_dir_quoted}/part_\${SLURM_ARRAY_TASK_ID}.txt"

cat >&2 << EOF
filter:                 ${filter}
input parquet:          ${input_parquet}
destination:            ${destination}

sif:                    ${sif}

split count:            ${split_count}
partition dir:          ${part_dir}

ntasks:                 ${ntasks}
partition:              ${partition}
time:                   ${time}
mem:                    ${mem}

split dataset command:  ${split_cmd[*]}
transfer command:       ${transfer_cmd}
EOF

if ((dry_run)); then
  exit 0
fi

# Container output is sent to stderr so stdout stays a clean job script.
# NOTE(review): the pull presumably fails if gpfs.sif already exists from an
# earlier run -- confirm and decide whether to reuse or force-overwrite it.
singularity pull gpfs.sif "docker://${sif}" >&2

# Directories referenced by the #SBATCH --output / --error patterns below.
mkdir -p out
mkdir -p err

############################################################
# Split Dataset                                            #
############################################################

"${split_cmd[@]}" >&2

# Count partition files with a glob rather than by parsing ls output.
shopt -s nullglob
part_files=("${part_dir}"/part*)
shopt -u nullglob
nparts=${#part_files[@]}

if ((nparts == 0)); then
  die "No partition files found in ${part_dir}"
fi

############################################################
# Create Array Job Script                                  #
############################################################

cat << EOF
#!/bin/bash
#
#SBATCH --job-name=s5-array-%a
#SBATCH --ntasks=${ntasks}
#SBATCH --partition=${partition}
#SBATCH --time=${time}
#SBATCH --mem=${mem}
#SBATCH --output=out/%A_%a.out
#SBATCH --error=err/%A_%a.err
#SBATCH --array=1-${nparts}

${transfer_cmd}
EOF

exit 0