diff --git a/prep-parquet-for-s5cmd/run-fpart-db.sh b/prep-parquet-for-s5cmd/run-fpart-db.sh old mode 100644 new mode 100755 index 551d596bead714ccbe5553d94c6d3ee899379332..7e5314ea19a6fb80d41f8180a807107a337251ec --- a/prep-parquet-for-s5cmd/run-fpart-db.sh +++ b/prep-parquet-for-s5cmd/run-fpart-db.sh @@ -12,7 +12,9 @@ time="12:00:00" partition="amd-hdr100" split_count=10000 part_dir='./part' -sif=${CI_REGISTRY_IMAGE}/s5cmd_dask:latest +sif="gitlab.rc.uab.edu:4567/mdefende/gpfs-policy:latest" +credentials_file="${HOME}/.aws/credentials" +profile="default" ############################################################ # Help # @@ -20,8 +22,10 @@ sif=${CI_REGISTRY_IMAGE}/s5cmd_dask:latest usage() { >&2 cat << EOF -Usage: $0 [ -h ] [ -s | --sif ] [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ] - [ -c | --split-count ] [ -d | --part-dir ] filter input_parquet destination +Usage: $0 [ -h ] [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ] + [ -c | --split-count ] [ -d | --part-dir ] + [ -a | --aws-credentials-file ] [ -u | --credentials-profile ] + filter input_parquet destination EOF } @@ -30,36 +34,42 @@ help() # Display Help >&2 cat << EOF Submits an array job to transfer files listed in a GPFS dataset to a bucket on LTS using s5cmd -Syntax: $0 [ -h ] [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ] - [ -c | --split-count ] [ -d | --part-dir ] filter input_parquet destination +Usage: $0 [ -h ] [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ] + [ -c | --split-count ] [ -d | --part-dir ] + [ -a | --aws-credentials-file ] [ -u | --credentials-profile ] + filter input_parquet destination General: -h|--help Print this Help. --dry-run Only print Required: - filter Parent path to transfer. For example, /scratch/user1 will transfer all files in the GPFS log - that begin with /scratch/user1. Object prefixes will retain all subdirectory listings directly - underneath the filter. 
For example, a file with absolute path /scratch/user1/subdir1/file.txt - will be synced to an LTS bucket with prefix /bucket/subdir1. - input_parquet Path to the GPFS parquet dataset to read from - destination URI to sync data to. Only LTS buckets for now. Should be specified as 's3://bucket[/prefix] where - any additional prefix is optional. - -File Partition: - -c|--split-count Number of files to sync in each s5cmd partition (default: 10000) - -d|--part-dir Location to store the partition files (default: ./part) - -sbatch: - -n|--ntasks Number of tasks for each array index (default: 1) - -p|--partition Partition to submit tasks to (default: amd-hdr100) - -t|--time Max walltime (default: 02:00:00) - -m|--mem Memory for each task (default: 16G) + filter Parent path to transfer. For example, /scratch/user1 will transfer all files in the GPFS log + that begin with /scratch/user1. Object prefixes will retain all subdirectory listings directly + underneath the filter. For example, a file with absolute path /scratch/user1/subdir1/file.txt + will be synced to an LTS bucket with prefix /bucket/subdir1. + input_parquet Path to the GPFS parquet dataset to read from + destination URI to sync data to. Only LTS buckets for now. Should be specified as 's3://bucket[/prefix]' where + any additional prefix is optional.
+ +File Partition: + -c|--split-count Number of files to sync in each s5cmd partition (default: 10000) + -d|--part-dir Location to store the partition files (default: ./part) + +s5cmd: + -a|--aws-credentials-file Path to AWS credentials file containing the access and secret keys (default: $HOME/.aws/credentials) + -u|--credentials-profile Profile to use in the AWS credentials (default: default) + +Job Parameters for File Transfer: + -n|--ntasks Number of tasks for each array index (default: 8) + -p|--partition Partition to submit tasks to (default: amd-hdr100) + -t|--time Max walltime (default: 12:00:00) + -m|--mem Memory for each task (default: 16G) EOF exit 0 } -args=$(getopt -a -o hn:p:t:m:c:d: --long help,ntasks:,partition:,time:,mem:,split-count:,part-dir: -- "$@") +args=$(getopt -a -o hn:p:t:m:c:d:a:u: --long help,ntasks:,partition:,time:,mem:,split-count:,part-dir:,aws-credentials-file:,credentials-profile: -- "$@") if [[ $? -gt 0 ]]; then usage fi @@ -69,13 +79,15 @@ eval set -- ${args} while : do case $1 in - -h | --help) help ;; - -n | --ntasks) ntasks=$2 ; shift 2 ;; - -p | --partition) partition=$2 ; shift 2 ;; - -t | --time) time=$2 ; shift 2 ;; - -m | --mem) mem=$2 ; shift 2 ;; - -c | --split-count) split_count=$2 ; shift 2 ;; - -d | --part-dir) part_dir=$2 ; shift 2 ;; + -h | --help) help ;; + -n | --ntasks) ntasks=$2 ; shift 2 ;; + -p | --partition) partition=$2 ; shift 2 ;; + -t | --time) time=$2 ; shift 2 ;; + -m | --mem) mem=$2 ; shift 2 ;; + -c | --split-count) split_count=$2 ; shift 2 ;; + -d | --part-dir) part_dir=$2 ; shift 2 ;; + -a | --aws-credentials-file) credentials_file=$2 ; shift 2 ;; + -u | --credentials-profile) profile=$2 ; shift 2 ;; --) shift; break ;; *) >&2 echo Unsupported option: $1 usage ;; @@ -109,8 +121,11 @@ split_cmd="singularity exec --bind /data,/scratch \ transfer_cmd="singularity exec --bind /data,/scratch \ gpfs.sif s5cmd \ - --nworkers 8 \ + --numworkers ${ntasks} \ --endpoint-url https://s3.lts.rc.uab.edu \ + 
--credentials-file ${credentials_file} \ + --profile $profile \ + --retry-count 3 \ run ${part_dir}/part_${SLURM_ARRAY_TASK_ID}.txt" >&2 cat << EOF @@ -123,6 +138,9 @@ sif: ${sif} split count: ${split_count} partition dir: ${part_dir} +credentials file: ${credentials_file} +credentials profile: ${profile} + ntasks: ${ntasks} partition: ${partition} time: ${time}