Commit 14426714 authored by Matthew K Defenderfer

change default sif file and add credentials options for s5cmd

parent 18300268
Merge request !9 (Draft): Partition parquet dataset for sync with s5cmd
@@ -12,7 +12,9 @@ time="12:00:00"
 partition="amd-hdr100"
 split_count=10000
 part_dir='./part'
-sif=${CI_REGISTRY_IMAGE}/s5cmd_dask:latest
+sif="gitlab.rc.uab.edu:4567/mdefende/gpfs-policy:latest"
+credentials_file="${HOME}/.aws/credentials"
+profile="default"
 ############################################################
 #                          Help                            #
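The two new defaults point s5cmd at a standard AWS-style shared credentials file. As a reference sketch (the profile name `lts` and the key values below are placeholders, not anything from this repo), such a file looks like:

    # ~/.aws/credentials -- placeholder values, not real keys
    [default]
    aws_access_key_id = AKIAXXXXXXXXXXXXXXXX
    aws_secret_access_key = xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

    [lts]
    aws_access_key_id = AKIAYYYYYYYYYYYYYYYY
    aws_secret_access_key = yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy

The profile selected with `-u | --credentials-profile` names one of these sections; with the defaults above, s5cmd reads the keys under `[default]`.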
@@ -20,8 +22,10 @@ sif=${CI_REGISTRY_IMAGE}/s5cmd_dask:latest
 usage()
 {
 >&2 cat << EOF
-Usage: $0 [ -h ] [ -s | --sif ] [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ]
-          [ -c | --split-count ] [ -d | --part-dir ] filter input_parquet destination
+Usage: $0 [ -h ] [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ]
+          [ -c | --split-count ] [ -d | --part-dir ]
+          [ -a | --aws-credentials-file ] [ -u | --credentials-profile ]
+          filter input_parquet destination
 EOF
 }
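Put together, a hypothetical invocation with the new flags (the script name, paths, bucket, and profile below are illustrative, not taken from the repo) would look like:

    # sketch only -- substitute your own filter path, parquet dataset, and bucket
    ./submit-s5cmd-sync.sh \
        --split-count 5000 \
        --part-dir /scratch/$USER/part \
        --aws-credentials-file $HOME/.aws/credentials \
        --credentials-profile lts \
        /scratch/user1 /data/gpfs-logs/list-policy.parquet s3://my-bucket

The three positional arguments at the end are the required filter, input_parquet, and destination described in the help text below.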
@@ -30,36 +34,42 @@ help()
 # Display Help
 >&2 cat << EOF
 Submits an array job to transfer files listed in a GPFS dataset to a bucket on LTS using s5cmd
-Syntax: $0 [ -h ] [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ]
-        [ -c | --split-count ] [ -d | --part-dir ] filter input_parquet destination
+Usage: $0 [ -h ] [ -n | --ntasks ] [ -p | --partition] [ -t | --time ] [ -m | --mem ]
+       [ -c | --split-count ] [ -d | --part-dir ]
+       [ -a | --aws-credentials-file ] [ -u | --credentials-profile ]
+       filter input_parquet destination
 General:
 -h|--help          Print this Help.
 --dry-run          Only print
 Required:
-filter             Parent path to transfer. For example, /scratch/user1 will transfer all files in the GPFS log
-                   that begin with /scratch/user1. Object prefixes will retain all subdirectory listings directly
-                   underneath the filter. For example, a file with absolute path /scratch/user1/subdir1/file.txt
-                   will be synced to an LTS bucket with prefix /bucket/subdir1.
-input_parquet      Path to the GPFS parquet dataset to read from
-destination        URI to sync data to. Only LTS buckets for now. Should be specified as 's3://bucket[/prefix] where
-                   any additional prefix is optional.
-File Partition:
--c|--split-count   Number of files to sync in each s5cmd partition (default: 10000)
--d|--part-dir      Location to store the partition files (default: ./part)
-sbatch:
--n|--ntasks        Number of tasks for each array index (default: 1)
--p|--partition     Partition to submit tasks to (default: amd-hdr100)
--t|--time          Max walltime (default: 02:00:00)
--m|--mem           Memory for each task (default: 16G)
+filter                     Parent path to transfer. For example, /scratch/user1 will transfer all files in the GPFS log
+                           that begin with /scratch/user1. Object prefixes will retain all subdirectory listings directly
+                           underneath the filter. For example, a file with absolute path /scratch/user1/subdir1/file.txt
+                           will be synced to an LTS bucket with prefix /bucket/subdir1.
+input_parquet              Path to the GPFS parquet dataset to read from
+destination                URI to sync data to. Only LTS buckets for now. Should be specified as 's3://bucket[/prefix] where
+                           any additional prefix is optional.
+File Partition:
+-c|--split-count           Number of files to sync in each s5cmd partition (default: 10000)
+-d|--part-dir              Location to store the partition files (default: ./part)
+s5cmd:
+-a|--aws-credentials-file  Path to AWS credentials file containing the access and secret keys (default: $HOME/.aws/credentials)
+-u|--credentials-profile   Profile to use in the AWS credentials (default: default)
+Job Parameters for File Transfer:
+-n|--ntasks                Number of tasks for each array index (default: 8)
+-p|--partition             Partition to submit tasks to (default: amd-hdr100)
+-t|--time                  Max walltime (default: 12:00:00)
+-m|--mem                   Memory for each task (default: 16G)
 EOF
 exit 0
 }
-args=$(getopt -a -o hn:p:t:m:c:d: --long help,ntasks:,partition:,time:,mem:,split-count:,part-dir: -- "$@")
+args=$(getopt -a -o hn:p:t:m:c:d:a:u: --long help,ntasks:,partition:,time:,mem:,split-count:,part-dir:,aws-credentials-file:,credentials-profile: -- "$@")
 if [[ $? -gt 0 ]]; then
     usage
 fi
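The filter-to-prefix behavior described in the help text can be sketched in plain shell. This is only an illustration of the mapping; the real implementation partitions the parquet dataset, and the variable names here are made up:

    # hypothetical illustration of the mapping described in the help text
    filter=/scratch/user1
    src=/scratch/user1/subdir1/file.txt
    bucket=s3://bucket

    key=${src#"${filter}"/}       # strips the filter -> subdir1/file.txt
    echo "${bucket}/${key}"       # -> s3://bucket/subdir1/file.txt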
@@ -69,13 +79,15 @@ eval set -- ${args}
 while :
 do
 case $1 in
-    -h | --help) help ;;
-    -n | --ntasks) ntasks=$2 ; shift 2 ;;
-    -p | --partition) partition=$2 ; shift 2 ;;
-    -t | --time) time=$2 ; shift 2 ;;
-    -m | --mem) mem=$2 ; shift 2 ;;
-    -c | --split-count) split_count=$2 ; shift 2 ;;
-    -d | --part-dir) part_dir=$2 ; shift 2 ;;
+    -h | --help) help ;;
+    -n | --ntasks) ntasks=$2 ; shift 2 ;;
+    -p | --partition) partition=$2 ; shift 2 ;;
+    -t | --time) time=$2 ; shift 2 ;;
+    -m | --mem) mem=$2 ; shift 2 ;;
+    -c | --split-count) split_count=$2 ; shift 2 ;;
+    -d | --part-dir) part_dir=$2 ; shift 2 ;;
+    -a | --aws-credentials-file) credentials_file=$2 ; shift 2 ;;
+    -u | --credentials-profile) profile=$2 ; shift 2 ;;
     --) shift; break ;;
     *) >&2 echo Unsupported option: $1
        usage ;;
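For context on why this loop can assume well-formed input: GNU getopt rewrites the raw arguments into a normalized, quoted list that `eval set --` reinstates as the positional parameters. A rough illustration, with made-up argument values:

    # hypothetical input:
    #   --ntasks 4 -u lts /scratch/user1 ds.parquet s3://bucket
    # normalized getopt output consumed by `eval set -- ${args}`:
    #   --ntasks '4' -u 'lts' -- '/scratch/user1' 'ds.parquet' 's3://bucket'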
@@ -109,8 +121,11 @@ split_cmd="singularity exec --bind /data,/scratch \
 transfer_cmd="singularity exec --bind /data,/scratch \
     gpfs.sif s5cmd \
-    --nworkers 8 \
+    --numworkers ${ntasks} \
     --endpoint-url https://s3.lts.rc.uab.edu \
+    --credentials-file ${credentials_file} \
+    --profile $profile \
     --retry-count 3 \
     run ${part_dir}/part_${SLURM_ARRAY_TASK_ID}.txt"
 >&2 cat << EOF
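Two things are worth noting in this hunk. First, s5cmd's concurrency flag is `--numworkers`, so the old `--nworkers 8` is replaced and the value is now driven by `${ntasks}`, keeping transfer parallelism in step with the requested CPUs. Second, `s5cmd run` executes one command per line from the partition file, so a file such as `part_0.txt` would contain lines along these lines (paths and bucket are illustrative; the exact command form depends on the splitting step, which is not shown in this diff):

    cp /scratch/user1/subdir1/file.txt s3://bucket/subdir1/file.txt
    cp /scratch/user1/subdir2/data.bin s3://bucket/subdir2/data.bin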
@@ -123,6 +138,9 @@ sif: ${sif}
 split count: ${split_count}
 partition dir: ${part_dir}
+credentials file: ${credentials_file}
+credentials profile: ${profile}
 ntasks: ${ntasks}
 partition: ${partition}
 time: ${time}