Skip to content
Snippets Groups Projects

Draft: Partition parquet dataset for sync with s5cmd

Closed Matthew K Defenderfer requested to merge partition-parquet-dataset into main
1 file
+ 9
- 4
Compare changes
  • Side-by-side
  • Inline
@@ -109,7 +109,7 @@ if [[ -z $filter || -z $input_parquet || -z $destination ]]; then
exit 1
fi
-singularity pull gpfs.sif docker://${sif}
+singularity pull --force gpfs.sif docker://${sif}
split_cmd="singularity exec --bind /data,/scratch \
gpfs.sif python3 fpart-db.py \
@@ -126,9 +126,10 @@ transfer_cmd="singularity exec --bind /data,/scratch \
--credentials-file ${credentials_file} \
--profile $profile \
--retry-count 3 \
-run ${part_dir}/part_${SLURM_ARRAY_TASK_ID}.txt"
+run ${part_dir}/part_\${SLURM_ARRAY_TASK_ID}.txt"
>&2 cat << EOF
--------------------------------------------------------------------------------
filter: ${filter}
input parquet: ${input_parquet}
destination: ${destination}
@@ -146,8 +147,12 @@ partition: ${partition}
time: ${time}
mem: ${mem}
-split dataset command: ${split_cmd}
-transfer command: ${transfer_cmd}
+split dataset command:
+$(printf "%s" "${split_cmd}")
+transfer command:
+$(printf "%s" "${transfer_cmd}")
--------------------------------------------------------------------------------
EOF
mkdir -p out
Loading