diff --git a/prep-parquet-for-s5cmd/fpart-db.py b/prep-parquet-for-s5cmd/fpart-db.py index 3e5e6c23bc7c01b21dca66de71785eda952995b0..beedcace24cee805967b3cff8e02d1d6ccf9e09c 100755 --- a/prep-parquet-for-s5cmd/fpart-db.py +++ b/prep-parquet-for-s5cmd/fpart-db.py @@ -27,8 +27,8 @@ def main(): split_count = args.split_count part_dir = args.partition_dir input_parquet = args.input_parquet - dest = re.sub(r'/$','',args.destination) - filter = re.sub(r'/$','',args.filter) + dest = re.sub(r'/*$','',args.destination) + filter = re.sub(r'/*$','',args.filter) ddf = dd.read_parquet(input_parquet) @@ -38,7 +38,7 @@ def main(): ddf = ddf.loc[~ddf['mode'].str.startswith('d')].reset_index(drop=True) - ddf['group'] = np.floor(ddf.index/split_count).astype(int) + ddf['group'] = np.floor(ddf.index/split_count).astype(int) + 1 ddf['cmd'] = ddf['path'].map(lambda x: create_sync_cmd(x, filter=filter, dest=dest), meta=str) @@ -62,28 +62,5 @@ def main(): for value in values: f.write(f"{value}\n") - - array_sh = f"""#!/bin/bash - # - #SBATCH --job-name=s5-array-%a - #SBATCH --partition=amd-hdr100 - #SBATCH --ntasks=8 - #SBATCH --mem=16G - #SBATCH --time=02:00:00 - #SBATCH --output=out/%A-%a.out - #SBATCH --error=err/%A-%a.err - #SBATCH --array=0-{df['group'].max()}%10 - - module load Anaconda3 - conda activate s3 - - s5cmd --nworkers 8 --endpoint-url https://s3.lts.rc.uab.edu run {part_dir}/part_${{SLURM_ARRAY_TASK_ID}}.txt - """ - - - with open('s5cmd_array.sh','wt') as f: - f.write(array_sh) - - if __name__ == "__main__": main()