diff --git a/prep-parquet-for-s5cmd/fpart-db.py b/prep-parquet-for-s5cmd/fpart-db.py index beedcace24cee805967b3cff8e02d1d6ccf9e09c..11ecc9e238d9948b8d44f4b6a22ca4402688773a 100755 --- a/prep-parquet-for-s5cmd/fpart-db.py +++ b/prep-parquet-for-s5cmd/fpart-db.py @@ -34,16 +34,12 @@ def main(): ddf = dd.read_parquet(input_parquet) ddf = ddf.loc[ddf['path'].str.startswith(filter)].sort_values('path') - - ddf = ddf.loc[~ddf['mode'].str.startswith('d')].reset_index(drop=True) - - - ddf['group'] = np.floor(ddf.index/split_count).astype(int) + 1 + ddf = ddf.loc[~ddf['mode'].str.startswith('d')] ddf['cmd'] = ddf['path'].map(lambda x: create_sync_cmd(x, filter=filter, dest=dest), meta=str) - - - df = ddf[['group','cmd']].compute() + + df = ddf[['cmd']].compute().reset_index(drop=True) + df['group'] = np.floor(df.index/split_count).astype(int) + 1 os.makedirs(part_dir,exist_ok=True)