Skip to content
Snippets Groups Projects
Commit 20090a6a authored by Matthew K Defenderfer
Browse files

correctly calculate group and move group to in-memory calculation

parent 464ca4c3
No related branches found
No related tags found
1 merge request: !9 Draft: Partition parquet dataset for sync with s5cmd
...@@ -34,16 +34,12 @@ def main(): ...@@ -34,16 +34,12 @@ def main():
ddf = dd.read_parquet(input_parquet) ddf = dd.read_parquet(input_parquet)
ddf = ddf.loc[ddf['path'].str.startswith(filter)].sort_values('path') ddf = ddf.loc[ddf['path'].str.startswith(filter)].sort_values('path')
ddf = ddf.loc[~ddf['mode'].str.startswith('d')]
ddf = ddf.loc[~ddf['mode'].str.startswith('d')].reset_index(drop=True)
ddf['group'] = np.floor(ddf.index/split_count).astype(int) + 1
ddf['cmd'] = ddf['path'].map(lambda x: create_sync_cmd(x, filter=filter, dest=dest), meta=str) ddf['cmd'] = ddf['path'].map(lambda x: create_sync_cmd(x, filter=filter, dest=dest), meta=str)
df = ddf[['cmd']].compute().reset_index(drop=True)
df = ddf[['group','cmd']].compute() df['group'] = np.floor(df.index/split_count).astype(int) + 1
os.makedirs(part_dir,exist_ok=True) os.makedirs(part_dir,exist_ok=True)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment