From 20090a6aa8d547c217ba49c6fb331debdf79c303 Mon Sep 17 00:00:00 2001
From: Matthew K Defenderfer <mdefende@uab.edu>
Date: Mon, 19 Aug 2024 16:33:24 -0500
Subject: [PATCH] Calculate group correctly by moving group assignment to an
 in-memory calculation

---
 prep-parquet-for-s5cmd/fpart-db.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/prep-parquet-for-s5cmd/fpart-db.py b/prep-parquet-for-s5cmd/fpart-db.py
index beedcac..11ecc9e 100755
--- a/prep-parquet-for-s5cmd/fpart-db.py
+++ b/prep-parquet-for-s5cmd/fpart-db.py
@@ -34,16 +34,12 @@ def main():
     ddf = dd.read_parquet(input_parquet)
     ddf = ddf.loc[ddf['path'].str.startswith(filter)].sort_values('path')
 
-
-    ddf = ddf.loc[~ddf['mode'].str.startswith('d')].reset_index(drop=True)
-
-
-    ddf['group'] = np.floor(ddf.index/split_count).astype(int) + 1
+    ddf = ddf.loc[~ddf['mode'].str.startswith('d')]
 
     ddf['cmd'] = ddf['path'].map(lambda x: create_sync_cmd(x, filter=filter, dest=dest), meta=str)
-
-
-    df = ddf[['group','cmd']].compute()
+    
+    df = ddf[['cmd']].compute().reset_index(drop=True)
+    df['group'] = np.floor(df.index/split_count).astype(int) + 1
 
 
     os.makedirs(part_dir,exist_ok=True)
-- 
GitLab