From b3a99478976fbf54821902750efc2fab222085dc Mon Sep 17 00:00:00 2001
From: John-Paul Robinson <jpr@uab.edu>
Date: Fri, 26 Jul 2024 11:47:08 -0500
Subject: [PATCH] Add minor improvements to pickle script

Makes it slightly more resilient and forms the foundation for
the parquet notebook.
---
 pickle-list-policy-data.ipynb | 51 ++++++++++++++++++++++++++++++-----
 1 file changed, 45 insertions(+), 6 deletions(-)

diff --git a/pickle-list-policy-data.ipynb b/pickle-list-policy-data.ipynb
index 40518a5..9dee8d7 100644
--- a/pickle-list-policy-data.ipynb
+++ b/pickle-list-policy-data.ipynb
@@ -59,15 +59,54 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "932707e6",
-   "metadata": {},
+   "metadata": {
+    "tags": [
+     "parameters"
+    ]
+   },
    "outputs": [],
    "source": [
-    "dirname=\"\"  # directory to fine files to pickle\n",
+    "dirname=\"data/list-20191520.list.gather-info.d\"  # directory to find files to pickle\n",
     "glob_pattern = \"*.gz\"  # file name glob pattern to match, can be file name for individual file\n",
     "line_regex_filter = \".*\"   # regex to match lines of interest in file\n",
+    "\n",
+    "verbose = True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "833be559",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pickledir=f\"{dirname}/pickles\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "47ea1d93",
+   "metadata": {},
+   "source": [
+    "dirname=\"data/list-17404604.list.gather-info.d/\"  # directory to find files to pickle\n",
+    "glob_pattern = \"list-*.gz\"  # file name glob pattern to match, can be file name for individual file\n",
+    "line_regex_filter = \".*\"   # regex to match lines of interest in file\n",
+    "pickledir=f\"{dirname}/pickles\"\n",
+    "\n",
+    "verbose = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "07ef745a",
+   "metadata": {},
+   "source": [
+    "dirname=\"data/list-16144464.list.gather-info.d/\"  # directory to find files to pickle\n",
+    "glob_pattern = \"list-*\"  # file name glob pattern to match, can be file name for individual file\n",
+    "line_regex_filter = \".*\"   # regex to match lines of interest in file\n",
     "pickledir=f\"{dirname}/pickles\"\n",
     "\n",
-    "verbose = False"
+    "verbose = True"
    ]
   },
   {
@@ -185,11 +224,11 @@
     "    ## Write the pickled data\n",
     "\n",
     "    # only create dir if there is data to pickle\n",
-    "    if (len(parsedfiles) and not os.path.isdir(pickledir)):\n",
+    "    if (not os.path.isdir(pickledir)):\n",
     "        os.mkdir(pickledir)\n",
     "\n",
-    "    if (verbose): print(f\"pickling: {file}\")\n",
-    "    parsedfiles[file].to_pickle(f\"{pickledir}/{file}\")"
+    "    if (verbose): print(f\"pickling: {filename}\")\n",
+    "    df.to_pickle(f\"{pickledir}/{filename}\")"
    ]
   }
  ],
-- 
GitLab