Create notebook to pickle list policy output files

This simplifies later report runs by consolidating the parsing and dataframe-creation operations into a single batch.

Create notebook to pickle list policy output files
This simplifies later report runs by consolidating the parsing and dataframe-creation operations into a single batch.
f5915ddc · John-Paul · 28617e9f · f5915ddc
Commit f5915ddc authored 2 years ago by John-Paul
--- a/pickle-list-policy-data.ipynb
+++ b/pickle-list-policy-data.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "073ef418",
+   "metadata": {},
+   "source": [
+    "# Convert raw policy lists into pickles\n",
+    "\n",
+    "Having the raw list-policy output data converted to pickles reduces storage space, centralizes parsing, and speeds up later processing and reporting.\n",
+    "\n",
+    "The script reads files that match the `glob_pattern` from the provided `dirname` and writes identical file names in pickled format to the `pickledir`, optionally filtering lines by the `line_regex_filter`.  If the default parameters aren't changed no files are read or written.\n",
+    "\n",
+    "Some parsing progress is available via the `verbose` flag.\n",
+    "\n",
+    "This converter assumes a policy show format defined in the [list-paths-external policy](https://code.rc.uab.edu/rc/gpfs-policy/-/blob/main/policy/list-path-external):\n",
+    "```\n",
+    "  SHOW ('|size='   || varchar(FILE_SIZE) ||\n",
+    "        '|kballoc='|| varchar(KB_ALLOCATED) ||\n",
+    "        '|access=' || varchar(ACCESS_TIME) ||\n",
+    "        '|create=' || varchar(CREATION_TIME) ||\n",
+    "        '|modify=' || varchar(MODIFICATION_TIME) ||\n",
+    "        '|uid='    || varchar(USER_ID) ||\n",
+    "        '|gid='    || varchar(GROUP_ID) ||\n",
+    "        '|heat='   || varchar(FILE_HEAT) ||\n",
+    "        '|pool='   || varchar(POOL_NAME) ||\n",
+    "        '|mode='   || varchar(MODE) ||\n",
+    "        '|misc='   || varchar(MISC_ATTRIBUTES) ||\n",
+    "        '|'\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "af015950",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import datetime\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "from urllib.parse import unquote\n",
+    "import sys\n",
+    "import os\n",
+    "import pathlib\n",
+    "import re"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3781a0d6",
+   "metadata": {},
+   "source": [
+    "## input vars"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "932707e6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dirname=\"\"  # directory to fine files to pickle\n",
+    "glob_pattern = \"*.gz\"  # file name glob pattern to match, can be file name for individual file\n",
+    "line_regex_filter = \".*\"   # regex to match lines of interest in file\n",
+    "pickledir=f\"{dirname}/pickles\"\n",
+    "\n",
+    "verbose = False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5599e260",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# parse files with read_csv optionally filtering specific lines via regex\n",
+    "\n",
+    "def parse_file(filename, pattern=\".*\"):\n",
+    "    \n",
+    "    gen = pd.read_csv(filename, sep='\\n', header=None, iterator=True)\n",
+    "    df =  pd.concat((x[x[0].str.contains(pattern, regex=True)] for x in gen), ignore_index=True)\n",
+    "\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6542cb23",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# parse rows according to the list-policy-external format\n",
+    "\n",
+    "def parse_rows(df):\n",
+    "    # split content on white space\n",
+    "    df=df.rename(columns={0:\"details\"})\n",
+    "    new=df[\"details\"].str.split(expand=True)\n",
+    "    \n",
+    "    # create a new dataframe and populate with parsed data\n",
+    "    df = pd.DataFrame()\n",
+    "\n",
+    "    # split attribuignoring filename= prefix\n",
+    "    df[\"showattr\"] = new[3].map(lambda x: re.sub(\"\\w+=\", \"\", unquote(x)))\n",
+    "    df[[\"ignore1\", \"size\", \"kballoc\", \"access\", \"create\", \"modify\", \n",
+    "          \"uid\", \"gid\", \"heat\", \"pool\", \"mode\", \"misc\", \"ignore2\"]] = df[\"showattr\"].str.split(\"|\", expand=True)\n",
+    "    df[\"path\"] = new[5].map(lambda x: unquote(x))\n",
+    "\n",
+    "    # drop temp columns\n",
+    "    df = df.drop([\"showattr\", \"ignore1\", \"ignore2\"], axis=1)\n",
+    "\n",
+    "    df.reset_index(drop=True, inplace=True)\n",
+    "\n",
+    "    df = set_types(df)\n",
+    "\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9730f207",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# convert data to native pandas types\n",
+    "def set_types(df):\n",
+    "    df[\"size\"] = df[\"size\"].astype('int64')\n",
+    "    df[\"kballoc\"] = df[\"kballoc\"].astype('int64')\n",
+    "    df[\"uid\"] = df[\"uid\"].astype('int64')\n",
+    "    df[\"gid\"] = df[\"gid\"].astype('int64')\n",
+    "    df[\"access\"] = df[\"access\"].astype('datetime64')\n",
+    "    df[\"create\"] = df[\"create\"].astype('datetime64')\n",
+    "    df[\"modify\"] = df[\"modify\"].astype('datetime64')\n",
+    "    \n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2ed6bdc8",
+   "metadata": {},
+   "source": [
+    "## Gather the files according to glob_pattern"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7297f0d2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dirpath = pathlib.Path(dirname)\n",
+    "\n",
+    "files = list()\n",
+    "for file in list(dirpath.glob(glob_pattern)):\n",
+    "    files.append(str(file))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e4929a0f",
+   "metadata": {},
+   "source": [
+    "## Read, parse and pickle files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2ab7f7f5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for file in files:\n",
+    "    if (verbose): print(f\"parse: {file}\")\n",
+    "    filename=os.path.basename(file)\n",
+    "    df = parse_rows(parse_file(file))\n",
+    "\n",
+    "    ## Write the pickled data\n",
+    "\n",
+    "    # only create dir if there is data to pickle\n",
+    "    if (len(parsedfiles) and not os.path.isdir(pickledir)):\n",
+    "        os.mkdir(pickledir)\n",
+    "\n",
+    "    if (verbose): print(f\"pickling: {file}\")\n",
+    "    parsedfiles[file].to_pickle(f\"{pickledir}/{file}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
+%% Cell type:markdown id:073ef418 tags:
+# Convert raw policy lists into pickles
+Having the raw list-policy output data converted to pickles reduces storage space, centralizes parsing, and speeds up later processing and reporting.
+The script reads files that match the `glob_pattern` from the provided `dirname` and writes identical file names in pickled format to the `pickledir`, optionally filtering lines by the `line_regex_filter`.  If the default parameters aren't changed no files are read or written.
+Some parsing progress is available via the `verbose` flag.
+This converter assumes a policy show format defined in the [list-paths-external policy](https://code.rc.uab.edu/rc/gpfs-policy/-/blob/main/policy/list-path-external):
+```
+  SHOW ('|size='   || varchar(FILE_SIZE) ||
+        '|kballoc='|| varchar(KB_ALLOCATED) ||
+        '|access=' || varchar(ACCESS_TIME) ||
+        '|create=' || varchar(CREATION_TIME) ||
+        '|modify=' || varchar(MODIFICATION_TIME) ||
+        '|uid='    || varchar(USER_ID) ||
+        '|gid='    || varchar(GROUP_ID) ||
+        '|heat='   || varchar(FILE_HEAT) ||
+        '|pool='   || varchar(POOL_NAME) ||
+        '|mode='   || varchar(MODE) ||
+        '|misc='   || varchar(MISC_ATTRIBUTES) ||
+        '|'
+```
+%% Cell type:code id:af015950 tags:
+``` 
+import datetime
+import pandas as pd
+import matplotlib.pyplot as plt
+from urllib.parse import unquote
+import sys
+import os
+import pathlib
+import re
+```
+%% Cell type:markdown id:3781a0d6 tags:
+## input vars
+%% Cell type:code id:932707e6 tags:
+``` 
dirname = ""  # directory to find files to pickle
glob_pattern = "*.gz"  # file name glob pattern to match, can be file name for individual file
line_regex_filter = ".*"  # regex to match lines of interest in file
# Join with os.path.join so an empty dirname yields the relative path
# "pickles" rather than "/pickles" at the filesystem root.
pickledir = os.path.join(dirname, "pickles")
verbose = False  # print per-file progress while parsing and pickling
+```
+%% Cell type:code id:5599e260 tags:
+``` 
# parse files line by line, optionally filtering specific lines via regex
def parse_file(filename, pattern=".*"):
    """Read a text file into a single-column DataFrame of raw lines.

    Args:
        filename: Path of the file to read. Files whose name ends in .gz,
            .bz2 or .xz are decompressed on the fly; anything else is read
            as plain text.
        pattern: Regular expression; only lines in which it is found
            (re.search semantics) are kept.

    Returns:
        A DataFrame with one column, labelled 0, holding each kept line
        without its line terminator. Blank lines are skipped. The frame is
        empty (not an error) when no line matches.
    """
    # Function-scope imports keep this cell self-contained.
    import bz2
    import gzip
    import lzma

    # Recent pandas releases reject sep='\n' in read_csv, so the file is read
    # directly and every line becomes one row.
    openers = {".gz": gzip.open, ".bz2": bz2.open, ".xz": lzma.open}
    suffix = os.path.splitext(str(filename))[1].lower()
    opener = openers.get(suffix, open)
    matcher = re.compile(pattern)

    with opener(filename, "rt", encoding="utf-8") as handle:
        kept = [
            text
            for text in (raw.rstrip("\r\n") for raw in handle)
            if text and matcher.search(text)
        ]

    # dtype=object keeps the .str accessor usable even when nothing matched.
    return pd.DataFrame({0: kept}, dtype="object")
+```
+%% Cell type:code id:6542cb23 tags:
+``` 
# parse rows according to the list-policy-external format
def parse_rows(df):
    """Turn raw list-policy lines into a typed, one-row-per-file DataFrame.

    Args:
        df: DataFrame whose column 0 holds one raw output line per row, as
            produced by parse_file.

    Returns:
        A DataFrame with the columns size, kballoc, access, create, modify,
        uid, gid, heat, pool, mode, misc and path, passed through set_types
        before being returned.
    """
    # split content on white space; field 3 is taken as the SHOW attribute
    # string and field 5 as the path
    fields = df.rename(columns={0: "details"})["details"].str.split(expand=True)

    # create a new dataframe and populate with parsed data
    parsed = pd.DataFrame()

    # URL-decode the attribute string and strip every "name=" prefix, leaving
    # only the "|"-separated values
    parsed["showattr"] = fields[3].map(lambda x: re.sub(r"\w+=", "", unquote(x)))
    # the leading and trailing "|" produce an empty first and last piece,
    # captured here as ignore1 / ignore2
    parsed[["ignore1", "size", "kballoc", "access", "create", "modify",
            "uid", "gid", "heat", "pool", "mode", "misc", "ignore2"]] = parsed["showattr"].str.split("|", expand=True)
    parsed["path"] = fields[5].map(lambda x: unquote(x))

    # drop temp columns
    parsed = parsed.drop(["showattr", "ignore1", "ignore2"], axis=1)
    parsed = parsed.reset_index(drop=True)

    return set_types(parsed)
+```
+%% Cell type:code id:9730f207 tags:
+``` 
# convert data to native pandas types
def set_types(df):
    """Convert the parsed string columns to numeric and datetime dtypes.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame containing the columns size, kballoc, uid, gid
            (integer strings) and access, create, modify (timestamp strings).

    Returns:
        The same DataFrame with the integer columns as int64 and the
        timestamp columns as pandas datetimes.
    """
    for column in ("size", "kballoc", "uid", "gid"):
        df[column] = df[column].astype("int64")

    # astype('datetime64') without a unit raises on pandas 2.x;
    # pd.to_datetime parses the strings on old and new releases alike.
    for column in ("access", "create", "modify"):
        df[column] = pd.to_datetime(df[column])

    return df
+```
+%% Cell type:markdown id:2ed6bdc8 tags:
+## Gather the files according to glob_pattern
+%% Cell type:code id:7297f0d2 tags:
+``` 
# Collect every path under dirname that matches glob_pattern, as plain strings
dirpath = pathlib.Path(dirname)
files = [str(match) for match in dirpath.glob(glob_pattern)]
+```
+%% Cell type:markdown id:e4929a0f tags:
+## Read, parse and pickle files
+%% Cell type:code id:2ab7f7f5 tags:
+``` 
# Parse each gathered file and write it to pickledir under the same base name.
for file in files:
    if verbose:
        print(f"parse: {file}")
    filename = os.path.basename(file)
    # pass the configured line filter through so it actually takes effect
    df = parse_rows(parse_file(file, line_regex_filter))

    ## Write the pickled data

    # created lazily, so the directory only appears once there is data to pickle
    os.makedirs(pickledir, exist_ok=True)

    if verbose:
        print(f"pickling: {file}")
    # use the base name: `file` is the full source path and would otherwise
    # be appended to pickledir as-is
    df.to_pickle(os.path.join(pickledir, filename))
+```