pickle-list-policy-data.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "073ef418",
   "metadata": {},
   "source": [
    "# Convert raw policy lists into pickles\n",
    "\n",
    "Converting the raw list-policy output data to pickles reduces storage space, centralizes parsing, and speeds up later processing and reporting.\n",
    "\n",
    "The script reads files that match the `glob_pattern` from the provided `dirname` and writes identical file names in pickled format to the `pickledir`, optionally filtering lines by the `line_regex_filter`.  If the default parameters aren't changed no files are read or written.\n",
    "\n",
    "Some parsing progress is available via the `verbose` flag.\n",
    "\n",
    "This converter assumes a policy show format defined in the [list-path-external policy](https://code.rc.uab.edu/rc/gpfs-policy/-/blob/main/policy/list-path-external):\n",
    "```\n",
    "  SHOW ('|size='   || varchar(FILE_SIZE) ||\n",
    "        '|kballoc='|| varchar(KB_ALLOCATED) ||\n",
    "        '|access=' || varchar(ACCESS_TIME) ||\n",
    "        '|create=' || varchar(CREATION_TIME) ||\n",
    "        '|modify=' || varchar(MODIFICATION_TIME) ||\n",
    "        '|uid='    || varchar(USER_ID) ||\n",
    "        '|gid='    || varchar(GROUP_ID) ||\n",
    "        '|heat='   || varchar(FILE_HEAT) ||\n",
    "        '|pool='   || varchar(POOL_NAME) ||\n",
    "        '|mode='   || varchar(MODE) ||\n",
    "        '|misc='   || varchar(MISC_ATTRIBUTES) ||\n",
    "        '|'\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "af015950",
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from urllib.parse import unquote\n",
    "import sys\n",
    "import os\n",
    "import pathlib\n",
    "import re"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3781a0d6",
   "metadata": {},
   "source": [
    "## input vars"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "932707e6",
   "metadata": {
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "dirname=\"data/list-20191520.list.gather-info.d\"  # directory to find files to pickle\n",
    "glob_pattern = \"*.gz\"  # file name glob pattern to match, can be file name for individual file\n",
    "line_regex_filter = \".*\"   # regex to match lines of interest in file\n",
    "\n",
    "verbose = True  # print a progress line for each file parsed and pickled"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "833be559",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pickles are written to a \"pickles\" subdirectory of the input directory\n",
    "pickledir = f\"{dirname}/pickles\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "47ea1d93",
   "metadata": {},
   "source": [
    "dirname=\"data/list-17404604.list.gather-info.d/\"  # directory to find files to pickle\n",
    "glob_pattern = \"list-*.gz\"  # file name glob pattern to match, can be file name for individual file\n",
    "line_regex_filter = \".*\"   # regex to match lines of interest in file\n",
    "pickledir=f\"{dirname}/pickles\"\n",
    "\n",
    "verbose = True"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "07ef745a",
   "metadata": {},
   "source": [
    "dirname=\"data/list-16144464.list.gather-info.d/\"  # directory to find files to pickle\n",
    "glob_pattern = \"list-*\"  # file name glob pattern to match, can be file name for individual file\n",
    "line_regex_filter = \".*\"   # regex to match lines of interest in file\n",
    "pickledir=f\"{dirname}/pickles\"\n",
    "\n",
    "verbose = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5599e260",
   "metadata": {},
   "outputs": [],
   "source": [
    "# read a list-policy file line by line, optionally keeping only lines that match a regex\n",
    "\n",
    "def parse_file(filename, pattern=\".*\"):\n",
    "    \"\"\"Read a raw list-policy file into a one-column DataFrame of text lines.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    filename : str or path-like\n",
    "        File to read. Names ending in ``.gz`` are read as gzip, anything\n",
    "        else is read as plain text.\n",
    "    pattern : str, default \".*\"\n",
    "        Regular expression; only lines in which it is found are kept.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        A single column labelled ``0`` holding one kept line per row.\n",
    "    \"\"\"\n",
    "    import gzip  # local import: only this function needs it\n",
    "\n",
    "    # read_csv with a newline separator is rejected by newer pandas releases,\n",
    "    # so the lines are read directly instead of going through the CSV parser\n",
    "    opener = gzip.open if str(filename).endswith(\".gz\") else open\n",
    "    matcher = re.compile(pattern)\n",
    "\n",
    "    # the with-block guarantees the file handle is closed\n",
    "    with opener(filename, \"rt\", encoding=\"utf-8\") as handle:\n",
    "        lines = [\n",
    "            text\n",
    "            for text in (raw.rstrip(\"\\r\\n\") for raw in handle)\n",
    "            if text.strip() and matcher.search(text)  # blank lines are skipped\n",
    "        ]\n",
    "\n",
    "    return pd.DataFrame({0: lines})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6542cb23",
   "metadata": {},
   "outputs": [],
   "source": [
    "# parse rows according to the policy SHOW format\n",
    "\n",
    "def parse_rows(df):\n",
    "    \"\"\"Split raw list-policy lines into attribute columns plus the file path.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Frame whose column ``0`` holds one raw text line per row.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        Columns size, kballoc, access, create, modify, uid, gid, heat, pool,\n",
    "        mode, misc and path, after conversion by set_types.\n",
    "    \"\"\"\n",
    "    # split content on white space\n",
    "    df = df.rename(columns={0: \"details\"})\n",
    "    new = df[\"details\"].str.split(expand=True)\n",
    "\n",
    "    # create a new dataframe and populate with parsed data\n",
    "    df = pd.DataFrame()\n",
    "\n",
    "    # URL-decode the 4th token and strip every \"name=\" prefix so that only the\n",
    "    # \"|\"-separated values remain (raw string: \\w is not a valid str escape)\n",
    "    key_prefix = re.compile(r\"\\w+=\")\n",
    "    df[\"showattr\"] = new[3].map(lambda x: key_prefix.sub(\"\", unquote(x)))\n",
    "\n",
    "    # the leading and trailing \"|\" yield an empty first and last piece\n",
    "    df[[\"ignore1\", \"size\", \"kballoc\", \"access\", \"create\", \"modify\",\n",
    "        \"uid\", \"gid\", \"heat\", \"pool\", \"mode\", \"misc\", \"ignore2\"]] = df[\"showattr\"].str.split(\"|\", expand=True)\n",
    "\n",
    "    # the path is the 6th whitespace-separated token, URL-decoded\n",
    "    df[\"path\"] = new[5].map(unquote)\n",
    "\n",
    "    # drop temp columns\n",
    "    df = df.drop([\"showattr\", \"ignore1\", \"ignore2\"], axis=1)\n",
    "\n",
    "    df = df.reset_index(drop=True)\n",
    "\n",
    "    return set_types(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9730f207",
   "metadata": {},
   "outputs": [],
   "source": [
    "# convert data to native pandas types\n",
    "def set_types(df):\n",
    "    \"\"\"Convert the parsed string columns to integer and datetime dtypes.\n",
    "\n",
    "    The frame is modified in place and also returned.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Must contain the columns size, kballoc, uid, gid, access, create\n",
    "        and modify.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        The same frame with int64 and datetime64[ns] columns.\n",
    "    \"\"\"\n",
    "    for column in (\"size\", \"kballoc\", \"uid\", \"gid\"):\n",
    "        df[column] = df[column].astype(\"int64\")\n",
    "\n",
    "    # an explicit unit is required: pandas 2.x rejects the unit-less 'datetime64'\n",
    "    for column in (\"access\", \"create\", \"modify\"):\n",
    "        df[column] = df[column].astype(\"datetime64[ns]\")\n",
    "\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2ed6bdc8",
   "metadata": {},
   "source": [
    "## Gather the files according to glob_pattern"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7297f0d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# resolve the input directory and collect every glob match as a plain string path\n",
    "dirpath = pathlib.Path(dirname)\n",
    "files = [str(match) for match in dirpath.glob(glob_pattern)]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e4929a0f",
   "metadata": {},
   "source": [
    "## Read, parse and pickle files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2ab7f7f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "for file in files:\n",
    "    if verbose:\n",
    "        print(f\"parse: {file}\")\n",
    "    filename = os.path.basename(file)\n",
    "\n",
    "    # honour the line_regex_filter parameter when reading the file\n",
    "    raw_lines = parse_file(file, line_regex_filter)\n",
    "    if raw_lines.empty:\n",
    "        # nothing matched the filter, so there is nothing to parse or pickle\n",
    "        if verbose:\n",
    "            print(f\"skip (no matching lines): {file}\")\n",
    "        continue\n",
    "    df = parse_rows(raw_lines)\n",
    "\n",
    "    ## Write the pickled data\n",
    "\n",
    "    # only create dir if there is data to pickle\n",
    "    os.makedirs(pickledir, exist_ok=True)\n",
    "\n",
    "    if verbose:\n",
    "        print(f\"pickling: {filename}\")\n",
    "    df.to_pickle(f\"{pickledir}/{filename}\")"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}