diff --git a/pickle-list-policy-data.ipynb b/pickle-list-policy-data.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..40518a5632153228c6140b6a63ee093ddf89b95f --- /dev/null +++ b/pickle-list-policy-data.ipynb @@ -0,0 +1,204 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "073ef418", + "metadata": {}, + "source": [ + "# Convert raw policy lists into pickles\n", + "\n", + "Having the raw list-policy output data converted to pickles reduces storage space, centralizes the data, and speeds up later processing and reporting.\n", + "\n", + "The script reads files that match the `glob_pattern` from the provided `dirname` and writes identical file names in pickled format to the `pickledir`, optionally filtering lines by the `line_regex_filter`. If the default parameters aren't changed, no files are read or written.\n", + "\n", + "Some parsing progress is available via the `verbose` flag.\n", + "\n", + "This converter assumes a policy `SHOW` format defined in the [list-path-external policy](https://code.rc.uab.edu/rc/gpfs-policy/-/blob/main/policy/list-path-external):\n", + "```\n", + " SHOW ('|size=' || varchar(FILE_SIZE) ||\n", + " '|kballoc='|| varchar(KB_ALLOCATED) ||\n", + " '|access=' || varchar(ACCESS_TIME) ||\n", + " '|create=' || varchar(CREATION_TIME) ||\n", + " '|modify=' || varchar(MODIFICATION_TIME) ||\n", + " '|uid=' || varchar(USER_ID) ||\n", + " '|gid=' || varchar(GROUP_ID) ||\n", + " '|heat=' || varchar(FILE_HEAT) ||\n", + " '|pool=' || varchar(POOL_NAME) ||\n", + " '|mode=' || varchar(MODE) ||\n", + " '|misc=' || varchar(MISC_ATTRIBUTES) ||\n", + " '|'\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af015950", + "metadata": {}, + "outputs": [], + "source": [ + "import datetime\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from urllib.parse import unquote\n", + "import sys\n", + "import os\n", + "import pathlib\n", + "import re" + ] + }, + { + "cell_type": "markdown", + "id": 
"3781a0d6", + "metadata": {}, + "source": [ + "## input vars" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "932707e6", + "metadata": {}, + "outputs": [], + "source": [ + "dirname = \"\"  # directory to find files to pickle\n", + "glob_pattern = \"*.gz\"  # file name glob pattern to match, can be file name for individual file\n", + "line_regex_filter = \".*\"  # regex to match lines of interest in file\n", + "# os.path.join keeps an empty dirname from resolving to the filesystem root (\"/pickles\")\n", + "pickledir = os.path.join(dirname, \"pickles\")\n", + "\n", + "verbose = False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5599e260", + "metadata": {}, + "outputs": [], + "source": [ + "# parse files with read_csv optionally filtering specific lines via regex\n", + "\n", + "import csv\n", + "\n", + "def parse_file(filename, pattern=\".*\", chunksize=1_000_000):\n", + "    \"\"\"Read `filename` one line per row and keep only the lines matching `pattern`.\n", + "\n", + "    Returns a DataFrame with a single string column named 0. Compression is\n", + "    inferred by pandas from the file name. `chunksize` is the number of lines\n", + "    filtered at a time.\n", + "    \"\"\"\n", + "    # Newer pandas releases raise ValueError for sep='\\n', so split on a control\n", + "    # character that is not expected in the URL-encoded policy output instead; every\n", + "    # line then lands in a single field. QUOTE_NONE keeps quote characters literal.\n", + "    gen = pd.read_csv(filename, sep=\"\\x1f\", header=None, dtype=str, na_filter=False,\n", + "                      quoting=csv.QUOTE_NONE, chunksize=chunksize)\n", + "    # filter chunk by chunk so only the matching lines are kept in memory\n", + "    df = pd.concat((x[x[0].str.contains(pattern, regex=True)] for x in gen), ignore_index=True)\n", + "\n", + "    return df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6542cb23", + "metadata": {}, + "outputs": [], + "source": [ + "# parse rows according to the list-policy-external format\n", + "\n", + "def parse_rows(df):\n", + "    \"\"\"Split the raw lines from parse_file into one typed column per SHOW attribute plus the path.\"\"\"\n", + "    # split content on white space\n", + "    df = df.rename(columns={0: \"details\"})\n", + "    new = df[\"details\"].str.split(expand=True)\n", + "\n", + "    # create a new dataframe and populate with parsed data\n", + "    df = pd.DataFrame()\n", + "\n", + "    # split the attribute field on \"|\", dropping the name= prefix of every attribute\n", + "    df[\"showattr\"] = new[3].map(lambda x: re.sub(r\"\\w+=\", \"\", unquote(x)))\n", + "    df[[\"ignore1\", \"size\", \"kballoc\", \"access\", \"create\", \"modify\",\n", + "        \"uid\", \"gid\", \"heat\", \"pool\", \"mode\", \"misc\", \"ignore2\"]] = df[\"showattr\"].str.split(\"|\", expand=True)\n", + "    df[\"path\"] = new[5].map(unquote)\n", + "\n", + "    # drop temp columns\n", + "    df = df.drop([\"showattr\", \"ignore1\", \"ignore2\"], axis=1)\n", + "\n", + "    df.reset_index(drop=True, inplace=True)\n", + "\n", + "    df = set_types(df)\n", + "\n", + "    return df" + ] + },
 + { + "cell_type": "code", + "execution_count": null, + "id": "9730f207", + "metadata": {}, + "outputs": [], + "source": [ + "# convert data to native pandas types\n", + "def set_types(df):\n", + "    \"\"\"Cast the size/owner columns to int64 and the timestamp columns to datetime64[ns].\"\"\"\n", + "    for col in (\"size\", \"kballoc\", \"uid\", \"gid\"):\n", + "        df[col] = df[col].astype(\"int64\")\n", + "    # pandas 2.x rejects the unit-less 'datetime64' dtype, so name the unit explicitly\n", + "    for col in (\"access\", \"create\", \"modify\"):\n", + "        df[col] = df[col].astype(\"datetime64[ns]\")\n", + "\n", + "    return df" + ] + }, + { + "cell_type": "markdown", + "id": "2ed6bdc8", + "metadata": {}, + "source": [ + "## Gather the files according to glob_pattern" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7297f0d2", + "metadata": {}, + "outputs": [], + "source": [ + "dirpath = pathlib.Path(dirname)\n", + "\n", + "# An empty dirname would make pathlib scan the current directory; with the\n", + "# default parameters nothing should be read, so gather no files in that case.\n", + "files = sorted(str(file) for file in dirpath.glob(glob_pattern)) if dirname else []" + ] + }, + { + "cell_type": "markdown", + "id": "e4929a0f", + "metadata": {}, + "source": [ + "## Read, parse and pickle files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ab7f7f5", + "metadata": {}, + "outputs": [], + "source": [ + "for file in files:\n", + "    if verbose:\n", + "        print(f\"parse: {file}\")\n", + "    lines = parse_file(file, line_regex_filter)\n", + "    if lines.empty:\n", + "        # nothing matched line_regex_filter, so there is nothing to parse or pickle\n", + "        continue\n", + "    df = parse_rows(lines)\n", + "\n", + "    ## Write the pickled data\n", + "\n", + "    # only create dir if there is data to pickle\n", + "    os.makedirs(pickledir, exist_ok=True)\n", + "\n", + "    # write under the input's base name; the full input path is not a valid name inside pickledir\n", + "    filename = os.path.basename(file)\n", + "    if verbose:\n", + "        print(f\"pickling: {file}\")\n", + "    df.to_pickle(os.path.join(pickledir, filename))" + ] + } + ], + "metadata": { + "language_info": { + "name": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}