{ "cells": [ { "cell_type": "markdown", "id": "073ef418", "metadata": {}, "source": [ "# Convert raw policy lists into pickles\n", "\n", "Having the raw list-policy output data converted to pickels reduces storage space, centralized and speeds later processing and reporting.\n", "\n", "The script reads files that match the `glob_pattern` from the provided `dirname` and writes identical file names in pickled format to the `pickledir`, optionally filtering lines by the `line_regex_filter`. If the default parameters aren't changed no files are read or written.\n", "\n", "Some parsing progress is available via the `verbose` flag.\n", "\n", "This converter assumes a policy show format defined in the [list-paths-external policy](https://code.rc.uab.edu/rc/gpfs-policy/-/blob/main/policy/list-path-external):\n", "```\n", " SHOW ('|size=' || varchar(FILE_SIZE) ||\n", " '|kballoc='|| varchar(KB_ALLOCATED) ||\n", " '|access=' || varchar(ACCESS_TIME) ||\n", " '|create=' || varchar(CREATION_TIME) ||\n", " '|modify=' || varchar(MODIFICATION_TIME) ||\n", " '|uid=' || varchar(USER_ID) ||\n", " '|gid=' || varchar(GROUP_ID) ||\n", " '|heat=' || varchar(FILE_HEAT) ||\n", " '|pool=' || varchar(POOL_NAME) ||\n", " '|mode=' || varchar(MODE) ||\n", " '|misc=' || varchar(MISC_ATTRIBUTES) ||\n", " '|'\n", "```" ] }, { "cell_type": "code", "execution_count": null, "id": "af015950", "metadata": {}, "outputs": [], "source": [ "import datetime\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "from urllib.parse import unquote\n", "import sys\n", "import os\n", "import pathlib\n", "import re" ] }, { "cell_type": "markdown", "id": "3781a0d6", "metadata": {}, "source": [ "## input vars" ] }, { "cell_type": "code", "execution_count": null, "id": "932707e6", "metadata": { "tags": [ "parameters" ] }, "outputs": [], "source": [ "dirname=\"data/list-20191520.list.gather-info.d\" # directory to fine files to pickle\n", "glob_pattern = \"*.gz\" # file name glob pattern to match, can be file name for individual file\n", "line_regex_filter = \".*\" # regex to match lines of interest in file\n", "\n", "verbose = True" ] }, { "cell_type": "code", "execution_count": null, "id": "833be559", "metadata": {}, "outputs": [], "source": [ "pickledir=f\"{dirname}/pickles\"" ] }, { "cell_type": "markdown", "id": "47ea1d93", "metadata": {}, "source": [ "dirname=\"data/list-17404604.list.gather-info.d/\" # directory to fine files to pickle\n", "glob_pattern = \"list-*.gz\" # file name glob pattern to match, can be file name for individual file\n", "line_regex_filter = \".*\" # regex to match lines of interest in file\n", "pickledir=f\"{dirname}/pickles\"\n", "\n", "verbose = True" ] }, { "cell_type": "markdown", "id": "07ef745a", "metadata": {}, "source": [ "dirname=\"data/list-16144464.list.gather-info.d/\" # directory to fine files to pickle\n", "glob_pattern = \"list-*\" # file name glob pattern to match, can be file name for individual file\n", "line_regex_filter = \".*\" # regex to match lines of interest in file\n", "pickledir=f\"{dirname}/pickles\"\n", "\n", "verbose = True" ] }, { "cell_type": "code", "execution_count": null, "id": "5599e260", "metadata": {}, "outputs": [], "source": [ "# parse files with read_csv optionally filtering specific lines via regex\n", "\n", "def parse_file(filename, pattern=\".*\"):\n", " \n", " gen = pd.read_csv(filename, sep='\\n', header=None, iterator=True)\n", " df = pd.concat((x[x[0].str.contains(pattern, regex=True)] for x in gen), ignore_index=True)\n", "\n", " return df" ] }, { "cell_type": "code", "execution_count": null, "id": "6542cb23", "metadata": {}, "outputs": [], "source": [ "# parse rows according to the list-policy-external format\n", "\n", "def parse_rows(df):\n", " # split content on white space\n", " df=df.rename(columns={0:\"details\"})\n", " new=df[\"details\"].str.split(expand=True)\n", " \n", " # create a new dataframe and populate with parsed data\n", " df = pd.DataFrame()\n", "\n", " # split attribuignoring filename= prefix\n", " df[\"showattr\"] = new[3].map(lambda x: re.sub(\"\\w+=\", \"\", unquote(x)))\n", " df[[\"ignore1\", \"size\", \"kballoc\", \"access\", \"create\", \"modify\", \n", " \"uid\", \"gid\", \"heat\", \"pool\", \"mode\", \"misc\", \"ignore2\"]] = df[\"showattr\"].str.split(\"|\", expand=True)\n", " df[\"path\"] = new[5].map(lambda x: unquote(x))\n", "\n", " # drop temp columns\n", " df = df.drop([\"showattr\", \"ignore1\", \"ignore2\"], axis=1)\n", "\n", " df.reset_index(drop=True, inplace=True)\n", "\n", " df = set_types(df)\n", "\n", " return df" ] }, { "cell_type": "code", "execution_count": null, "id": "9730f207", "metadata": {}, "outputs": [], "source": [ "# convert data to native pandas types\n", "def set_types(df):\n", " df[\"size\"] = df[\"size\"].astype('int64')\n", " df[\"kballoc\"] = df[\"kballoc\"].astype('int64')\n", " df[\"uid\"] = df[\"uid\"].astype('int64')\n", " df[\"gid\"] = df[\"gid\"].astype('int64')\n", " df[\"access\"] = df[\"access\"].astype('datetime64')\n", " df[\"create\"] = df[\"create\"].astype('datetime64')\n", " df[\"modify\"] = df[\"modify\"].astype('datetime64')\n", " \n", " return df" ] }, { "cell_type": "markdown", "id": "2ed6bdc8", "metadata": {}, "source": [ "## Gather the files according to glob_pattern" ] }, { "cell_type": "code", "execution_count": null, "id": "7297f0d2", "metadata": {}, "outputs": [], "source": [ "dirpath = pathlib.Path(dirname)\n", "\n", "files = list()\n", "for file in list(dirpath.glob(glob_pattern)):\n", " files.append(str(file))" ] }, { "cell_type": "markdown", "id": "e4929a0f", "metadata": {}, "source": [ "## Read, parse and pickle files" ] }, { "cell_type": "code", "execution_count": null, "id": "2ab7f7f5", "metadata": {}, "outputs": [], "source": [ "for file in files:\n", " if (verbose): print(f\"parse: {file}\")\n", " filename=os.path.basename(file)\n", " df = parse_rows(parse_file(file))\n", "\n", " ## Write the pickled data\n", "\n", " # only create dir if there is data to pickle\n", " if (not os.path.isdir(pickledir)):\n", " os.mkdir(pickledir)\n", "\n", " if (verbose): print(f\"pickling: {filename}\")\n", " df.to_pickle(f\"{pickledir}/{filename}\")" ] } ], "metadata": { "language_info": { "name": "python", "pygments_lexer": "ipython3" } }, "nbformat": 4, "nbformat_minor": 5 }