{
"cells": [
{
"cell_type": "markdown",
"id": "073ef418",
"metadata": {},
"source": [
"# Convert raw policy lists into pickles\n",
"\n",
"Having the raw list-policy output data converted to pickels reduces storage space, centralized and speeds later processing and reporting.\n",
"\n",
"The script reads files that match the `glob_pattern` from the provided `dirname` and writes identical file names in pickled format to the `pickledir`, optionally filtering lines by the `line_regex_filter`. If the default parameters aren't changed no files are read or written.\n",
"\n",
"Some parsing progress is available via the `verbose` flag.\n",
"\n",
"This converter assumes a policy show format defined in the [list-paths-external policy](https://code.rc.uab.edu/rc/gpfs-policy/-/blob/main/policy/list-path-external):\n",
"```\n",
" SHOW ('|size=' || varchar(FILE_SIZE) ||\n",
" '|kballoc='|| varchar(KB_ALLOCATED) ||\n",
" '|access=' || varchar(ACCESS_TIME) ||\n",
" '|create=' || varchar(CREATION_TIME) ||\n",
" '|modify=' || varchar(MODIFICATION_TIME) ||\n",
" '|uid=' || varchar(USER_ID) ||\n",
" '|gid=' || varchar(GROUP_ID) ||\n",
" '|heat=' || varchar(FILE_HEAT) ||\n",
" '|pool=' || varchar(POOL_NAME) ||\n",
" '|mode=' || varchar(MODE) ||\n",
" '|misc=' || varchar(MISC_ATTRIBUTES) ||\n",
" '|'\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "af015950",
"metadata": {},
"outputs": [],
"source": [
"import datetime\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from urllib.parse import unquote\n",
"import sys\n",
"import os\n",
"import pathlib\n",
"import re"
]
},
{
"cell_type": "markdown",
"id": "3781a0d6",
"metadata": {},
"source": [
"## input vars"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "932707e6",
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"dirname=\"data/list-20191520.list.gather-info.d\" # directory to fine files to pickle\n",
"glob_pattern = \"*.gz\" # file name glob pattern to match, can be file name for individual file\n",
"line_regex_filter = \".*\" # regex to match lines of interest in file\n",
"\n",
"verbose = True"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "833be559",
"metadata": {},
"outputs": [],
"source": [
"pickledir=f\"{dirname}/pickles\""
]
},
{
"cell_type": "markdown",
"id": "47ea1d93",
"metadata": {},
"source": [
"dirname=\"data/list-17404604.list.gather-info.d/\" # directory to fine files to pickle\n",
"glob_pattern = \"list-*.gz\" # file name glob pattern to match, can be file name for individual file\n",
"line_regex_filter = \".*\" # regex to match lines of interest in file\n",
"pickledir=f\"{dirname}/pickles\"\n",
"\n",
"verbose = True"
]
},
{
"cell_type": "markdown",
"id": "07ef745a",
"metadata": {},
"source": [
"dirname=\"data/list-16144464.list.gather-info.d/\" # directory to fine files to pickle\n",
"glob_pattern = \"list-*\" # file name glob pattern to match, can be file name for individual file\n",
"line_regex_filter = \".*\" # regex to match lines of interest in file\n",
"pickledir=f\"{dirname}/pickles\"\n",
"\n",
"verbose = True"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5599e260",
"metadata": {},
"outputs": [],
"source": [
"# parse files with read_csv optionally filtering specific lines via regex\n",
"\n",
"def parse_file(filename, pattern=\".*\"):\n",
" \n",
" gen = pd.read_csv(filename, sep='\\n', header=None, iterator=True)\n",
" df = pd.concat((x[x[0].str.contains(pattern, regex=True)] for x in gen), ignore_index=True)\n",
"\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6542cb23",
"metadata": {},
"outputs": [],
"source": [
"# parse rows according to the list-policy-external format\n",
"\n",
"def parse_rows(df):\n",
" # split content on white space\n",
" df=df.rename(columns={0:\"details\"})\n",
" new=df[\"details\"].str.split(expand=True)\n",
" \n",
" # create a new dataframe and populate with parsed data\n",
" df = pd.DataFrame()\n",
"\n",
" # split attribuignoring filename= prefix\n",
" df[\"showattr\"] = new[3].map(lambda x: re.sub(\"\\w+=\", \"\", unquote(x)))\n",
" df[[\"ignore1\", \"size\", \"kballoc\", \"access\", \"create\", \"modify\", \n",
" \"uid\", \"gid\", \"heat\", \"pool\", \"mode\", \"misc\", \"ignore2\"]] = df[\"showattr\"].str.split(\"|\", expand=True)\n",
" df[\"path\"] = new[5].map(lambda x: unquote(x))\n",
"\n",
" # drop temp columns\n",
" df = df.drop([\"showattr\", \"ignore1\", \"ignore2\"], axis=1)\n",
"\n",
" df.reset_index(drop=True, inplace=True)\n",
"\n",
" df = set_types(df)\n",
"\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9730f207",
"metadata": {},
"outputs": [],
"source": [
"# convert data to native pandas types\n",
"def set_types(df):\n",
" df[\"size\"] = df[\"size\"].astype('int64')\n",
" df[\"kballoc\"] = df[\"kballoc\"].astype('int64')\n",
" df[\"uid\"] = df[\"uid\"].astype('int64')\n",
" df[\"gid\"] = df[\"gid\"].astype('int64')\n",
" df[\"access\"] = df[\"access\"].astype('datetime64')\n",
" df[\"create\"] = df[\"create\"].astype('datetime64')\n",
" df[\"modify\"] = df[\"modify\"].astype('datetime64')\n",
" \n",
" return df"
]
},
{
"cell_type": "markdown",
"id": "2ed6bdc8",
"metadata": {},
"source": [
"## Gather the files according to glob_pattern"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7297f0d2",
"metadata": {},
"outputs": [],
"source": [
"dirpath = pathlib.Path(dirname)\n",
"\n",
"files = list()\n",
"for file in list(dirpath.glob(glob_pattern)):\n",
" files.append(str(file))"
]
},
{
"cell_type": "markdown",
"id": "e4929a0f",
"metadata": {},
"source": [
"## Read, parse and pickle files"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2ab7f7f5",
"metadata": {},
"outputs": [],
"source": [
"for file in files:\n",
" if (verbose): print(f\"parse: {file}\")\n",
" filename=os.path.basename(file)\n",
" df = parse_rows(parse_file(file))\n",
"\n",
" ## Write the pickled data\n",
"\n",
" # only create dir if there is data to pickle\n",
" if (not os.path.isdir(pickledir)):\n",
" os.mkdir(pickledir)\n",
"\n",
" if (verbose): print(f\"pickling: {filename}\")\n",
" df.to_pickle(f\"{pickledir}/{filename}\")"
]
}
],
"metadata": {
"language_info": {
"name": "python",
"pygments_lexer": "ipython3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}