{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Notebook to explore parsing of the gpfs policy outputs\n",
"\n",
"This is a collection of cells to understand data.\n",
"No particular endpoint in mind."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is the format of each line in the policy output:\n",
"\n",
" 5001:000fffffffffffff:0000000000004741:4b8f012b:0:2c172b:10002:0:40!basedir/path/to/file:13!scratch_tier1;253!|size=444|kballoc=0|access=2022-01-01 06:58:37.177440|create=2022-01-01 06:21:33.356110|modify=2022-01-01 06:23:47.011273|uid=10973|gid=10973|heat=+0.00000000000000E+000|pool=scratch_tier1|path=/rootdir/basedir/path/to/file|misc=FAu|"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"file=\"data/mmapplypolicy.61746.962D9400.list.no_extern_list_list-30day-with-excludes_slurm-12551165_2022-03-03-04:00:09\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"file=\"data/mmapplypolicy.54197.413B7AB5.list.no_extern_list_list-only-temporary-scratch_slurm-12790116_2022-03-14-18:47:51\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"file=\"data/mmapplypolicy.120904.9DBFF7E6.list.no_extern_list_list-30day-with-excludes_slurm-13113652_2022-04-05-04:00:28\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"file=\"data/mmapplypolicy.35838.667249E1.list.no_extern_list_list-30day-with-excludes_slurm-15685457_2022-08-23-04:00:23\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"file=\"data/mmapplypolicy.41557.67790FB6.list.no_extern_list_list-path_slurm-15844227_2022-08-29-13:24:52\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Parser functions\n",
"\n",
"First we define the structure of the file, then the columns we want to use."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fields=['ignore', 'size', 'kballoc', 'atime', 'ctime', 'mtime', 'uid', 'gid', 'heat', 'pool', 'path', 'misc']\n",
"\n",
"usecols=['size', 'kballoc', 'atime', 'ctime', 'mtime', 'uid', 'gid', 'heat', 'pool', 'path', 'misc']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def splitter(x):\n",
"    '''\n",
"    Return the value part of a single ``name=value`` field.\n",
"\n",
"    Only the first ``=`` separates name from value, so any later ``=``\n",
"    (for example inside a path) stays in the returned value.\n",
"\n",
"    Raises IndexError when the field contains no ``=``.\n",
"    '''\n",
"    # No print here: as a read_csv converter this runs once for every\n",
"    # field of every row, so per-call output floods the notebook.\n",
"    return x.split(\"=\", 1)[1]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Set up a splitters dictionary to process all the used fields with the splitter function.\n",
"https://realpython.com/python-defaultdict/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Map every used column name to the same converter function.\n",
"splitters = {name: splitter for name in usecols}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"# The list file is assumed to have no header row (every line is a record in the\n",
"# format shown above). header=None keeps the first record; header=0 together\n",
"# with names= would consume that line as a header to be replaced.\n",
"df = pd.read_csv(file,\n",
"                 lineterminator='\\n',\n",
"                 sep=\"|\", header=None,\n",
"                 #on_bad_lines=\"warn\",\n",
"                 index_col=False,  # each record ends with a trailing '|'\n",
"                 #nrows=1000000,\n",
"                 names=fields,\n",
"                 usecols=usecols,\n",
"                 converters=splitters,\n",
"                 parse_dates=['atime', 'ctime', 'mtime'],\n",
"                 )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.info()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Clean up data types for numeric values"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for intcol in ['size', 'kballoc', 'uid', 'gid']:\n",
" df[intcol] = df[intcol].astype(\"int\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.head(3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick summary of the total storage allocated to, and used by, the 30+ day files"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df[\"kballoc\"].sum()/1024"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df[\"size\"].sum()/1024/1024"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df[\"atime\"].min()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df[[\"atime\",\"uid\"]].sort_values(by=\"atime\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df[[\"uid\",\"size\"]].groupby(\"uid\").sum()/1000/1000/1000/1000"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"(df[[\"uid\",\"size\"]].groupby(\"uid\").sum()/1000/1000/1000/1000).sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df[\"atime\"].sort_values().head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df[\"uid\"].head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df[\"misc\"].unique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df[\"isfile\"]=df[\"misc\"].str.contains('F')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(df[\"uid\"].unique())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df[\"uid\"].unique()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get usernames from uid values via the pwd password db iteration module https://stackoverflow.com/a/421670/8928529"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pwd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pwd.getpwuid(12137)[0].split(\":\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def getuser(uid):\n",
"    '''\n",
"    Resolve a numeric uid to its login name via the password database.\n",
"\n",
"    uid may be anything int() accepts (int, numeric string, numpy integer).\n",
"    When the uid has no password database entry, the uid itself is returned\n",
"    as a string instead of raising KeyError.\n",
"    '''\n",
"    try:\n",
"        return pwd.getpwuid(int(uid)).pw_name\n",
"    except KeyError:\n",
"        return str(int(uid))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"getuser(10973)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# add new column for resolved uids\n",
"df[\"uname\"]=\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# set uname for uid\n",
"uid_names = {}\n",
"for uid in sorted(df[\"uid\"].unique()):\n",
"    try:\n",
"        uname = pwd.getpwuid(int(uid)).pw_name\n",
"    except KeyError:\n",
"        # no password database entry for this uid: fall back to the number\n",
"        uname = str(uid)\n",
"    print(\"uid: {} name: {}\".format(uid, uname))\n",
"    uid_names[uid] = uname\n",
"# one vectorized lookup instead of a full boolean mask per uid\n",
"df[\"uname\"] = df[\"uid\"].map(uid_names)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df[df[\"uid\"]==10005]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sorted(df[\"heat\"].unique())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df[\"path\"] = df[\"path\"].astype(\"str\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Split each path on \"/\" into at most five pieces; for an absolute path piece 0\n",
"# is the empty string before the leading \"/\" and piece 4 keeps the remainder.\n",
"path_parts = df[\"path\"].str.split(\"/\", n=4, expand=True)\n",
"# assign() overwrites on re-run instead of appending duplicate columns\n",
"df = df.assign(fs=path_parts[1], scratchdir=path_parts[3], filename=path_parts[4])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = df.rename(columns={\"sratchdir\": \"scratchdir\"})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"userdata = df[[\"scratchdir\", \"size\", \"kballoc\", \"isfile\"]].groupby([\"scratchdir\"]).sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"userdata"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"userdata[\"size\"]/1000/1000/1000"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# pieces 3 and 4 of the path split (the ones stored as scratchdir and filename)\n",
"df[\"path\"].str.split(\"/\", n=4, expand=True)[[3,4]]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# all pieces of the path split, at most five per path\n",
"df[\"path\"].str.split(\"/\", n=4, expand=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bytesdays=df[[\"atime\",\"size\"]]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bd=bytesdays.set_index(\"atime\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bd=bd.resample('D').sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Running total of the daily sizes; using the \"size\" column explicitly keeps\n",
"# this cell re-runnable once bd has more than one column.\n",
"bd[\"sum\"]=bd[\"size\"].cumsum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bd[:\"2022-02-15\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Select the two columns explicitly so the unpacking still works after further\n",
"# columns (e.g. \"gb\") have been added to bd.\n",
"# NOTE(review): gb receives the total of the cumulative \"sum\" column, which is\n",
"# not converted to GB here - confirm the intent.\n",
"size, gb = bd.loc[bd[\"size\"]>0, [\"size\", \"sum\"]].loc[:\"2022-01-01\"].sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"gb"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bd.loc[:\"2021-12-31\"].sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bd.loc[:\"2022-01-01\"].sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bd.loc[\"2022-01-01\":]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bd[bd[\"size\"]>0]/1024/1024/1024 #.plot()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bd[\"gb\"] = bd[\"sum\"]/1024/1024/1024"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bd[\"gb\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"b2d=bd[\"2021-10-01\":]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"1024*1024*1024*1024"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bd7=b2d[[\"gb\"]].rolling(7, center=True).sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Plot the daily values and the centered 7-day rolling sum\n",
"fig, ax = plt.subplots()\n",
"#ax.plot(kW, marker='.', markersize=2, color='gray', linestyle='None', label='Hourly Average')\n",
"ax.plot(b2d[\"gb\"], color='brown', linewidth=2, label='1-day Average')\n",
"# bd7 comes from rolling(7, center=True).sum(), so the legend names it a sum\n",
"ax.plot(bd7[\"gb\"], color='black', linewidth=1, label='Trend (7 day Rolling Sum)')\n",
"ax.legend()\n",
"ax.set_ylabel('Size (GBytes)')\n",
"ax.set_title('Cheaha Trends in Scratch Usage');"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"language_info": {
"name": "python",
"pygments_lexer": "ipython3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}