{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Notebook to explore parsing of the GPFS policy outputs\n", "\n", "This is a collection of cells to understand the data.\n", "No particular endpoint in mind." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This is the format of each line in the policy output:\n", "\n", "    5001:000fffffffffffff:0000000000004741:4b8f012b:0:2c172b:10002:0:40!basedir/path/to/file:13!scratch_tier1;253!|size=444|kballoc=0|access=2022-01-01 06:58:37.177440|create=2022-01-01 06:21:33.356110|modify=2022-01-01 06:23:47.011273|uid=10973|gid=10973|heat=+0.00000000000000E+000|pool=scratch_tier1|path=/rootdir/basedir/path/to/file|misc=FAu|" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "file=\"data/mmapplypolicy.61746.962D9400.list.no_extern_list_list-30day-with-excludes_slurm-12551165_2022-03-03-04:00:09\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "file=\"data/mmapplypolicy.54197.413B7AB5.list.no_extern_list_list-only-temporary-scratch_slurm-12790116_2022-03-14-18:47:51\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "file=\"data/mmapplypolicy.120904.9DBFF7E6.list.no_extern_list_list-30day-with-excludes_slurm-13113652_2022-04-05-04:00:28\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "file=\"data/mmapplypolicy.35838.667249E1.list.no_extern_list_list-30day-with-excludes_slurm-15685457_2022-08-23-04:00:23\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "file=\"data/mmapplypolicy.41557.67790FB6.list.no_extern_list_list-path_slurm-15844227_2022-08-29-13:24:52\"" ] }, { "cell_type": "markdown", "metadata": {}, 
"source": [ "## Parser functions\n", "\n", "First we define the stucture of the file then the columns we want to use." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fields=['ignore', 'size', 'kballoc', 'atime', 'ctime', 'mtime', 'uid', 'gid', 'heat', 'pool', 'path', 'misc']\n", "\n", "usecols=['size', 'kballoc', 'atime', 'ctime', 'mtime', 'uid', 'gid', 'heat', 'pool', 'path', 'misc']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def splitter(x):\n", " '''\n", " split each name=value field on = and return the value\n", " '''\n", " print(x)\n", " return x.split(\"=\", 1)[1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Set up a splitters dictionary to process all the used fields with the splitter function.\n", "https://realpython.com/python-defaultdict/" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "splitters = {}\n", "\n", "for name in usecols:\n", " splitters.setdefault(name, splitter)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%time\n", "df = pd.read_csv(file,\n", " lineterminator='\\n',\n", " sep=\"|\", header=0, \n", " #on_bad_lines=\"warn\", \n", " index_col=False,\n", " #nrows=1000000,\n", " names=fields,\n", " usecols=usecols,\n", " converters=splitters,\n", " parse_dates=['atime', 'ctime', 'mtime'],\n", " )" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Clean up data types for numeric values" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for intcol in ['size', 'kballoc', 'uid', 'gid']:\n", " df[intcol] = df[intcol].astype(\"int\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.head(3)" ] }, { "cell_type": 
"markdown", "metadata": {}, "source": [ "Quick summary of total storage allocated used by 30+day files" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"kballoc\"].sum()/1024" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"size\"].sum()/1024/1024" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"atime\"].min()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[[\"atime\",\"uid\"]].sort_values(by=\"atime\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[[\"uid\",\"size\"]].groupby(\"uid\").sum()/1000/1000/1000/1000" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "(df[[\"uid\",\"size\"]].groupby(\"uid\").sum()/1000/1000/1000/1000).sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"atime\"].sort_values().head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"uid\"].head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"misc\"].unique()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"isfile\"]=df[\"misc\"].str.contains('F')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(df[\"uid\"].unique())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"uid\"].unique()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get usernames from uid values via the pwd password db iteration module https://stackoverflow.com/a/421670/8928529" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pwd" 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pwd.getpwuid(12137)[0].split(\":\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def getuser(uid):\n", "    '''\n", "    Return the login name for a numeric uid.\n", "\n", "    uid may be anything int() accepts. pwd.getpwuid raises KeyError for a\n", "    uid that has no password-database entry; in that case the uid is\n", "    returned as a decimal string so one stale uid does not abort a lookup\n", "    over the whole frame.\n", "    '''\n", "    try:\n", "        return pwd.getpwuid(int(uid)).pw_name\n", "    except KeyError:\n", "        return str(int(uid))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "getuser(10973)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Resolve each distinct uid once instead of rescanning the frame per uid.\n", "uid_names = {uid: getuser(uid) for uid in sorted(df[\"uid\"].unique())}\n", "for uid, uname in uid_names.items():\n", "    print(\"uid: {} name: {}\".format(uid, uname))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# add new column for resolved uids\n", "df[\"uname\"] = df[\"uid\"].map(uid_names)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[df[\"uid\"]==10005]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sorted(df[\"heat\"].unique())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"path\"] = df[\"path\"].astype(\"str\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Split each path on '/' into at most five pieces. Piece 0 is empty when the\n", "# path starts with '/'; pieces 1, 3 and 4 become fs, scratchdir and filename.\n", "# assign() overwrites these columns on a re-run instead of duplicating them.\n", "path_parts = df[\"path\"].str.split(\"/\", n=4, expand=True)\n", "df = df.assign(fs=path_parts[1], scratchdir=path_parts[3], filename=path_parts[4])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.columns" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "userdata = df[[\"scratchdir\", \"size\", \"kballoc\", 
\"isfile\"]].groupby([\"scratchdir\"]).sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "userdata" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "userdata[\"size\"]/1000/1000/1000" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"path\"].apply(\"str\").split(\"/\", 4, expand=True)[[3,4]]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"path\"].apply(\"str\").split(\"/\", 4, expand=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bytesdays=df[[\"atime\",\"size\"]]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bd=bytesdays.set_index(\"atime\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bd=bd.resample('D').sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bd[\"sum\"]=bd.cumsum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bd[:\"2022-02-15\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "size, gb = bd[bd[\"size\"]>0].loc[:\"2022-01-01\"].sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "gb" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bd.loc[:\"2021-12-31\"].sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bd.loc[:\"2022-01-01\"].sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bd.loc[\"2022-01-01\":]" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ "bd[bd[\"size\"]>0]/1024/1024/1024 #.plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bd[\"gb\"] = bd[\"sum\"]/1024/1024/1024" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bd[\"gb\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "b2d=bd[\"2021-10-01\":]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "1024*1024*1024*1024" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bd7=b2d[[\"gb\"]].rolling(7, center=True).sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Plot houry, daily, 7-day rolling mean\n", "fig, ax = plt.subplots()\n", "#ax.plot(kW, marker='.', markersize=2, color='gray', linestyle='None', label='Hourly Average')\n", "ax.plot(b2d[\"gb\"], color='brown', linewidth=2, label='1-day Average')\n", "ax.plot(bd7[\"gb\"], color='black', linewidth=1, label='7-day Rolling Average')\n", "label='Trend (7 day Rolling Sum)'\n", "ax.legend()\n", "ax.set_ylabel('Size (GBytes)')\n", "ax.set_title('Cheaha Trends in Scratch Usage');" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "language_info": { "name": "python", "pygments_lexer": "ipython3" } }, "nbformat": 4, "nbformat_minor": 4 }