diff --git a/scratch-log-explorations.ipynb b/scratch-log-explorations.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..c27ec47d59f08510623d8ae33e4a95f9ab2993eb --- /dev/null +++ b/scratch-log-explorations.ipynb @@ -0,0 +1,606 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Notebook to explore parsing of the gpfs policy outputs\n", + "\n", + "This is a collection of cells to understand data.\n", + "No particular endpoint in mind." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " 5001:000fffffffffffff:0000000000004741:4b8f012b:0:2c172b:10002:0:40!hyun.d.song-vandy/pull_31/m1/prot_m1.out:13!scratch_tier1;253!|size=444|kballoc=0|access=2022-01-01 06:58:37.177440|create=2022-01-01 06:21:33.356110|modify=2022-01-01 06:23:47.011273|uid=10973|gid=10973|heat=+0.00000000000000E+000|pool=scratch_tier1|path=/scratch/hyun.d.song-vandy/pull_31/m1/prot_m1.out|misc=FAu|" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "file=\"data/mmapplypolicy.61746.962D9400.list.no_extern_list_list-30day-with-excludes_slurm-12551165_2022-03-03-04:00:09\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "file=\"data/mmapplypolicy.54197.413B7AB5.list.no_extern_list_list-only-temporary-scratch_slurm-12790116_2022-03-14-18:47:51\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "file=\"data/mmapplypolicy.120904.9DBFF7E6.list.no_extern_list_list-30day-with-excludes_slurm-13113652_2022-04-05-04:00:28\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Parser functions\n", + "\n", + "First we define the structure of 
the file then the columns we want to use." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fields=['ignore', 'size', 'kballoc', 'atime', 'ctime', 'mtime', 'uid', 'gid', 'heat', 'pool', 'path', 'misc']\n", + "\n", + "usecols=['size', 'kballoc', 'atime', 'ctime', 'mtime', 'uid', 'gid', 'heat', 'pool', 'path', 'misc']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def splitter(x):\n", + " '''\n", + " return the value of one name=value field, splitting on the first = only so values containing = stay whole\n", + " '''\n", + " return x.split(\"=\", 1)[1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set up a splitters dictionary to process all the used fields with the splitter function.\n", + "https://realpython.com/python-defaultdict/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "splitters = {}\n", + "\n", + "for name in usecols:\n", + " splitters.setdefault(name, splitter)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(file,\n", + " sep=\"|\", header=None, # NOTE(review): list file appears to have no header row; header=0 with names= dropped the first record\n", + " #on_bad_lines=\"warn\", \n", + " index_col=False,\n", + " #nrows=1000000,\n", + " names=fields,\n", + " usecols=usecols,\n", + " converters=splitters,\n", + " parse_dates=['atime', 'ctime', 'mtime'],\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Clean up data types for numeric values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for intcol in ['size', 'kballoc', 'uid', 'gid']:\n", + " df[intcol] = df[intcol].astype(\"int\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": 
[], + "source": [ + "df.head(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Quick summary of total storage allocated to and used by 30+ day files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"kballoc\"].sum()/1024" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"size\"].sum()/1024/1024" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"atime\"].min()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[[\"atime\",\"uid\"]].sort_values(by=\"atime\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[[\"uid\",\"size\"]].groupby(\"uid\").sum()/1000/1000/1000/1000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "(df[[\"uid\",\"size\"]].groupby(\"uid\").sum()/1000/1000/1000/1000).sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"atime\"].sort_values().head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"uid\"].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"misc\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"isfile\"]=df[\"misc\"].str.contains('F')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(df[\"uid\"].unique())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"uid\"].unique()" + 
] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get usernames from uid values via the pwd password db iteration module https://stackoverflow.com/a/421670/8928529" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pwd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pwd.getpwuid(12137)[0].split(\":\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def getuser(uid):\n", + " return pwd.getpwuid(int(uid)).pw_name # login name for the uid; pwd raises KeyError when the uid has no entry" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "getuser(10973)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for uid in sorted(df[\"uid\"].unique()):\n", + " print(\"uid: {} name: {}\".format(uid, getuser(uid)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sorted(df[\"heat\"].unique())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"path\"] = df[\"path\"].astype(\"str\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.concat([df, df[\"path\"].str.split(\"/\", n=4, expand=True)[[1,3,4]].rename(columns={1: \"fs\", 3:\"scratchdir\", 4:\"filename\"})], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = df.rename(columns={\"sratchdir\": \"scratchdir\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + 
"outputs": [], + "source": [ + "userdata = df[[\"scratchdir\", \"size\", \"kballoc\", \"isfile\"]].groupby([\"scratchdir\"]).sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "userdata" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "userdata[\"size\"]/1000/1000/1000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"path\"].str.split(\"/\", n=4, expand=True)[[3,4]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"path\"].str.split(\"/\", n=4, expand=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bytesdays=df[[\"atime\",\"size\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bd=bytesdays.set_index(\"atime\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bd=bd.resample('D').sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bd[\"sum\"]=bd[\"size\"].cumsum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bd[:\"2022-02-15\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "size, gb = bd[bd[\"size\"]>0].loc[:\"2022-01-01\"].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"bd.loc[:\"2021-12-31\"].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bd.loc[:\"2022-01-01\"].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bd.loc[\"2022-01-01\":]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bd[bd[\"size\"]>0]/1024/1024/1024 #.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bd[\"gb\"] = bd[\"sum\"]/1024/1024/1024" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bd[\"gb\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "b2d=bd[\"2021-10-01\":]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "1024*1024*1024*1024" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bd7=b2d[[\"gb\"]].rolling(7, center=True).mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot hourly, daily, 7-day rolling mean\n", + "fig, ax = plt.subplots()\n", + "#ax.plot(kW, marker='.', markersize=2, color='gray', linestyle='None', label='Hourly Average')\n", + "ax.plot(b2d[\"gb\"], color='brown', linewidth=2, label='1-day Average')\n", + "ax.plot(bd7[\"gb\"], color='black', linewidth=1, label='7-day Rolling Average')\n", + "ax.set_xlabel('Last access date')\n", + "ax.legend()\n", + "ax.set_ylabel('Size (GBytes)')\n", + "ax.set_title('Cheaha Trends in Scratch Usage');" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "language_info": { + "name": "python", + 
"pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}