From 69bb9645067e28e3a7adea2a8f33e1c5b0f91fe9 Mon Sep 17 00:00:00 2001 From: John-Paul Robinson <jpr@uab.edu> Date: Fri, 26 Jul 2024 11:58:36 -0500 Subject: [PATCH] Rollup commit on various policy report notebooks These are used to create specific reports by opening the notebook then copying it and modifying the parameters for a specific policy run data set. Their utility may be limited based on current parquet pipelines. --- max-access-per-user-merged.ipynb | 24 +- report-grouby-tld-year-of-last-access.ipynb | 290 +++++++++++++++++--- scratch-log-explorations.ipynb | 19 ++ 3 files changed, 293 insertions(+), 40 deletions(-) diff --git a/max-access-per-user-merged.ipynb b/max-access-per-user-merged.ipynb index aa4e14a..8087df6 100644 --- a/max-access-per-user-merged.ipynb +++ b/max-access-per-user-merged.ipynb @@ -22,7 +22,13 @@ "report_name = \"temporary-scratch\"\n", "dir = \"data/list-17075953.list.gather-info.d\"\n", "#report_name = \"temporary-scratch\"\n", - "#dir = 'data/list-17094088.list.gather-info.d'\n", + "#dir = 'data/list-17094088.list.gather-info.d'\\\n", + "glob_pattern=\"*.gz\"\n", + "\n", + "report_name = \"old-scratch\"\n", + "dir = \"data/list-16144464.list.gather-info.d\"\n", + "dir = \"data/list-policy_old-scratch_2022-09-15/pickles\"\n", + "glob_pattern = \"list-*.gz\"\n", "\n", "os.chdir(dir)" ] @@ -65,16 +71,16 @@ "outputs": [], "source": [ "frames = []\n", + "df = pd.DataFrame()\n", "\n", - "for file in glob.glob(\"*.gz\"):\n", + "for file in glob.glob(glob_pattern):\n", " print(f\"processing: {file}\")\n", " # combine picked dfs into one df\n", " # https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html\n", - " df = pd.read_pickle(file)\n", - " frames.append(df)\n", - "\n", - "df = pd.concat(frames)\n", - "del(frames)" + " dfnew = pd.read_pickle(file)\n", + " #frames.append(df)\n", + " df = pd.concat([df, dfnew])\n", + "#del(frames)" ] }, { @@ -94,7 +100,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(f\"max atime: 
{df['atime'].max()}\\n\")" + "print(f\"max atime: {df['access'].max()}\\n\")" ] }, { @@ -104,7 +110,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.groupby([\"uid\"], sort=False)[\"atime\"].max().sort_values()" + "df.groupby([\"uid\"], sort=False)[\"access\"].max().sort_values()" ] }, { diff --git a/report-grouby-tld-year-of-last-access.ipynb b/report-grouby-tld-year-of-last-access.ipynb index 95a4567..cc95e55 100644 --- a/report-grouby-tld-year-of-last-access.ipynb +++ b/report-grouby-tld-year-of-last-access.ipynb @@ -7,13 +7,37 @@ "source": [ "# run report on pickled list policy data\n", "\n", - "The script reads pickled files that match the `glob_pattern` from the `pickledir` derived from `dirname` and runs the report saving it as a csv to the peer \"`dirname`-reports\" dir by default.\n", + "The script reads pickled files that match the `glob_pattern` from the `pickledir` derived from `dirname` and runs the report saving it as a csv to the subdir \"`dirname`/reports\" dir by default.\n", "\n", "Some progress info is available via the `verbose` flag.\n", "\n", "The current report aggrates storage stats by top-level-dir and age (year) of data's last access. The goal of this report is to understand the distribution of lesser used data." 
] }, + { + "cell_type": "markdown", + "id": "51c07f66", + "metadata": {}, + "source": [ + "!conda info --envs" + ] + }, + { + "cell_type": "markdown", + "id": "15997b7d", + "metadata": {}, + "source": [ + "!conda list" + ] + }, + { + "cell_type": "markdown", + "id": "c740ad5f", + "metadata": {}, + "source": [ + "!pip list -freeze" + ] + }, { "cell_type": "code", "execution_count": null, @@ -50,12 +74,74 @@ "glob_pattern = \"*.gz\" # file name glob pattern to match, can be file name for individual file\n", "line_regex_filter = \".*\" # regex to match lines of interest in file\n", "pickledir=f\"{dirname}/pickles\"\n", - "reportdir=f\"{dirname}-reports\"\n", + "reportdir=f\"{dirname}/reports\"\n", "tldpath=\"/\"\n", "\n", "verbose = False" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6e28615", + "metadata": {}, + "outputs": [], + "source": [ + "dirname=\"data/list-17404604.list.gather-info.d\" # directory to find files to pickle\n", + "glob_pattern = \"*.gz\" # file name glob pattern to match, can be file name for individual file\n", + "line_regex_filter = \".*\" # regex to match lines of interest in file\n", + "pickledir=f\"{dirname}/pickles\"\n", + "reportdir=f\"{dirname}/reports\"\n", + "tldpath=\"/data/projects\"\n", + "\n", + "verbose = True\n", + "limit = 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89a07a7f", + "metadata": {}, + "outputs": [], + "source": [ + "dirname=\"data/list-20859348.list.gather-info.d\" # directory to find files to pickle\n", + "glob_pattern = \"*.gz\" # file name glob pattern to match, can be file name for individual file\n", + "line_regex_filter = \".*\" # regex to match lines of interest in file\n", + "pickledir=f\"{dirname}/pickles\"\n", + "reportdir=f\"{dirname}/reports\"\n", + "tldpath=\"/data/project/datascienceteam\"\n", + "\n", + "verbose = True\n", + "limit = 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9533a4c", + "metadata": {}, + "outputs": 
[], + "source": [ + "dirname=\"data/list-20191520.list.gather-info.d\" # directory to find files to pickle\n", + "glob_pattern = \"*.gz\" # file name glob pattern to match, can be file name for individual file\n", + "line_regex_filter = \".*\" # regex to match lines of interest in file\n", + "pickledir=f\"{dirname}/pickles\"\n", + "reportdir=f\"{dirname}/reports\"\n", + "tldpath=\"/data/project/thymelab\"\n", + "\n", + "verbose = True\n", + "limit = 0" + ] + }, + { + "cell_type": "markdown", + "id": "a28d0f15", + "metadata": {}, + "source": [ + "## Utilities" + ] + }, { "cell_type": "code", "execution_count": null, @@ -73,6 +159,18 @@ " return df" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "7bc11b96", + "metadata": {}, + "outputs": [], + "source": [ + "def report_tld_year(df):\n", + " report = df.groupby(['tld', df.access.dt.year]).agg({\"size\": [\"sum\", \"count\"]})\n", + " return report" + ] + }, { "cell_type": "markdown", "id": "dd92dd03", @@ -96,98 +194,108 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "cbad833f", + "cell_type": "markdown", + "id": "4352f00c", "metadata": {}, - "outputs": [], "source": [ - "parsedfiles = list()\n", - "for file in files:\n", - " if (verbose): print(f\"parse: {file}\")\n", - " filename=os.path.basename(file)\n", - " parsedfiles.append(pd.read_pickle(file))\n" + "## Aggregate stats into running totals" ] }, { "cell_type": "code", "execution_count": null, - "id": "4ed9ca1b", + "id": "82abb24f", "metadata": {}, "outputs": [], "source": [ - "df=pd.concat(parsedfiles)\n", - " del(parsedfiles)\n", - "else:\n", - " return" + "#report = pd.DataFrame()\n", + "\n", + "reports=[]\n", + "\n", + "for count, file in enumerate(files):\n", + " if (verbose): print(f\"parse: {file}\")\n", + " filename=os.path.basename(file)\n", + " df = get_tld(pd.read_pickle(file), tldpath)\n", + " df = report_tld_year(df)\n", + " if (limit and count == limit):\n", + " break\n", + " # roll up into running total 
https://stackoverflow.com/a/55828762/8928529\n", + " reports.append(df) \n", + " del(df)\n", + " \n", + "report=pd.concat(reports)" ] }, { "cell_type": "code", "execution_count": null, - "id": "b69c9fde", + "id": "15bfddef", "metadata": {}, "outputs": [], "source": [ - "df = get_tld(df, tldpath)" + "report=report.groupby(['tld', 'access']).sum()" ] }, { "cell_type": "markdown", - "id": "4352f00c", + "id": "04a25511", "metadata": {}, "source": [ - "## Run report" + "## Create final report\n", + "\n", + "Create summary format for gigabyte and terabyte columns https://stackoverflow.com/a/20937592/8928529" ] }, { "cell_type": "code", "execution_count": null, - "id": "e3fe4e71", + "id": "4b8ee3a9", "metadata": {}, "outputs": [], "source": [ - "report = df.groupby(['tld', df.access.dt.year]).agg({\"size\": [\"sum\", \"mean\", \"median\", \"min\", \"max\", \"std\", \"count\"]})" + "report.columns = [col[1] for col in report.columns.values]" ] }, { "cell_type": "code", "execution_count": null, - "id": "329bc196", + "id": "ff16a48a", "metadata": {}, "outputs": [], "source": [ - "del(df)" + "report[\"average_size\"] = report[\"sum\"]/report[\"count\"]" ] }, { "cell_type": "code", "execution_count": null, - "id": "754fcc89", + "id": "e1afd3d2", "metadata": {}, "outputs": [], "source": [ - "report.columns.values" + "report[\"terabytes\"] = report[\"sum\"]/(10**12)\n", + "report[\"terabytes\"] = report[\"terabytes\"].map('{:,.2f}'.format)" ] }, { "cell_type": "code", "execution_count": null, - "id": "f279c061", + "id": "d85036b5", "metadata": {}, "outputs": [], "source": [ - "report.columns = [col[1] for col in report.columns.values]" + "report[\"gigabytes\"] = report[\"sum\"]/(10**9)\n", + "report[\"gigabytes\"] = report[\"gigabytes\"].map('{:,.2f}'.format)" ] }, { "cell_type": "code", "execution_count": null, - "id": "8ef9b007", + "id": "071eee76", "metadata": {}, "outputs": [], "source": [ - "report[\"gigabytes\"] = report[\"sum\"]/1000/1000/1000" + "report" ] }, { @@ -200,6 
+308,14 @@ "if (verbose): print(report)" ] }, + { + "cell_type": "markdown", + "id": "b5472320", + "metadata": {}, + "source": [ + "## Save report as CSV" + ] + }, { "cell_type": "code", "execution_count": null, @@ -212,6 +328,16 @@ " os.mkdir(reportdir)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d0ec8cf", + "metadata": {}, + "outputs": [], + "source": [ + "reportdir" + ] + }, { "cell_type": "code", "execution_count": null, @@ -220,7 +346,109 @@ "outputs": [], "source": [ "if (verbose): print(f\"report: groupby-tld\")\n", - "report.to_csv(f\"{reportdir}/groupby-tld.csv.gz\")" + "report.to_csv(f\"{reportdir}/groupby-tld.csv.gz\")\n", + "report.to_pickle(f\"{reportdir}/groupby-tld-year.pkl.gz\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae4a6d7a", + "metadata": {}, + "outputs": [], + "source": [ + "report[report[\"sum\"] == report[\"sum\"].max()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d2f464a", + "metadata": {}, + "outputs": [], + "source": [ + "report[(report[\"sum\"] > 5*10**13)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bea3b7a5", + "metadata": {}, + "outputs": [], + "source": [ + "report=report.reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35ebda85", + "metadata": {}, + "outputs": [], + "source": [ + "summer = report.groupby(\"tld\").agg(\"sum\", \"sum\") #[report[\"sum\"] > 10**13" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2122a150", + "metadata": {}, + "outputs": [], + "source": [ + "summer[\"terabytes\"] = summer[\"sum\"]/(10**12)\n", + "summer[\"terabytes\"] = summer[\"terabytes\"].map('{:,.2f}'.format)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0929c902", + "metadata": {}, + "outputs": [], + "source": [ + "print(summer[summer[\"sum\"] > 10**13].sort_values(\"sum\", ascending=False)[['count', 'terabytes']])" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "06e80054", + "metadata": {}, + "outputs": [], + "source": [ + "report[(report[\"sum\"] > 10**13) & (report[\"access\"] <= 2021)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "504235d8", + "metadata": {}, + "outputs": [], + "source": [ + "report[(report[\"sum\"] > 10**13) & (report[\"access\"] <= 2021)][\"sum\"].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30373be5", + "metadata": {}, + "outputs": [], + "source": [ + "report[(report[\"sum\"] <= 10**13) & (report[\"access\"] <= 2021)][\"sum\"].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e98ab4a", + "metadata": {}, + "outputs": [], + "source": [ + "report[(report[\"sum\"] > 10**13) & (report[\"access\"] < 2023)][\"sum\"].sum()/10**12" ] } ], diff --git a/scratch-log-explorations.ipynb b/scratch-log-explorations.ipynb index e360063..a441345 100644 --- a/scratch-log-explorations.ipynb +++ b/scratch-log-explorations.ipynb @@ -56,6 +56,24 @@ "file=\"data/mmapplypolicy.120904.9DBFF7E6.list.no_extern_list_list-30day-with-excludes_slurm-13113652_2022-04-05-04:00:28\"" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "file=\"data/mmapplypolicy.35838.667249E1.list.no_extern_list_list-30day-with-excludes_slurm-15685457_2022-08-23-04:00:23\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "file=\"data/mmapplypolicy.41557.67790FB6.list.no_extern_list_list-path_slurm-15844227_2022-08-29-13:24:52\"" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -86,6 +104,7 @@ " '''\n", " split each name=value field on = and return the value\n", " '''\n", + " print(x)\n", " return x.split(\"=\", 1)[1]" ] }, -- GitLab