From 69bb9645067e28e3a7adea2a8f33e1c5b0f91fe9 Mon Sep 17 00:00:00 2001
From: John-Paul Robinson <jpr@uab.edu>
Date: Fri, 26 Jul 2024 11:58:36 -0500
Subject: [PATCH] Rollup commit on various policy report notebooks

These are used to create specific reports by opening the notebook,
copying it, and modifying the parameters for a specific
policy run data set.

Their utility may be limited based on current parquet pipelines.
---
 max-access-per-user-merged.ipynb            |  24 +-
 report-grouby-tld-year-of-last-access.ipynb | 290 +++++++++++++++++---
 scratch-log-explorations.ipynb              |  19 ++
 3 files changed, 293 insertions(+), 40 deletions(-)

diff --git a/max-access-per-user-merged.ipynb b/max-access-per-user-merged.ipynb
index aa4e14a..8087df6 100644
--- a/max-access-per-user-merged.ipynb
+++ b/max-access-per-user-merged.ipynb
@@ -22,7 +22,13 @@
     "report_name = \"temporary-scratch\"\n",
     "dir = \"data/list-17075953.list.gather-info.d\"\n",
     "#report_name = \"temporary-scratch\"\n",
-    "#dir = 'data/list-17094088.list.gather-info.d'\n",
+    "#dir = 'data/list-17094088.list.gather-info.d'\\\n",
+    "glob_pattern=\"*.gz\"\n",
+    "\n",
+    "report_name = \"old-scratch\"\n",
+    "dir = \"data/list-16144464.list.gather-info.d\"\n",
+    "dir = \"data/list-policy_old-scratch_2022-09-15/pickles\"\n",
+    "glob_pattern = \"list-*.gz\"\n",
     "\n",
     "os.chdir(dir)"
    ]
@@ -65,16 +71,16 @@
    "outputs": [],
    "source": [
     "frames = []\n",
+    "df = pd.DataFrame()\n",
     "\n",
-    "for file in glob.glob(\"*.gz\"):\n",
+    "for file in glob.glob(glob_pattern):\n",
     "    print(f\"processing: {file}\")\n",
     "    # combine picked dfs into one df\n",
     "    # https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html\n",
-    "    df = pd.read_pickle(file)\n",
-    "    frames.append(df)\n",
-    "\n",
-    "df = pd.concat(frames)\n",
-    "del(frames)"
+    "    dfnew = pd.read_pickle(file)\n",
+    "    #frames.append(df)\n",
+    "    df = pd.concat([df, dfnew])\n",
+    "#del(frames)"
    ]
   },
   {
@@ -94,7 +100,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "print(f\"max atime: {df['atime'].max()}\\n\")"
+    "print(f\"max atime: {df['access'].max()}\\n\")"
    ]
   },
   {
@@ -104,7 +110,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df.groupby([\"uid\"], sort=False)[\"atime\"].max().sort_values()"
+    "df.groupby([\"uid\"], sort=False)[\"access\"].max().sort_values()"
    ]
   },
   {
diff --git a/report-grouby-tld-year-of-last-access.ipynb b/report-grouby-tld-year-of-last-access.ipynb
index 95a4567..cc95e55 100644
--- a/report-grouby-tld-year-of-last-access.ipynb
+++ b/report-grouby-tld-year-of-last-access.ipynb
@@ -7,13 +7,37 @@
    "source": [
     "# run report on pickled list policy data\n",
     "\n",
-    "The script reads pickled files that match the `glob_pattern` from the `pickledir` derived from `dirname` and runs the report saving it as a csv to the peer \"`dirname`-reports\" dir by default.\n",
+    "The script reads pickled files that match the `glob_pattern` from the `pickledir` derived from `dirname` and runs the report, saving it as a CSV to the \"`dirname`/reports\" subdirectory by default.\n",
     "\n",
     "Some progress info is available via the `verbose` flag.\n",
     "\n",
     "The current report aggrates storage stats by top-level-dir and age (year) of data's last access. The goal of this report is to understand the distribution of lesser used data."
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "51c07f66",
+   "metadata": {},
+   "source": [
+    "!conda info --envs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "15997b7d",
+   "metadata": {},
+   "source": [
+    "!conda list"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c740ad5f",
+   "metadata": {},
+   "source": [
+    "!pip list --format=freeze"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -50,12 +74,74 @@
     "glob_pattern = \"*.gz\"  # file name glob pattern to match, can be file name for individual file\n",
     "line_regex_filter = \".*\"   # regex to match lines of interest in file\n",
     "pickledir=f\"{dirname}/pickles\"\n",
-    "reportdir=f\"{dirname}-reports\"\n",
+    "reportdir=f\"{dirname}/reports\"\n",
     "tldpath=\"/\"\n",
     "\n",
     "verbose = False"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f6e28615",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dirname=\"data/list-17404604.list.gather-info.d\"  # directory to find files to pickle\n",
+    "glob_pattern = \"*.gz\"  # file name glob pattern to match, can be file name for individual file\n",
+    "line_regex_filter = \".*\"   # regex to match lines of interest in file\n",
+    "pickledir=f\"{dirname}/pickles\"\n",
+    "reportdir=f\"{dirname}/reports\"\n",
+    "tldpath=\"/data/projects\"\n",
+    "\n",
+    "verbose = True\n",
+    "limit = 0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "89a07a7f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dirname=\"data/list-20859348.list.gather-info.d\"  # directory to find files to pickle\n",
+    "glob_pattern = \"*.gz\"  # file name glob pattern to match, can be file name for individual file\n",
+    "line_regex_filter = \".*\"   # regex to match lines of interest in file\n",
+    "pickledir=f\"{dirname}/pickles\"\n",
+    "reportdir=f\"{dirname}/reports\"\n",
+    "tldpath=\"/data/project/datascienceteam\"\n",
+    "\n",
+    "verbose = True\n",
+    "limit = 0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d9533a4c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dirname=\"data/list-20191520.list.gather-info.d\"  # directory to find files to pickle\n",
+    "glob_pattern = \"*.gz\"  # file name glob pattern to match, can be file name for individual file\n",
+    "line_regex_filter = \".*\"   # regex to match lines of interest in file\n",
+    "pickledir=f\"{dirname}/pickles\"\n",
+    "reportdir=f\"{dirname}/reports\"\n",
+    "tldpath=\"/data/project/thymelab\"\n",
+    "\n",
+    "verbose = True\n",
+    "limit = 0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a28d0f15",
+   "metadata": {},
+   "source": [
+    "## Utilities"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -73,6 +159,18 @@
     "    return df"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7bc11b96",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def report_tld_year(df):\n",
+    "    report = df.groupby(['tld', df.access.dt.year]).agg({\"size\": [\"sum\", \"count\"]})\n",
+    "    return report"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "dd92dd03",
@@ -96,98 +194,108 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "cbad833f",
+   "cell_type": "markdown",
+   "id": "4352f00c",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "parsedfiles = list()\n",
-    "for file in files:\n",
-    "    if (verbose): print(f\"parse: {file}\")\n",
-    "    filename=os.path.basename(file)\n",
-    "    parsedfiles.append(pd.read_pickle(file))\n"
+    "## Aggregate stats into running totals"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "4ed9ca1b",
+   "id": "82abb24f",
    "metadata": {},
    "outputs": [],
    "source": [
-    "df=pd.concat(parsedfiles)\n",
-    "    del(parsedfiles)\n",
-    "else:\n",
-    "    return"
+    "#report = pd.DataFrame()\n",
+    "\n",
+    "reports=[]\n",
+    "\n",
+    "for count, file in enumerate(files):\n",
+    "    if (verbose): print(f\"parse: {file}\")\n",
+    "    filename=os.path.basename(file)\n",
+    "    df = get_tld(pd.read_pickle(file), tldpath)\n",
+    "    df = report_tld_year(df)\n",
+    "    if (limit and count == limit):\n",
+    "        break\n",
+    "    # roll up into running total https://stackoverflow.com/a/55828762/8928529\n",
+    "    reports.append(df) \n",
+    "    del(df)\n",
+    "    \n",
+    "report=pd.concat(reports)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "b69c9fde",
+   "id": "15bfddef",
    "metadata": {},
    "outputs": [],
    "source": [
-    "df = get_tld(df, tldpath)"
+    "report=report.groupby(['tld', 'access']).sum()"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "4352f00c",
+   "id": "04a25511",
    "metadata": {},
    "source": [
-    "## Run report"
+    "## Create final report\n",
+    "\n",
+    "Create summary format for gigabyte and terabyte columns https://stackoverflow.com/a/20937592/8928529"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "e3fe4e71",
+   "id": "4b8ee3a9",
    "metadata": {},
    "outputs": [],
    "source": [
-    "report = df.groupby(['tld', df.access.dt.year]).agg({\"size\": [\"sum\", \"mean\", \"median\", \"min\", \"max\", \"std\", \"count\"]})"
+    "report.columns = [col[1] for col in report.columns.values]"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "329bc196",
+   "id": "ff16a48a",
    "metadata": {},
    "outputs": [],
    "source": [
-    "del(df)"
+    "report[\"average_size\"] = report[\"sum\"]/report[\"count\"]"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "754fcc89",
+   "id": "e1afd3d2",
    "metadata": {},
    "outputs": [],
    "source": [
-    "report.columns.values"
+    "report[\"terabytes\"] = report[\"sum\"]/(10**12)\n",
+    "report[\"terabytes\"] = report[\"terabytes\"].map('{:,.2f}'.format)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "f279c061",
+   "id": "d85036b5",
    "metadata": {},
    "outputs": [],
    "source": [
-    "report.columns = [col[1] for col in report.columns.values]"
+    "report[\"gigabytes\"] = report[\"sum\"]/(10**9)\n",
+    "report[\"gigabytes\"] = report[\"gigabytes\"].map('{:,.2f}'.format)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "8ef9b007",
+   "id": "071eee76",
    "metadata": {},
    "outputs": [],
    "source": [
-    "report[\"gigabytes\"] = report[\"sum\"]/1000/1000/1000"
+    "report"
    ]
   },
   {
@@ -200,6 +308,14 @@
     "if (verbose): print(report)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "b5472320",
+   "metadata": {},
+   "source": [
+    "## Save report as CSV and pickle"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -212,6 +328,16 @@
     "    os.mkdir(reportdir)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9d0ec8cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "reportdir"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -220,7 +346,109 @@
    "outputs": [],
    "source": [
     "if (verbose): print(f\"report: groupby-tld\")\n",
-    "report.to_csv(f\"{reportdir}/groupby-tld.csv.gz\")"
+    "report.to_csv(f\"{reportdir}/groupby-tld.csv.gz\")\n",
+    "report.to_pickle(f\"{reportdir}/groupby-tld-year.pkl.gz\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ae4a6d7a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report[report[\"sum\"] == report[\"sum\"].max()]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6d2f464a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report[(report[\"sum\"] > 5*10**13)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bea3b7a5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report=report.reset_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "35ebda85",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "summer = report.groupby(\"tld\").agg(\"sum\", \"sum\") #[report[\"sum\"] > 10**13"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2122a150",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "summer[\"terabytes\"] = summer[\"sum\"]/(10**12)\n",
+    "summer[\"terabytes\"] = summer[\"terabytes\"].map('{:,.2f}'.format)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0929c902",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(summer[summer[\"sum\"] > 10**13].sort_values(\"sum\", ascending=False)[['count', 'terabytes']])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "06e80054",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report[(report[\"sum\"] > 10**13) & (report[\"access\"] <= 2021)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "504235d8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report[(report[\"sum\"] > 10**13) & (report[\"access\"] <= 2021)][\"sum\"].sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "30373be5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report[(report[\"sum\"] <= 10**13) & (report[\"access\"] <= 2021)][\"sum\"].sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3e98ab4a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report[(report[\"sum\"] > 10**13) & (report[\"access\"] < 2023)][\"sum\"].sum()/10**12"
    ]
   }
  ],
diff --git a/scratch-log-explorations.ipynb b/scratch-log-explorations.ipynb
index e360063..a441345 100644
--- a/scratch-log-explorations.ipynb
+++ b/scratch-log-explorations.ipynb
@@ -56,6 +56,24 @@
     "file=\"data/mmapplypolicy.120904.9DBFF7E6.list.no_extern_list_list-30day-with-excludes_slurm-13113652_2022-04-05-04:00:28\""
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "file=\"data/mmapplypolicy.35838.667249E1.list.no_extern_list_list-30day-with-excludes_slurm-15685457_2022-08-23-04:00:23\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "file=\"data/mmapplypolicy.41557.67790FB6.list.no_extern_list_list-path_slurm-15844227_2022-08-29-13:24:52\""
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -86,6 +104,7 @@
     "    '''\n",
     "    split each name=value field on = and return the value\n",
     "    '''\n",
+    "    print(x)\n",
     "    return x.split(\"=\", 1)[1]"
    ]
   },
-- 
GitLab