report-grouby-tld-year-of-last-access.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "5fb66d11",
   "metadata": {},
   "source": [
    "# run report on pickled list policy data\n",
    "\n",
    "The script reads pickled files that match the `glob_pattern` from the `pickledir` derived from `dirname` and runs the report saving it as a csv to the subdir \"`dirname`/reports\" dir by default.\n",
    "\n",
    "Some progress info is available via the `verbose` flag.\n",
    "\n",
    "The current report aggrates storage stats by top-level-dir and age (year) of data's last access. The goal of this report is to understand the distribution of lesser used data."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "51c07f66",
   "metadata": {},
   "source": [
    "!conda info --envs"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "15997b7d",
   "metadata": {},
   "source": [
    "!conda list"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c740ad5f",
   "metadata": {},
   "source": [
    "!pip list -freeze"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5059337b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from urllib.parse import unquote\n",
    "import sys\n",
    "import os\n",
    "import pathlib\n",
    "import re"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5f4c10d1",
   "metadata": {},
   "source": [
    "## input vars"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "92ddc402",
   "metadata": {},
   "outputs": [],
   "source": [
    "dirname=\"\"  # directory to fine files to pickle\n",
    "glob_pattern = \"*.gz\"  # file name glob pattern to match, can be file name for individual file\n",
    "line_regex_filter = \".*\"   # regex to match lines of interest in file\n",
    "pickledir=f\"{dirname}/pickles\"\n",
    "reportdir=f\"{dirname}/reports\"\n",
    "tldpath=\"/\"\n",
    "\n",
    "verbose = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f6e28615",
   "metadata": {},
   "outputs": [],
   "source": [
    "dirname=\"data/list-17404604.list.gather-info.d\"  # directory to fine files to pickle\n",
    "glob_pattern = \"*.gz\"  # file name glob pattern to match, can be file name for individual file\n",
    "line_regex_filter = \".*\"   # regex to match lines of interest in file\n",
    "pickledir=f\"{dirname}/pickles\"\n",
    "reportdir=f\"{dirname}/reports\"\n",
    "tldpath=\"/data/projects\"\n",
    "\n",
    "verbose = True\n",
    "limit = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "89a07a7f",
   "metadata": {},
   "outputs": [],
   "source": [
    "dirname=\"data/list-20859348.list.gather-info.d\"  # directory to fine files to pickle\n",
    "glob_pattern = \"*.gz\"  # file name glob pattern to match, can be file name for individual file\n",
    "line_regex_filter = \".*\"   # regex to match lines of interest in file\n",
    "pickledir=f\"{dirname}/pickles\"\n",
    "reportdir=f\"{dirname}/reports\"\n",
    "tldpath=\"/data/project/datascienceteam\"\n",
    "\n",
    "verbose = True\n",
    "limit = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d9533a4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "dirname=\"data/list-20191520.list.gather-info.d\"  # directory to fine files to pickle\n",
    "glob_pattern = \"*.gz\"  # file name glob pattern to match, can be file name for individual file\n",
    "line_regex_filter = \".*\"   # regex to match lines of interest in file\n",
    "pickledir=f\"{dirname}/pickles\"\n",
    "reportdir=f\"{dirname}/reports\"\n",
    "tldpath=\"/data/project/thymelab\"\n",
    "\n",
    "verbose = True\n",
    "limit = 0"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a28d0f15",
   "metadata": {},
   "source": [
    "## Utilities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed367712",
   "metadata": {},
   "outputs": [],
   "source": [
    "# get top level dir on which to aggregate\n",
    "\n",
    "def get_tld(df, dirname):\n",
    "    dirpaths = dirname.split(\"/\")\n",
    "    new=df[\"path\"].str.split(\"/\", n=len(dirpaths)+1, expand=True)\n",
    "    df[\"tld\"] = new[len(dirpaths)]\n",
    "   \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7bc11b96",
   "metadata": {},
   "outputs": [],
   "source": [
    "def report_tld_year(df):\n",
    "    report = df.groupby(['tld', df.access.dt.year]).agg({\"size\": [\"sum\", \"count\"]})\n",
    "    return report"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dd92dd03",
   "metadata": {},
   "source": [
    "## Read and parse the files according to glob_pattern"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20315d88",
   "metadata": {},
   "outputs": [],
   "source": [
    "dirpath = pathlib.Path(pickledir)\n",
    "\n",
    "files = list()\n",
    "for file in list(dirpath.glob(glob_pattern)):\n",
    "    files.append(str(file))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4352f00c",
   "metadata": {},
   "source": [
    "## Aggregate stats into running totals"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "82abb24f",
   "metadata": {},
   "outputs": [],
   "source": [
    "#report = pd.DataFrame()\n",
    "\n",
    "reports=[]\n",
    "\n",
    "for count, file in enumerate(files):\n",
    "    if (verbose): print(f\"parse: {file}\")\n",
    "    filename=os.path.basename(file)\n",
    "    df = get_tld(pd.read_pickle(file), tldpath)\n",
    "    df = report_tld_year(df)\n",
    "    if (limit and count == limit):\n",
    "        break\n",
    "    # roll up into running total https://stackoverflow.com/a/55828762/8928529\n",
    "    reports.append(df) \n",
    "    del(df)\n",
    "    \n",
    "report=pd.concat(reports)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15bfddef",
   "metadata": {},
   "outputs": [],
   "source": [
    "report=report.groupby(['tld', 'access']).sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "04a25511",
   "metadata": {},
   "source": [
    "## Create final report\n",
    "\n",
    "Create summary format for gigabyte and terabyte columns https://stackoverflow.com/a/20937592/8928529"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b8ee3a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "report.columns = [col[1] for col in report.columns.values]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ff16a48a",
   "metadata": {},
   "outputs": [],
   "source": [
    "report[\"average_size\"] = report[\"sum\"]/report[\"count\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1afd3d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "report[\"terabytes\"] = report[\"sum\"]/(10**12)\n",
    "report[\"terabytes\"] = report[\"terabytes\"].map('{:,.2f}'.format)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d85036b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "report[\"gigabytes\"] = report[\"sum\"]/(10**9)\n",
    "report[\"gigabytes\"] = report[\"gigabytes\"].map('{:,.2f}'.format)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "071eee76",
   "metadata": {},
   "outputs": [],
   "source": [
    "report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4de0256",
   "metadata": {},
   "outputs": [],
   "source": [
    "if (verbose): print(report)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b5472320",
   "metadata": {},
   "source": [
    "## Save report as CSV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ffc99a54",
   "metadata": {},
   "outputs": [],
   "source": [
    "# only create dir if there is data to pickle\n",
    "if (len(report) and not os.path.isdir(reportdir)):\n",
    "    os.mkdir(reportdir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d0ec8cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "reportdir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12d02352",
   "metadata": {},
   "outputs": [],
   "source": [
    "if (verbose): print(f\"report: groupby-tld\")\n",
    "report.to_csv(f\"{reportdir}/groupby-tld.csv.gz\")\n",
    "report.to_pickle(f\"{reportdir}/groupby-tld-year.pkl.gz\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae4a6d7a",
   "metadata": {},
   "outputs": [],
   "source": [
    "report[report[\"sum\"] == report[\"sum\"].max()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d2f464a",
   "metadata": {},
   "outputs": [],
   "source": [
    "report[(report[\"sum\"] > 5*10**13)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bea3b7a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "report=report.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "35ebda85",
   "metadata": {},
   "outputs": [],
   "source": [
    "summer = report.groupby(\"tld\").agg(\"sum\", \"sum\") #[report[\"sum\"] > 10**13"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2122a150",
   "metadata": {},
   "outputs": [],
   "source": [
    "summer[\"terabytes\"] = summer[\"sum\"]/(10**12)\n",
    "summer[\"terabytes\"] = summer[\"terabytes\"].map('{:,.2f}'.format)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0929c902",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(summer[summer[\"sum\"] > 10**13].sort_values(\"sum\", ascending=False)[['count', 'terabytes']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06e80054",
   "metadata": {},
   "outputs": [],
   "source": [
    "report[(report[\"sum\"] > 10**13) & (report[\"access\"] <= 2021)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "504235d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "report[(report[\"sum\"] > 10**13) & (report[\"access\"] <= 2021)][\"sum\"].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "30373be5",
   "metadata": {},
   "outputs": [],
   "source": [
    "report[(report[\"sum\"] <= 10**13) & (report[\"access\"] <= 2021)][\"sum\"].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e98ab4a",
   "metadata": {},
   "outputs": [],
   "source": [
    "report[(report[\"sum\"] > 10**13) & (report[\"access\"] < 2023)][\"sum\"].sum()/10**12"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}