diff --git a/report-grouby-tld-year-of-last-access.ipynb b/report-grouby-tld-year-of-last-access.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..95a4567444a224d2c8c1cfd5f3a3961ad682814f --- /dev/null +++ b/report-grouby-tld-year-of-last-access.ipynb @@ -0,0 +1,235 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5fb66d11", + "metadata": {}, + "source": [ + "# run report on pickled list policy data\n", + "\n", + "The script reads pickled files that match the `glob_pattern` from the `pickledir` derived from `dirname` and runs the report saving it as a csv to the peer \"`dirname`-reports\" dir by default.\n", + "\n", + "Some progress info is available via the `verbose` flag.\n", + "\n", + "The current report aggregates storage stats by top-level-dir and age (year) of data's last access. The goal of this report is to understand the distribution of lesser used data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5059337b", + "metadata": {}, + "outputs": [], + "source": [ + "import datetime\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from urllib.parse import unquote\n", + "import sys\n", + "import os\n", + "import pathlib\n", + "import re" + ] + }, + { + "cell_type": "markdown", + "id": "5f4c10d1", + "metadata": {}, + "source": [ + "## input vars" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92ddc402", + "metadata": {}, + "outputs": [], + "source": [ + "dirname=\"\" # directory to find files to pickle\n", + "glob_pattern = \"*.gz\" # file name glob pattern to match, can be file name for individual file\n", + "line_regex_filter = \".*\" # regex to match lines of interest in file\n", + "pickledir=f\"{dirname}/pickles\"\n", + "reportdir=f\"{dirname}-reports\"\n", + "tldpath=\"/\"\n", + "\n", + "verbose = False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed367712", + "metadata": {}, + "outputs": [], + "source": [ + "# get top level 
dir on which to aggregate\n", + "# NOTE(review): assumes df[\"path\"] holds absolute paths located under dirname - confirm against the pickling step\n", + "def get_tld(df, dirname):\n", + "    \"\"\"Add a tld column (first path component below dirname) to df in place and return df.\"\"\"\n", + "    depth = len(dirname.rstrip(\"/\").split(\"/\"))  # rstrip so a trailing slash (or a bare root path) is not counted as an extra level\n", + "    parts = df[\"path\"].str.split(\"/\", n=depth + 1, expand=True)  # split one past depth so column depth holds a single component\n", + "    df[\"tld\"] = parts[depth]\n", + "    return df" + ] + }, + { + "cell_type": "markdown", + "id": "dd92dd03", + "metadata": {}, + "source": [ + "## Read and parse the files according to glob_pattern" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20315d88", + "metadata": {}, + "outputs": [], + "source": [ + "dirpath = pathlib.Path(pickledir)\n", + "\n", + "files = [\n", + "    str(file) for file in dirpath.glob(glob_pattern)\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cbad833f", + "metadata": {}, + "outputs": [], + "source": [ + "parsedfiles = []\n", + "for file in files:\n", + "    if (verbose): print(f\"parse: {file}\")\n", + "    # NOTE: unpickling can execute arbitrary code - only read pickles produced by this project\n", + "    parsedfiles.append(pd.read_pickle(file))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ed9ca1b", + "metadata": {}, + "outputs": [], + "source": [ + "if not parsedfiles:\n", + "    raise FileNotFoundError(f\"no pickled files matching {glob_pattern!r} found in {pickledir!r}\")\n", + "df = pd.concat(parsedfiles)\n", + "del parsedfiles" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b69c9fde", + "metadata": {}, + "outputs": [], + "source": [ + "df = get_tld(df, tldpath)" + ] + }, + { + "cell_type": "markdown", + "id": "4352f00c", + "metadata": {}, + "source": [ + "## Run report" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3fe4e71", + "metadata": {}, + "outputs": [], + "source": [ + "report = df.groupby(['tld', df.access.dt.year]).agg({\"size\": [\"sum\", \"mean\", \"median\", \"min\", \"max\", \"std\", \"count\"]})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "329bc196", + "metadata": {}, + "outputs": [], + "source": [ + "del df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "754fcc89", 
"metadata": {}, + "outputs": [], + "source": [ + "report.columns.values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f279c061", + "metadata": {}, + "outputs": [], + "source": [ + "report.columns = [col[1] for col in report.columns.values]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ef9b007", + "metadata": {}, + "outputs": [], + "source": [ + "report[\"gigabytes\"] = report[\"sum\"]/1000/1000/1000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4de0256", + "metadata": {}, + "outputs": [], + "source": [ + "if (verbose): print(report)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffc99a54", + "metadata": {}, + "outputs": [], + "source": [ + "# only create dir if there is data to pickle\n", + "if (len(report) and not os.path.isdir(reportdir)):\n", + " os.mkdir(reportdir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12d02352", + "metadata": {}, + "outputs": [], + "source": [ + "if (verbose): print(f\"report: groupby-tld\")\n", + "report.to_csv(f\"{reportdir}/groupby-tld.csv.gz\")" + ] + } + ], + "metadata": { + "language_info": { + "name": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}