From 91e8f41205701fdd8eda4b4a054df271c03b0d4b Mon Sep 17 00:00:00 2001
From: John-Paul Robinson <jpr@uab.edu>
Date: Tue, 30 Jul 2024 17:44:18 -0500
Subject: [PATCH] Add dask mpi notebooks to demonstrate validatity checking
 using dask+parquet

These nodebooks use dataframes build from parquet files to sanity check
file listings from multiple sources.
---
 ...r-of-last-access-projects-2024-05-03.ipynb | 1292 ++++++++++++++
 ...-validate-galaxy-tar-data-2024-05-03.ipynb | 1581 +++++++++++++++++
 2 files changed, 2873 insertions(+)
 create mode 100644 dask-mpi-report-grouby-tld-year-of-last-access-projects-2024-05-03.ipynb
 create mode 100644 dask-mpi-validate-galaxy-tar-data-2024-05-03.ipynb

diff --git a/dask-mpi-report-grouby-tld-year-of-last-access-projects-2024-05-03.ipynb b/dask-mpi-report-grouby-tld-year-of-last-access-projects-2024-05-03.ipynb
new file mode 100644
index 0000000..ddd13ea
--- /dev/null
+++ b/dask-mpi-report-grouby-tld-year-of-last-access-projects-2024-05-03.ipynb
@@ -0,0 +1,1292 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "5fb66d11",
+   "metadata": {},
+   "source": [
+    "# run report on pickled list policy data\n",
+    "\n",
+    "The script reads pickled files that match the `glob_pattern` from the `pickledir` derived from `dirname` and runs the report saving it as a csv to the subdir \"`dirname`/reports\" dir by default.\n",
+    "\n",
+    "Some progress info is available via the `verbose` flag.\n",
+    "\n",
+    "The current report aggrates storage stats by top-level-dir and age (year) of data's last access. The goal of this report is to understand the distribution of lesser used data."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "51c07f66",
+   "metadata": {},
+   "source": [
+    "!conda info --envs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "15997b7d",
+   "metadata": {},
+   "source": [
+    "!conda list"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c740ad5f",
+   "metadata": {},
+   "source": [
+    "!pip list -freeze"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5059337b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import datetime\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "from urllib.parse import unquote\n",
+    "import sys\n",
+    "import os\n",
+    "import pathlib\n",
+    "import re\n",
+    "import dask.dataframe as dd\n",
+    "import dask"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2beaec9e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dask.diagnostics import ProgressBar"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0d8afdae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dask.distributed import Client"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "81b2e176",
+   "metadata": {},
+   "source": [
+    "Client(scheduler_file='scheduler.json')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "514ecfc1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = Client(scheduler_file='scheduler.json')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b17e817d",
+   "metadata": {},
+   "source": [
+    "\n",
+    "client = Client()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5a2cdaa6",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5f4c10d1",
+   "metadata": {},
+   "source": [
+    "## input vars"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d9533a4c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dirname=\"data/list-policy_projects_2024-05-03\"  # directory to fine files to pickle\n",
+    "glob_pattern = \"*.parquet\"  # file name glob pattern to match, can be file name for individual file\n",
+    "line_regex_filter = \".*\"   # regex to match lines of interest in file\n",
+    "pickledir=f\"{dirname}/parquet\"\n",
+    "reportdir=f\"{dirname}/reports\"\n",
+    "tldpath=\"/data/project\"\n",
+    "\n",
+    "verbose = True\n",
+    "limit = 0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a28d0f15",
+   "metadata": {},
+   "source": [
+    "## Utilities"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ed367712",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get top level dir on which to aggregate\n",
+    "\n",
+    "def get_tld(df, dirname):\n",
+    "    '''\n",
+    "    df: dataframe with path column (e.g. from policy run)\n",
+    "    dirname: top level dir (TLD) that contains dirs for report\n",
+    "    \n",
+    "    The function uses the length of dirname to locate the TLD column in the split path.\n",
+    "    '''\n",
+    "    dirpaths = dirname.split(\"/\")\n",
+    "    new=df[\"path\"].str.split(\"/\", n=len(dirpaths)+1, expand=True)\n",
+    "    #df=df.assign(tld=new[len(dirpaths)])\n",
+    "    #df[\"tld\"] = new[len(dirpaths)]\n",
+    "   \n",
+    "    return new[len(dirpaths)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a057a9ec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get top level dir on which to aggregate\n",
+    "\n",
+    "def get_year(df, column):\n",
+    "    '''\n",
+    "    df: dataframe with path column (e.g. from policy run)\n",
+    "    dirname: top level dir (TLD) that contains dirs for report\n",
+    "    \n",
+    "    The function uses the length of dirname to locate the TLD column in the split path.\n",
+    "    '''\n",
+    "    new = df[column].dt.year\n",
+    "    #dirpaths = dirname.split(\"/\")\n",
+    "    #new=df[\"path\"].str.split(\"/\", n=len(dirpaths)+1, expand=True)\n",
+    "    #df=df.assign(tld=new[len(dirpaths)])\n",
+    "    #df[\"tld\"] = new[len(dirpaths)]\n",
+    "   \n",
+    "    return new"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7bc11b96",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def report_tld_year(df):\n",
+    "    '''\n",
+    "    Aggregate the sum and count of files by year in the top level dir (TLD)\n",
+    "    \n",
+    "    Uses dict parameter to pandas agg to apply sum and count function to size column\n",
+    "    '''\n",
+    "    report = df.groupby(['tld', df.access.dt.year]).agg({\"size\": [\"sum\", \"count\"]})\n",
+    "    return report"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dd92dd03",
+   "metadata": {},
+   "source": [
+    "## Read and parse the files according to glob_pattern"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cdc4558d",
+   "metadata": {},
+   "source": [
+    "dask.config.set(scheduler='threads')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7b970eea",
+   "metadata": {},
+   "source": [
+    "dask.config.set(scheduler='processes')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1c7fd35d",
+   "metadata": {},
+   "source": [
+    "df = dd.read_parquet(f'{pickledir}/list-1*.parquet', engine=\"pyarrow\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "389070cb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = dd.read_parquet(f'{pickledir}/list-*.parquet', columns=['size', 'access', 'modify', 'uid', 'path'], engine=\"pyarrow\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b1336579",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f9d8f57c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = client.persist(df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "979a78d0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.dask"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9cf39cd6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "df=df.repartition(partition_size=\"64MB\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "24b4e62d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f6b85535",
+   "metadata": {},
+   "source": [
+    "%%time\n",
+    "\n",
+    "df.map_partitions(len).compute()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "44a7005a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.dask"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4352f00c",
+   "metadata": {},
+   "source": [
+    "## Aggregate stats into running totals"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d1a345c7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1=get_tld(df, tldpath)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1da65bcc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1.dask"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4f4ac4d1",
+   "metadata": {},
+   "source": [
+    "%%time\n",
+    "\n",
+    "with ProgressBar():\n",
+    "    display(df1.head())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7706bcf8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = df.assign(tld=df1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11e5c92a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.dask"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "10590450",
+   "metadata": {},
+   "source": [
+    "df = df.drop(columns=\"path\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "70213716",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1 = get_year(df, \"access\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3d777440",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = df.assign(year=df1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ddf01b3d",
+   "metadata": {},
+   "source": [
+    "df = df.drop(columns=[\"uid\",\"access\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "84d184c2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d86e507e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.dask"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "db76d5f4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = client.persist(df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "471d5b18",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.dask"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d414a9ae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5a859189",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def ls_path(df, path):\n",
+    "    tmp = df[df.path.str.match(path)]\n",
+    "    tmp = tmp.assign(tld=get_tld(tmp, path))\n",
+    "    \n",
+    "    return tmp"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c1e5fecb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def du_by_year(df, path, time=\"access\"):\n",
+    "    tmp = df[df.path.str.match(path)]\n",
+    "\n",
+    "    tmp = tmp.assign(tld=get_tld(tmp, path))\n",
+    "    \n",
+    "    tmp = tmp.assign(year=get_year(tmp, time))\n",
+    "    \n",
+    "    tmp = tmp.drop(columns=[\"uid\", \"access\", \"path\"])\n",
+    "    tmp = client.persist(tmp)\n",
+    "    \n",
+    "    tmp = tmp.groupby(['tld', 'year']).sum()\n",
+    "    \n",
+    "    tmp = tmp.assign(terabytes=tmp[\"size\"]/(10**12))\n",
+    "    \n",
+    "    return tmp\n",
+    "                    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "16ef8689",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def du_by_year(df, path, time=\"access\"):\n",
+    "    tmp = df[df.path.str.match(path)]\n",
+    "\n",
+    "    tmp = tmp.assign(tld=get_tld(tmp, path))\n",
+    "    \n",
+    "    tmp = tmp.assign(year=get_year(tmp, time))\n",
+    "    \n",
+    "    tmp = tmp.drop(columns=[\"uid\", time, \"path\"])\n",
+    "    tmp = client.persist(tmp)\n",
+    "    \n",
+    "    tmp = tmp.groupby(['tld', 'year']).sum()\n",
+    "    \n",
+    "    tmp = tmp.assign(terabytes=tmp[\"size\"]/(10**12))\n",
+    "    \n",
+    "    return tmp"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3de3cb54",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def du_by_tld(df, path, time=\"access\"):\n",
+    "    tmp = df[df.path.str.match(path)]\n",
+    "\n",
+    "    \n",
+    "    tmp = tmp.assign(tld=get_tld(tmp, path))\n",
+    "    \n",
+    "    #tmp = tmp.assign(year=get_year(tmp, time))\n",
+    "    \n",
+    "    tmp = tmp.drop(columns=[\"uid\", \"access\", \"path\", \"year\"])\n",
+    "    tmp = client.persist(tmp)\n",
+    "    tmp = tmp.groupby(['tld']).sum()\n",
+    "    \n",
+    "    tmp = tmp.assign(terabytes=tmp[\"size\"]/(10**12))\n",
+    "    \n",
+    "    return tmp"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b0ca1d03",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.dask"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6e7151f6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.dask"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f1c75727",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%time\n",
+    "\n",
+    "dudf = du_by_year(df, '/data/project/ccts', \"modify\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6f9639f1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dudf.dask"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "489989ef",
+   "metadata": {},
+   "source": [
+    "%time\n",
+    "\n",
+    "dudf = du_by_tld(df, '/data/project/ccts')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "312cfb3f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "dudf = client.persist(dudf)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "08783886",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dudf.dask"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b8686b92",
+   "metadata": {},
+   "source": [
+    "%%time\n",
+    "\n",
+    "dudf=dudf.repartition(partition_size=\"64MB\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "932cb3e0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dudf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "55e73440",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "dudf = client.compute(dudf)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a095bd71",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "dudf = dudf.result()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "79a0378d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "dudf.sort_values([\"tld\", \"year\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "89abcc33",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "tmp=dudf.reset_index()\n",
+    "#tmp[(tmp['tld']==\"galaxy\")]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5642e1d5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tmp[tmp.tld=='galaxy'].sort_values('year')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "134dcdb6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "dudf.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a801a5ce",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "dudf.groupby(\"tld\").sum()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9379b396",
+   "metadata": {},
+   "source": [
+    "%%time\n",
+    "\n",
+    "lsdf = ls_path(df, '/data/project/ccts/galaxy')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "87b04252",
+   "metadata": {},
+   "source": [
+    "%%time\n",
+    "\n",
+    "lsdf = client.persist(lsdf.tld.unique())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5599d717",
+   "metadata": {},
+   "source": [
+    "lsdf.dask"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d8d8e827",
+   "metadata": {},
+   "source": [
+    "%%time\n",
+    "\n",
+    "lsdf = client.compute(lsdf)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e6acbccb",
+   "metadata": {},
+   "source": [
+    "lsdf.result()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ce4ba931",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.dask"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1a7ed7af",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "dfccts = df[df.path.str.match('/data/project/ccts')]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cf656e76",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfccts.dask"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ffbd1abc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfccts = dfccts.assign(tld=get_tld(dfccts, '/data/project/ccts/galaxy'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f3748708",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfccts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c741dfdb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfccts.dask"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7045b7e9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "dfccts.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "20beeb75",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1 = df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eebf3e8f",
+   "metadata": {},
+   "source": [
+    "lru_projects = ['ICOS', 'boldlab', 'hartmanlab', 'sdtrlab', 'kinglab', 'kobielab', 'MRIPhantom', 'NCRlab', 'bridgeslab', 'hsight', 'kutschlab', 'lcdl', 'metalsgroup', 'rowelab', 'szaflarski_mirman']\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9b5d64a0",
+   "metadata": {},
+   "source": [
+    "condition=df1[\"tld\"].isin(lru_projects)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ccca62ff",
+   "metadata": {},
+   "source": [
+    "condition=df1[\"tld\"].isin([\"ccts\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "55aa52c5",
+   "metadata": {},
+   "source": [
+    "lru=df1[condition]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b9e1df01",
+   "metadata": {},
+   "source": [
+    "%%time\n",
+    "\n",
+    "with ProgressBar():\n",
+    "    display(lru.head())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "31205ddd",
+   "metadata": {},
+   "source": [
+    "df.groupby(['tld', 'year']).size.sum.visualize(node_attr={'penwidth': '6'})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b49da8a3",
+   "metadata": {},
+   "source": [
+    "df2 = df.groupby(['tld', 'year']).agg({\"size\": [\"sum\", \"count\"]})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dca64098",
+   "metadata": {},
+   "source": [
+    "df.groupby('name').x.mean().visualize(node_attr={'penwidth': '6'})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9de95bf7",
+   "metadata": {},
+   "source": [
+    "%%time\n",
+    "\n",
+    "df2 = report_tld_year(lru)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4c492742",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ded14edd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time \n",
+    "\n",
+    "df2 = df.groupby(['tld', 'year']).sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "113ba27b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8e87713f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2.dask"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8e65585b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tbsize = df2[\"size\"]/(10**12)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a4dbce6b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2 = df2.assign(terrabytes=tbsize)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "60557552",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5056dd46",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2.dask"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a57c5033",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report=df2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b7f78f66",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "report = client.compute(report)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "79a424a9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "04a25511",
+   "metadata": {},
+   "source": [
+    "## Create final report\n",
+    "\n",
+    "Create summary format for gigabyte and terabyte columns https://stackoverflow.com/a/20937592/8928529"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9ccfac4b",
+   "metadata": {},
+   "source": [
+    "report[\"average_size\"] = report[\"sum\"]/report[\"count\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "708a62bf",
+   "metadata": {},
+   "source": [
+    "report[\"terabytes\"] = report[\"sum\"]/(10**12)\n",
+    "report[\"terabytes\"] = report[\"terabytes\"].map('{:,.2f}'.format)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "88915adb",
+   "metadata": {},
+   "source": [
+    "report[\"gigabytes\"] = report[\"sum\"]/(10**9)\n",
+    "report[\"gigabytes\"] = report[\"gigabytes\"].map('{:,.2f}'.format)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b5472320",
+   "metadata": {},
+   "source": [
+    "## Save report as CSV"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "23092d7e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "report = report.result()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dc748cf4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ffc99a54",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# only create dir if there is data to pickle\n",
+    "if (len(report) and not os.path.isdir(reportdir)):\n",
+    "    os.mkdir(reportdir)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9d0ec8cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "reportdir"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a4e836a3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "if (verbose): print(f\"report: groupby-tld\")\n",
+    "report.to_csv(f\"{reportdir}/groupby-tld-dask3.csv.gz\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "12d02352",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "report.to_parquet(f\"{reportdir}/groupby-tld-year-dask4.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7cbec7b4",
+   "metadata": {},
+   "source": [
+    "## Summarize high-level stats"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "600650db",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "06f17bb3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report.reset_index()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ab4090bd",
+   "metadata": {},
+   "source": [
+    "report[report[\"sum\"] == report[\"sum\"].max()]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6d2f464a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report[(report[\"size\"] > 5*10**13)]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4c7358c1",
+   "metadata": {},
+   "source": [
+    "report=report.reset_index()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f42f3b99",
+   "metadata": {},
+   "source": [
+    "summer = report.groupby(\"tld\").agg(\"sum\", \"sum\") #[report[\"sum\"] > 10**13"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9f1f801b",
+   "metadata": {},
+   "source": [
+    "summer[\"terabytes\"] = summer[\"sum\"]/(10**12)\n",
+    "summer[\"terabytes\"] = summer[\"terabytes\"].map('{:,.2f}'.format)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f27fc339",
+   "metadata": {},
+   "source": [
+    "print(summer[summer[\"sum\"] > 10**13].sort_values(\"sum\", ascending=False)[['count', 'terabytes']])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2b45d06d",
+   "metadata": {},
+   "source": [
+    "report[(report[\"sum\"] > 10**13) & (report[\"access\"] <= 2021)]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7ef5dea0",
+   "metadata": {},
+   "source": [
+    "report[(report[\"sum\"] > 10**13) & (report[\"access\"] <= 2021)][\"sum\"].sum()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d2d3afd3",
+   "metadata": {},
+   "source": [
+    "report[(report[\"sum\"] <= 10**13) & (report[\"access\"] <= 2021)][\"sum\"].sum()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bc80d326",
+   "metadata": {},
+   "source": [
+    "report[(report[\"sum\"] > 10**13) & (report[\"access\"] < 2023)][\"sum\"].sum()/10**12"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/dask-mpi-validate-galaxy-tar-data-2024-05-03.ipynb b/dask-mpi-validate-galaxy-tar-data-2024-05-03.ipynb
new file mode 100644
index 0000000..9e12565
--- /dev/null
+++ b/dask-mpi-validate-galaxy-tar-data-2024-05-03.ipynb
@@ -0,0 +1,1581 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "5fb66d11",
+   "metadata": {},
+   "source": [
+    "# run report on pickled list policy data\n",
+    "\n",
+    "The script reads pickled files that match the `glob_pattern` from the `pickledir` derived from `dirname` and runs the report saving it as a csv to the subdir \"`dirname`/reports\" dir by default.\n",
+    "\n",
+    "Some progress info is available via the `verbose` flag.\n",
+    "\n",
+    "The current report aggrates storage stats by top-level-dir and age (year) of data's last access. The goal of this report is to understand the distribution of lesser used data."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "51c07f66",
+   "metadata": {},
+   "source": [
+    "!conda info --envs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "15997b7d",
+   "metadata": {},
+   "source": [
+    "!conda list"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c740ad5f",
+   "metadata": {},
+   "source": [
+    "!pip list -freeze"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5059337b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import datetime\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "from urllib.parse import unquote\n",
+    "import sys\n",
+    "import os\n",
+    "import pathlib\n",
+    "import re\n",
+    "import dask.dataframe as dd\n",
+    "import dask"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2beaec9e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dask.diagnostics import ProgressBar"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0d8afdae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dask.distributed import Client"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "81b2e176",
+   "metadata": {},
+   "source": [
+    "Client(scheduler_file='scheduler.json')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "514ecfc1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = Client(scheduler_file='scheduler.json')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b17e817d",
+   "metadata": {},
+   "source": [
+    "\n",
+    "client = Client()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5a2cdaa6",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5f4c10d1",
+   "metadata": {},
+   "source": [
+    "## input vars"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d9533a4c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dirname=\"data/list-policy_projects_2023-08-31\"  # directory to fine files to pickle\n",
+    "glob_pattern = \"*.parquet\"  # file name glob pattern to match, can be file name for individual file\n",
+    "line_regex_filter = \".*\"   # regex to match lines of interest in file\n",
+    "pickledir=f\"{dirname}/parquet\"\n",
+    "reportdir=f\"{dirname}/reports\"\n",
+    "tldpath=\"/data/project/ccts/galaxy\"\n",
+    "\n",
+    "verbose = True\n",
+    "limit = 0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a28d0f15",
+   "metadata": {},
+   "source": [
+    "## Utilities"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ed367712",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get top level dir on which to aggregate\n",
+    "\n",
+    "def get_tld(df, dirname):\n",
+    "    '''\n",
+    "    df: dataframe with path column (e.g. from policy run)\n",
+    "    dirname: top level dir (TLD) that contains dirs for report\n",
+    "    \n",
+    "    The function uses the length of dirname to locate the TLD column in the split path.\n",
+    "    '''\n",
+    "    dirpaths = dirname.split(\"/\")\n",
+    "    new=df[\"path\"].str.split(\"/\", n=len(dirpaths)+1, expand=True)\n",
+    "    #df=df.assign(tld=new[len(dirpaths)])\n",
+    "    #df[\"tld\"] = new[len(dirpaths)]\n",
+    "   \n",
+    "    return new[len(dirpaths)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a057a9ec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get top level dir on which to aggregate\n",
+    "\n",
+    "def get_year(df, column):\n",
+    "    '''\n",
+    "    df: dataframe with path column (e.g. from policy run)\n",
+    "    dirname: top level dir (TLD) that contains dirs for report\n",
+    "    \n",
+    "    The function uses the length of dirname to locate the TLD column in the split path.\n",
+    "    '''\n",
+    "    new = df[column].dt.year\n",
+    "    #dirpaths = dirname.split(\"/\")\n",
+    "    #new=df[\"path\"].str.split(\"/\", n=len(dirpaths)+1, expand=True)\n",
+    "    #df=df.assign(tld=new[len(dirpaths)])\n",
+    "    #df[\"tld\"] = new[len(dirpaths)]\n",
+    "   \n",
+    "    return new"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7bc11b96",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def report_tld_year(df):\n",
+    "    '''\n",
+    "    Aggregate the sum and count of files by year in the top level dir (TLD)\n",
+    "    \n",
+    "    Uses dict parameter to pandas agg to apply sum and count function to size column\n",
+    "    '''\n",
+    "    report = df.groupby(['tld', df.access.dt.year]).agg({\"size\": [\"sum\", \"count\"]})\n",
+    "    return report"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dd92dd03",
+   "metadata": {},
+   "source": [
+    "## Read and parse the files according to glob_pattern"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cdc4558d",
+   "metadata": {},
+   "source": [
+    "dask.config.set(scheduler='threads')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7b970eea",
+   "metadata": {},
+   "source": [
+    "dask.config.set(scheduler='processes')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f95ccab4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def read_policy_parquet(file, columns=['size', 'access', 'modify', 'uid', 'path'], engine=\"pyarrow\"):\n",
+    "    \n",
+    "    df = dd.read_parquet(file, columns=columns, engine=engine)\n",
+    "    \n",
+    "    df = client.persist(df)\n",
+    "    \n",
+    "    df=df.repartition(partition_size=\"64MB\")\n",
+    "\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8bfe3f88",
+   "metadata": {},
+   "source": [
+    "df = dd.read_parquet(f'{pickledir}/list-*.parquet', columns=['size', 'access', 'modify', 'uid', 'path'], engine=\"pyarrow\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "060e04f9",
+   "metadata": {},
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bd5bf01b",
+   "metadata": {},
+   "source": [
+    "df = client.persist(df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "480ed8e1",
+   "metadata": {},
+   "source": [
+    "df.dask"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f0f072d6",
+   "metadata": {},
+   "source": [
+    "%%time\n",
+    "\n",
+    "df=df.repartition(partition_size=\"64MB\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0a5f7d5b",
+   "metadata": {},
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f6b85535",
+   "metadata": {},
+   "source": [
+    "%%time\n",
+    "\n",
+    "df.map_partitions(len).compute()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "afe68a9c",
+   "metadata": {},
+   "source": [
+    "df.dask"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "97e29f86",
+   "metadata": {},
+   "source": [
+    "df = df[df.path.str.startswith(\"/data/project/ccts/galaxy/\")]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c5588ef0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "maydf = read_policy_parquet(\"data/list-policy_projects_2024-05-03/parquet\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f039d3d1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "maydf = maydf[maydf.path.str.startswith(\"/data/project/ccts/galaxy/\")]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "74c4102b",
+   "metadata": {},
+   "source": [
+    "%%time\n",
+    "\n",
+    "maydf = maydf.set_index(maydf.path, npartitions=\"auto\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "082b6af9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "augdf = read_policy_parquet(\"data/list-policy_projects_2023-08-31/parquet\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4e90c800",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "augdf = augdf[augdf.path.str.startswith(\"/data/project/ccts/galaxy/\")]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a00b148a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "augdf=augdf.repartition(partition_size=\"64MB\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d0f9c5c4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "maydf=maydf.repartition(partition_size=\"64MB\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "00a50ff5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "len(augdf)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f0196721",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "len(maydf)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "49511507",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "augdf = augdf.set_index(augdf.path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "173b6b00",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "maydf = maydf.set_index(maydf.path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9da71216",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "joindf = maydf.join(augdf, how=\"outer\", lsuffix=\"_may\", rsuffix=\"_aug\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "30e7d44a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "joindf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "29880891",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "joindf.dask"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8274f65b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "len(joindf)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7ba51037",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "len(joindf[joindf.modify_aug.isna()])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "250824be",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "len(joindf[joindf.modify_may.isna()])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1890c403",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "modify_comp = joindf.modify_may != joindf.modify_aug"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "76d79094",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "len(joindf[modify_comp])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "803a863c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "len(joindf[joindf.size_may != joindf.size_aug])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6ef39703",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "len(joindf[joindf.size_may == joindf.size_aug])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1b9953a9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "len(joindf[joindf.uid_may != joindf.uid_aug])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cda237a8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "len(joindf[joindf.access_may != joindf.access_aug])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5cc7f6c7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "len(joindf[joindf.access_may == joindf.access_aug])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0467d4df",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "stop"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4352f00c",
+   "metadata": {},
+   "source": [
+    "## Aggregate stats into running totals"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d1a345c7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1=get_tld(df, tldpath)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1da65bcc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1.dask"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4f4ac4d1",
+   "metadata": {},
+   "source": [
+    "%%time\n",
+    "\n",
+    "with ProgressBar():\n",
+    "    display(df1.head())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7706bcf8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = df.assign(tld=df1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11e5c92a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.dask"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "10590450",
+   "metadata": {},
+   "source": [
+    "df = df.drop(columns=\"path\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "70213716",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1 = get_year(df, \"access\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3d777440",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = df.assign(year=df1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ddf01b3d",
+   "metadata": {},
+   "source": [
+    "df = df.drop(columns=[\"uid\",\"access\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "84d184c2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d86e507e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.dask"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "db76d5f4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = client.persist(df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "471d5b18",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.dask"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d414a9ae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5a859189",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def ls_path(df, path):\n",
+    "    tmp = df[df.path.str.match(path)]\n",
+    "    tmp = tmp.assign(tld=get_tld(tmp, path))\n",
+    "    \n",
+    "    return tmp"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c1e5fecb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def du_by_year(df, path, time=\"access\"):\n",
+    "    tmp = df[df.path.str.match(path)]\n",
+    "\n",
+    "    tmp = tmp.assign(tld=get_tld(tmp, path))\n",
+    "    \n",
+    "    tmp = tmp.assign(year=get_year(tmp, time))\n",
+    "    \n",
+    "    tmp = tmp.drop(columns=[\"uid\", \"access\", \"path\"])\n",
+    "    tmp = client.persist(tmp)\n",
+    "    \n",
+    "    tmp = tmp.groupby(['tld', 'year']).sum()\n",
+    "    \n",
+    "    tmp = tmp.assign(terabytes=tmp[\"size\"]/(10**12))\n",
+    "    \n",
+    "    return tmp\n",
+    "                    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "16ef8689",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def du_by_year(df, path, time=\"access\"):\n",
+    "    tmp = df[df.path.str.match(path)]\n",
+    "\n",
+    "    tmp = tmp.assign(tld=get_tld(tmp, path))\n",
+    "    \n",
+    "    tmp = tmp.assign(year=get_year(tmp, time))\n",
+    "    \n",
+    "    tmp = tmp.drop(columns=[\"uid\", time, \"path\"])\n",
+    "    tmp = client.persist(tmp)\n",
+    "    \n",
+    "    tmp = tmp.groupby(['tld', 'year']).sum()\n",
+    "    \n",
+    "    tmp = tmp.assign(terabytes=tmp[\"size\"]/(10**12))\n",
+    "    \n",
+    "    return tmp"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3de3cb54",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def du_by_tld(df, path, time=\"access\"):\n",
+    "    tmp = df[df.path.str.match(path)]\n",
+    "\n",
+    "    \n",
+    "    tmp = tmp.assign(tld=get_tld(tmp, path))\n",
+    "    \n",
+    "    #tmp = tmp.assign(year=get_year(tmp, time))\n",
+    "    \n",
+    "    tmp = tmp.drop(columns=[\"uid\", \"access\", \"path\", \"year\"])\n",
+    "    tmp = client.persist(tmp)\n",
+    "    tmp = tmp.groupby(['tld']).sum()\n",
+    "    \n",
+    "    tmp = tmp.assign(terabytes=tmp[\"size\"]/(10**12))\n",
+    "    \n",
+    "    return tmp"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b0ca1d03",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.dask"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6e7151f6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.dask"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f1c75727",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%time\n",
+    "\n",
+    "dudf = du_by_year(df, '/data/project/ccts', \"modify\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6f9639f1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dudf.dask"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "489989ef",
+   "metadata": {},
+   "source": [
+    "%time\n",
+    "\n",
+    "dudf = du_by_tld(df, '/data/project/ccts')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "312cfb3f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "dudf = client.persist(dudf)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "08783886",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dudf.dask"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b8686b92",
+   "metadata": {},
+   "source": [
+    "%%time\n",
+    "\n",
+    "dudf=dudf.repartition(partition_size=\"64MB\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "932cb3e0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dudf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "55e73440",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "dudf = client.compute(dudf)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a095bd71",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "dudf = dudf.result()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "79a0378d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "dudf.sort_values([\"tld\", \"year\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "89abcc33",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "tmp=dudf.reset_index()\n",
+    "#tmp[(tmp['tld']==\"galaxy\")]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5642e1d5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tmp[tmp.tld=='galaxy'].sort_values('year')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "134dcdb6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "dudf.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a801a5ce",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "dudf.groupby(\"tld\").sum()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9379b396",
+   "metadata": {},
+   "source": [
+    "%%time\n",
+    "\n",
+    "lsdf = ls_path(df, '/data/project/ccts/galaxy')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "87b04252",
+   "metadata": {},
+   "source": [
+    "%%time\n",
+    "\n",
+    "lsdf = client.persist(lsdf.tld.unique())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5599d717",
+   "metadata": {},
+   "source": [
+    "lsdf.dask"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d8d8e827",
+   "metadata": {},
+   "source": [
+    "%%time\n",
+    "\n",
+    "lsdf = client.compute(lsdf)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e6acbccb",
+   "metadata": {},
+   "source": [
+    "lsdf.result()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ce4ba931",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.dask"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1a7ed7af",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "dfccts = df[df.path.str.match('/data/project/ccts')]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cf656e76",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfccts.dask"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ffbd1abc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfccts = dfccts.assign(tld=get_tld(dfccts, '/data/project/ccts/galaxy'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f3748708",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfccts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c741dfdb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dfccts.dask"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7045b7e9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "dfccts.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "20beeb75",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1 = df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eebf3e8f",
+   "metadata": {},
+   "source": [
+    "lru_projects = ['ICOS', 'boldlab', 'hartmanlab', 'sdtrlab', 'kinglab', 'kobielab', 'MRIPhantom', 'NCRlab', 'bridgeslab', 'hsight', 'kutschlab', 'lcdl', 'metalsgroup', 'rowelab', 'szaflarski_mirman']\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9b5d64a0",
+   "metadata": {},
+   "source": [
+    "condition=df1[\"tld\"].isin(lru_projects)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ccca62ff",
+   "metadata": {},
+   "source": [
+    "condition=df1[\"tld\"].isin([\"ccts\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "55aa52c5",
+   "metadata": {},
+   "source": [
+    "lru=df1[condition]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b9e1df01",
+   "metadata": {},
+   "source": [
+    "%%time\n",
+    "\n",
+    "with ProgressBar():\n",
+    "    display(lru.head())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "31205ddd",
+   "metadata": {},
+   "source": [
+    "df.groupby(['tld', 'year']).size.sum.visualize(node_attr={'penwidth': '6'})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b49da8a3",
+   "metadata": {},
+   "source": [
+    "df2 = df.groupby(['tld', 'year']).agg({\"size\": [\"sum\", \"count\"]})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dca64098",
+   "metadata": {},
+   "source": [
+    "df.groupby('name').x.mean().visualize(node_attr={'penwidth': '6'})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9de95bf7",
+   "metadata": {},
+   "source": [
+    "%%time\n",
+    "\n",
+    "df2 = report_tld_year(lru)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4c492742",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ded14edd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time \n",
+    "\n",
+    "df2 = df.groupby(['tld', 'year']).sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "113ba27b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8e87713f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2.dask"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8e65585b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tbsize = df2[\"size\"]/(10**12)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a4dbce6b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2 = df2.assign(terrabytes=tbsize)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "60557552",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5056dd46",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2.dask"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a57c5033",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report=df2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b7f78f66",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "report = client.compute(report)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "79a424a9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "04a25511",
+   "metadata": {},
+   "source": [
+    "## Create final report\n",
+    "\n",
+    "Create summary format for gigabyte and terabyte columns https://stackoverflow.com/a/20937592/8928529"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9ccfac4b",
+   "metadata": {},
+   "source": [
+    "report[\"average_size\"] = report[\"sum\"]/report[\"count\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "708a62bf",
+   "metadata": {},
+   "source": [
+    "report[\"terabytes\"] = report[\"sum\"]/(10**12)\n",
+    "report[\"terabytes\"] = report[\"terabytes\"].map('{:,.2f}'.format)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "88915adb",
+   "metadata": {},
+   "source": [
+    "report[\"gigabytes\"] = report[\"sum\"]/(10**9)\n",
+    "report[\"gigabytes\"] = report[\"gigabytes\"].map('{:,.2f}'.format)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b5472320",
+   "metadata": {},
+   "source": [
+    "## Save report as CSV"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "23092d7e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "report = report.result()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dc748cf4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ffc99a54",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# only create dir if there is data to pickle\n",
+    "if (len(report) and not os.path.isdir(reportdir)):\n",
+    "    os.mkdir(reportdir)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9d0ec8cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "reportdir"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a4e836a3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "if (verbose): print(f\"report: groupby-tld\")\n",
+    "report.to_csv(f\"{reportdir}/groupby-tld-dask3.csv.gz\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "12d02352",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "\n",
+    "report.to_parquet(f\"{reportdir}/groupby-tld-year-dask4.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7cbec7b4",
+   "metadata": {},
+   "source": [
+    "## Summarize high-level stats"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "600650db",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "06f17bb3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report.reset_index()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ab4090bd",
+   "metadata": {},
+   "source": [
+    "report[report[\"sum\"] == report[\"sum\"].max()]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6d2f464a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report[(report[\"size\"] > 5*10**13)]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4c7358c1",
+   "metadata": {},
+   "source": [
+    "report=report.reset_index()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f42f3b99",
+   "metadata": {},
+   "source": [
+    "summer = report.groupby(\"tld\").agg(\"sum\", \"sum\") #[report[\"sum\"] > 10**13"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9f1f801b",
+   "metadata": {},
+   "source": [
+    "summer[\"terabytes\"] = summer[\"sum\"]/(10**12)\n",
+    "summer[\"terabytes\"] = summer[\"terabytes\"].map('{:,.2f}'.format)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f27fc339",
+   "metadata": {},
+   "source": [
+    "print(summer[summer[\"sum\"] > 10**13].sort_values(\"sum\", ascending=False)[['count', 'terabytes']])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2b45d06d",
+   "metadata": {},
+   "source": [
+    "report[(report[\"sum\"] > 10**13) & (report[\"access\"] <= 2021)]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7ef5dea0",
+   "metadata": {},
+   "source": [
+    "report[(report[\"sum\"] > 10**13) & (report[\"access\"] <= 2021)][\"sum\"].sum()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d2d3afd3",
+   "metadata": {},
+   "source": [
+    "report[(report[\"sum\"] <= 10**13) & (report[\"access\"] <= 2021)][\"sum\"].sum()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bc80d326",
+   "metadata": {},
+   "source": [
+    "report[(report[\"sum\"] > 10**13) & (report[\"access\"] < 2023)][\"sum\"].sum()/10**12"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
-- 
GitLab