{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "5fb66d11",
   "metadata": {},
   "source": [
    "# run report on pickled list policy data\n",
    "\n",
    "The script reads pickled files that match `glob_pattern` from the `pickledir` derived from `dirname`, runs the report, and saves it as a CSV to the \"`dirname`/reports\" subdirectory by default.\n",
    "\n",
    "Some progress info is available via the `verbose` flag.\n",
    "\n",
    "The current report aggregates storage stats by top-level dir and by the year of the data's last access. The goal of this report is to understand the distribution of lesser-used data."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "51c07f66",
   "metadata": {},
   "source": [
    "!conda info --envs"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "15997b7d",
   "metadata": {},
   "source": [
    "!conda list"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c740ad5f",
   "metadata": {},
   "source": [
    "!pip list --format=freeze"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5059337b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from urllib.parse import unquote\n",
    "import sys\n",
    "import os\n",
    "import pathlib\n",
    "import re\n",
    "import dask.dataframe as dd\n",
    "import dask"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2beaec9e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from dask.diagnostics import ProgressBar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d8afdae",
   "metadata": {},
   "outputs": [],
   "source": [
    "from dask.distributed import Client"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "81b2e176",
   "metadata": {},
   "source": [
    "Client(scheduler_file='scheduler.json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "514ecfc1",
   "metadata": {},
   "outputs": [],
   "source": [
    "client = Client(scheduler_file='scheduler.json')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b17e817d",
   "metadata": {},
   "source": [
    "client = Client()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a2cdaa6",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "5f4c10d1",
   "metadata": {},
   "source": [
    "## input vars"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d9533a4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "dirname=\"data/list-policy_projects_2023-08-31\"  # directory to find files to pickle\n",
    "glob_pattern = \"*.parquet\"  # file name glob pattern to match, can be file name for individual file\n",
    "line_regex_filter = \".*\"   # regex to match lines of interest in file\n",
    "pickledir=f\"{dirname}/parquet\"\n",
    "reportdir=f\"{dirname}/reports\"\n",
    "tldpath=\"/data/project/ccts/galaxy\"\n",
    "\n",
    "verbose = True\n",
    "limit = 0"
   ]
  },
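  {
   "cell_type": "markdown",
   "id": "b3f1a2c9",
   "metadata": {},
   "source": [
    "A minimal sketch of loading the matched files into a dask dataframe from the vars above (assumes the parquet files already exist under `pickledir`; the variable name `ddf` is illustrative):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c4d2e3f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# sketch: lazily read all files matching glob_pattern from pickledir\n",
    "ddf = dd.read_parquet(f\"{pickledir}/{glob_pattern}\")\n",
    "ddf.head()"
   ]
  },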
  {
   "cell_type": "markdown",
   "id": "a28d0f15",
   "metadata": {},
   "source": [
    "## Utilities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed367712",
   "metadata": {},
   "outputs": [],
   "source": [
    "# get top-level dir on which to aggregate\n",
    "\n",
    "def get_tld(df, dirname):\n",
    "    '''\n",
    "    df: dataframe with a path column (e.g. from a policy run)\n",
    "    dirname: top-level dir (TLD) that contains the dirs for the report\n",
    "\n",
    "    Uses the number of components in dirname to locate the TLD column\n",
    "    in the split path.\n",
    "    '''\n",
    "    dirpaths = dirname.split(\"/\")\n",
    "    new = df[\"path\"].str.split(\"/\", n=len(dirpaths) + 1, expand=True)\n",
    "    return new[len(dirpaths)]"
   ]
  },
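  {
   "cell_type": "markdown",
   "id": "7e8d9a1b",
   "metadata": {},
   "source": [
    "A quick sanity check of `get_tld` on a toy pandas DataFrame (the paths below are hypothetical; the pandas string accessor behaves the same way as dask's here):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0a1b2c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# get_tld returns the path component directly below dirname\n",
    "toy = pd.DataFrame({\"path\": [\"data/run/projA/f1.txt\", \"data/run/projB/sub/f2.txt\"]})\n",
    "get_tld(toy, \"data/run\")  # -> projA, projB"
   ]
  },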
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a057a9ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# get the year from a datetime column on which to aggregate\n",
    "\n",
    "def get_year(df, column):\n",
    "    '''\n",
    "    df: dataframe with a datetime column (e.g. from a policy run)\n",
    "    column: name of the datetime column from which to extract the year\n",
    "    '''\n",
    "    return df[column].dt.year"
   ]
  },
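  {
   "cell_type": "markdown",
   "id": "2c3d4e5f",
   "metadata": {},
   "source": [
    "Similarly, `get_year` on a toy frame with a datetime column (the dates below are illustrative):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6a7b8c9d",
   "metadata": {},
   "outputs": [],
   "source": [
    "toy = pd.DataFrame({\"access\": pd.to_datetime([\"2019-06-01\", \"2023-01-15\"])})\n",
    "get_year(toy, \"access\")  # -> 2019, 2023"
   ]
  },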
  {
   "cell_type": "code",
   "execution_count": null,