From 91e8f41205701fdd8eda4b4a054df271c03b0d4b Mon Sep 17 00:00:00 2001 From: John-Paul Robinson <jpr@uab.edu> Date: Tue, 30 Jul 2024 17:44:18 -0500 Subject: [PATCH] Add dask mpi notebooks to demonstrate validity checking using dask+parquet These notebooks use dataframes built from parquet files to sanity check file listings from multiple sources. --- ...r-of-last-access-projects-2024-05-03.ipynb | 1292 ++++++++++++++ ...-validate-galaxy-tar-data-2024-05-03.ipynb | 1581 +++++++++++++++++ 2 files changed, 2873 insertions(+) create mode 100644 dask-mpi-report-grouby-tld-year-of-last-access-projects-2024-05-03.ipynb create mode 100644 dask-mpi-validate-galaxy-tar-data-2024-05-03.ipynb diff --git a/dask-mpi-report-grouby-tld-year-of-last-access-projects-2024-05-03.ipynb b/dask-mpi-report-grouby-tld-year-of-last-access-projects-2024-05-03.ipynb new file mode 100644 index 0000000..ddd13ea --- /dev/null +++ b/dask-mpi-report-grouby-tld-year-of-last-access-projects-2024-05-03.ipynb @@ -0,0 +1,1292 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5fb66d11", + "metadata": {}, + "source": [ + "# run report on pickled list policy data\n", + "\n", + "The script reads pickled files that match the `glob_pattern` from the `pickledir` derived from `dirname` and runs the report saving it as a csv to the subdir \"`dirname`/reports\" dir by default.\n", + "\n", + "Some progress info is available via the `verbose` flag.\n", + "\n", + "The current report aggregates storage stats by top-level-dir and age (year) of data's last access. The goal of this report is to understand the distribution of lesser used data."
+ ] + }, + { + "cell_type": "markdown", + "id": "51c07f66", + "metadata": {}, + "source": [ + "!conda info --envs" + ] + }, + { + "cell_type": "markdown", + "id": "15997b7d", + "metadata": {}, + "source": [ + "!conda list" + ] + }, + { + "cell_type": "markdown", + "id": "c740ad5f", + "metadata": {}, + "source": [ + "!pip list --format=freeze" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5059337b", + "metadata": {}, + "outputs": [], + "source": [ + "import datetime\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from urllib.parse import unquote\n", + "import sys\n", + "import os\n", + "import pathlib\n", + "import re\n", + "import dask.dataframe as dd\n", + "import dask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2beaec9e", + "metadata": {}, + "outputs": [], + "source": [ + "from dask.diagnostics import ProgressBar" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d8afdae", + "metadata": {}, + "outputs": [], + "source": [ + "from dask.distributed import Client" + ] + }, + { + "cell_type": "markdown", + "id": "81b2e176", + "metadata": {}, + "source": [ + "Client(scheduler_file='scheduler.json')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "514ecfc1", + "metadata": {}, + "outputs": [], + "source": [ + "client = Client(scheduler_file='scheduler.json')" + ] + }, + { + "cell_type": "markdown", + "id": "b17e817d", + "metadata": {}, + "source": [ + "\n", + "client = Client()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a2cdaa6", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "5f4c10d1", + "metadata": {}, + "source": [ + "## input vars" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9533a4c", + "metadata": {}, + "outputs": [], + "source": [ + "dirname=\"data/list-policy_projects_2024-05-03\" # directory to find files to pickle\n", + "glob_pattern = 
\"*.parquet\" # file name glob pattern to match, can be file name for individual file\n", + "line_regex_filter = \".*\" # regex to match lines of interest in file\n", + "pickledir=f\"{dirname}/parquet\"\n", + "reportdir=f\"{dirname}/reports\"\n", + "tldpath=\"/data/project\"\n", + "\n", + "verbose = True\n", + "limit = 0" + ] + }, + { + "cell_type": "markdown", + "id": "a28d0f15", + "metadata": {}, + "source": [ + "## Utilities" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed367712", + "metadata": {}, + "outputs": [], + "source": [ + "# get top level dir on which to aggregate\n", + "\n", + "def get_tld(df, dirname):\n", + " '''\n", + " df: dataframe with path column (e.g. from policy run)\n", + " dirname: top level dir (TLD) that contains dirs for report\n", + " \n", + " The function uses the length of dirname to locate the TLD column in the split path.\n", + " '''\n", + " dirpaths = dirname.split(\"/\")\n", + " new=df[\"path\"].str.split(\"/\", n=len(dirpaths)+1, expand=True)\n", + " #df=df.assign(tld=new[len(dirpaths)])\n", + " #df[\"tld\"] = new[len(dirpaths)]\n", + " \n", + " return new[len(dirpaths)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a057a9ec", + "metadata": {}, + "outputs": [], + "source": [ + "# get year of a datetime column on which to aggregate\n", + "\n", + "def get_year(df, column):\n", + " '''\n", + " df: dataframe with a datetime column (e.g. 
from policy run)\n", + " column: name of the datetime column to take the year from\n", + " \n", + " The function uses the .dt accessor to return the year of each value in column.\n", + " '''\n", + " new = df[column].dt.year\n", + " #dirpaths = dirname.split(\"/\")\n", + " #new=df[\"path\"].str.split(\"/\", n=len(dirpaths)+1, expand=True)\n", + " #df=df.assign(tld=new[len(dirpaths)])\n", + " #df[\"tld\"] = new[len(dirpaths)]\n", + " \n", + " return new" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7bc11b96", + "metadata": {}, + "outputs": [], + "source": [ + "def report_tld_year(df):\n", + " '''\n", + " Aggregate the sum and count of files by year in the top level dir (TLD)\n", + " \n", + " Uses dict parameter to pandas agg to apply sum and count function to size column\n", + " '''\n", + " report = df.groupby(['tld', df.access.dt.year]).agg({\"size\": [\"sum\", \"count\"]})\n", + " return report" + ] + }, + { + "cell_type": "markdown", + "id": "dd92dd03", + "metadata": {}, + "source": [ + "## Read and parse the files according to glob_pattern" + ] + }, + { + "cell_type": "markdown", + "id": "cdc4558d", + "metadata": {}, + "source": [ + "dask.config.set(scheduler='threads')" + ] + }, + { + "cell_type": "markdown", + "id": "7b970eea", + "metadata": {}, + "source": [ + "dask.config.set(scheduler='processes')" + ] + }, + { + "cell_type": "markdown", + "id": "1c7fd35d", + "metadata": {}, + "source": [ + "df = dd.read_parquet(f'{pickledir}/list-1*.parquet', engine=\"pyarrow\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "389070cb", + "metadata": {}, + "outputs": [], + "source": [ + "df = dd.read_parquet(f'{pickledir}/list-*.parquet', columns=['size', 'access', 'modify', 'uid', 'path'], engine=\"pyarrow\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1336579", + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "f9d8f57c", + "metadata": {}, + "outputs": [], + "source": [ + "df = client.persist(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "979a78d0", + "metadata": {}, + "outputs": [], + "source": [ + "df.dask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9cf39cd6", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "df=df.repartition(partition_size=\"64MB\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24b4e62d", + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "id": "f6b85535", + "metadata": {}, + "source": [ + "%%time\n", + "\n", + "df.map_partitions(len).compute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44a7005a", + "metadata": {}, + "outputs": [], + "source": [ + "df.dask" + ] + }, + { + "cell_type": "markdown", + "id": "4352f00c", + "metadata": {}, + "source": [ + "## Aggregate stats into running totals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1a345c7", + "metadata": {}, + "outputs": [], + "source": [ + "df1=get_tld(df, tldpath)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1da65bcc", + "metadata": {}, + "outputs": [], + "source": [ + "df1.dask" + ] + }, + { + "cell_type": "markdown", + "id": "4f4ac4d1", + "metadata": {}, + "source": [ + "%%time\n", + "\n", + "with ProgressBar():\n", + " display(df1.head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7706bcf8", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.assign(tld=df1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11e5c92a", + "metadata": {}, + "outputs": [], + "source": [ + "df.dask" + ] + }, + { + "cell_type": "markdown", + "id": "10590450", + "metadata": {}, + "source": [ + "df = df.drop(columns=\"path\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70213716", + 
"metadata": {}, + "outputs": [], + "source": [ + "df1 = get_year(df, \"access\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d777440", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.assign(year=df1)" + ] + }, + { + "cell_type": "markdown", + "id": "ddf01b3d", + "metadata": {}, + "source": [ + "df = df.drop(columns=[\"uid\",\"access\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "84d184c2", + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d86e507e", + "metadata": {}, + "outputs": [], + "source": [ + "df.dask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db76d5f4", + "metadata": {}, + "outputs": [], + "source": [ + "df = client.persist(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "471d5b18", + "metadata": {}, + "outputs": [], + "source": [ + "df.dask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d414a9ae", + "metadata": {}, + "outputs": [], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a859189", + "metadata": {}, + "outputs": [], + "source": [ + "def ls_path(df, path):\n", + " tmp = df[df.path.str.match(path)]\n", + " tmp = tmp.assign(tld=get_tld(tmp, path))\n", + " \n", + " return tmp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1e5fecb", + "metadata": {}, + "outputs": [], + "source": [ + "def du_by_year(df, path, time=\"access\"):\n", + " tmp = df[df.path.str.match(path)]\n", + "\n", + " tmp = tmp.assign(tld=get_tld(tmp, path))\n", + " \n", + " tmp = tmp.assign(year=get_year(tmp, time))\n", + " \n", + " tmp = tmp.drop(columns=[\"uid\", \"access\", \"path\"])\n", + " tmp = client.persist(tmp)\n", + " \n", + " tmp = tmp.groupby(['tld', 'year']).sum()\n", + " \n", + " tmp = tmp.assign(terabytes=tmp[\"size\"]/(10**12))\n", + " \n", + " return tmp\n", + " 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16ef8689", + "metadata": {}, + "outputs": [], + "source": [ + "def du_by_year(df, path, time=\"access\"):\n", + " tmp = df[df.path.str.match(path)]\n", + "\n", + " tmp = tmp.assign(tld=get_tld(tmp, path))\n", + " \n", + " tmp = tmp.assign(year=get_year(tmp, time))\n", + " \n", + " tmp = tmp.drop(columns=[\"uid\", time, \"path\"])\n", + " tmp = client.persist(tmp)\n", + " \n", + " tmp = tmp.groupby(['tld', 'year']).sum()\n", + " \n", + " tmp = tmp.assign(terabytes=tmp[\"size\"]/(10**12))\n", + " \n", + " return tmp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3de3cb54", + "metadata": {}, + "outputs": [], + "source": [ + "def du_by_tld(df, path, time=\"access\"):\n", + " tmp = df[df.path.str.match(path)]\n", + "\n", + " \n", + " tmp = tmp.assign(tld=get_tld(tmp, path))\n", + " \n", + " #tmp = tmp.assign(year=get_year(tmp, time))\n", + " \n", + " tmp = tmp.drop(columns=[\"uid\", \"access\", \"path\", \"year\"])\n", + " tmp = client.persist(tmp)\n", + " tmp = tmp.groupby(['tld']).sum()\n", + " \n", + " tmp = tmp.assign(terabytes=tmp[\"size\"]/(10**12))\n", + " \n", + " return tmp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0ca1d03", + "metadata": {}, + "outputs": [], + "source": [ + "df.dask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e7151f6", + "metadata": {}, + "outputs": [], + "source": [ + "df.dask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1c75727", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "dudf = du_by_year(df, '/data/project/ccts', \"modify\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f9639f1", + "metadata": {}, + "outputs": [], + "source": [ + "dudf.dask" + ] + }, + { + "cell_type": "markdown", + "id": "489989ef", + "metadata": {}, + "source": [ + "%%time\n", + "\n", + "dudf = du_by_tld(df, 
'/data/project/ccts')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "312cfb3f", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "dudf = client.persist(dudf)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08783886", + "metadata": {}, + "outputs": [], + "source": [ + "dudf.dask" + ] + }, + { + "cell_type": "markdown", + "id": "b8686b92", + "metadata": {}, + "source": [ + "%%time\n", + "\n", + "dudf=dudf.repartition(partition_size=\"64MB\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "932cb3e0", + "metadata": {}, + "outputs": [], + "source": [ + "dudf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55e73440", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "dudf = client.compute(dudf)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a095bd71", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "dudf = dudf.result()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79a0378d", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "dudf.sort_values([\"tld\", \"year\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89abcc33", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "tmp=dudf.reset_index()\n", + "#tmp[(tmp['tld']==\"galaxy\")]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5642e1d5", + "metadata": {}, + "outputs": [], + "source": [ + "tmp[tmp.tld=='galaxy'].sort_values('year')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "134dcdb6", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "dudf.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a801a5ce", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "dudf.groupby(\"tld\").sum()" + ] + }, + { + 
"cell_type": "markdown", + "id": "9379b396", + "metadata": {}, + "source": [ + "%%time\n", + "\n", + "lsdf = ls_path(df, '/data/project/ccts/galaxy')" + ] + }, + { + "cell_type": "markdown", + "id": "87b04252", + "metadata": {}, + "source": [ + "%%time\n", + "\n", + "lsdf = client.persist(lsdf.tld.unique())" + ] + }, + { + "cell_type": "markdown", + "id": "5599d717", + "metadata": {}, + "source": [ + "lsdf.dask" + ] + }, + { + "cell_type": "markdown", + "id": "d8d8e827", + "metadata": {}, + "source": [ + "%%time\n", + "\n", + "lsdf = client.compute(lsdf)" + ] + }, + { + "cell_type": "markdown", + "id": "e6acbccb", + "metadata": {}, + "source": [ + "lsdf.result()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce4ba931", + "metadata": {}, + "outputs": [], + "source": [ + "df.dask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a7ed7af", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "dfccts = df[df.path.str.match('/data/project/ccts')]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf656e76", + "metadata": {}, + "outputs": [], + "source": [ + "dfccts.dask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffbd1abc", + "metadata": {}, + "outputs": [], + "source": [ + "dfccts = dfccts.assign(tld=get_tld(dfccts, '/data/project/ccts/galaxy'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3748708", + "metadata": {}, + "outputs": [], + "source": [ + "dfccts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c741dfdb", + "metadata": {}, + "outputs": [], + "source": [ + "dfccts.dask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7045b7e9", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "dfccts.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20beeb75", + "metadata": {}, + "outputs": [], + "source": [ + "df1 = df" + ] + 
}, + { + "cell_type": "markdown", + "id": "eebf3e8f", + "metadata": {}, + "source": [ + "lru_projects = ['ICOS', 'boldlab', 'hartmanlab', 'sdtrlab', 'kinglab', 'kobielab', 'MRIPhantom', 'NCRlab', 'bridgeslab', 'hsight', 'kutschlab', 'lcdl', 'metalsgroup', 'rowelab', 'szaflarski_mirman']\n" + ] + }, + { + "cell_type": "markdown", + "id": "9b5d64a0", + "metadata": {}, + "source": [ + "condition=df1[\"tld\"].isin(lru_projects)" + ] + }, + { + "cell_type": "markdown", + "id": "ccca62ff", + "metadata": {}, + "source": [ + "condition=df1[\"tld\"].isin([\"ccts\"])" + ] + }, + { + "cell_type": "markdown", + "id": "55aa52c5", + "metadata": {}, + "source": [ + "lru=df1[condition]" + ] + }, + { + "cell_type": "markdown", + "id": "b9e1df01", + "metadata": {}, + "source": [ + "%%time\n", + "\n", + "with ProgressBar():\n", + " display(lru.head())" + ] + }, + { + "cell_type": "markdown", + "id": "31205ddd", + "metadata": {}, + "source": [ + "df.groupby(['tld', 'year']).size.sum.visualize(node_attr={'penwidth': '6'})" + ] + }, + { + "cell_type": "markdown", + "id": "b49da8a3", + "metadata": {}, + "source": [ + "df2 = df.groupby(['tld', 'year']).agg({\"size\": [\"sum\", \"count\"]})" + ] + }, + { + "cell_type": "markdown", + "id": "dca64098", + "metadata": {}, + "source": [ + "df.groupby('name').x.mean().visualize(node_attr={'penwidth': '6'})" + ] + }, + { + "cell_type": "markdown", + "id": "9de95bf7", + "metadata": {}, + "source": [ + "%%time\n", + "\n", + "df2 = report_tld_year(lru)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c492742", + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ded14edd", + "metadata": {}, + "outputs": [], + "source": [ + "%%time \n", + "\n", + "df2 = df.groupby(['tld', 'year']).sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "113ba27b", + "metadata": {}, + "outputs": [], + "source": [ + "df2" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "8e87713f", + "metadata": {}, + "outputs": [], + "source": [ + "df2.dask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e65585b", + "metadata": {}, + "outputs": [], + "source": [ + "tbsize = df2[\"size\"]/(10**12)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4dbce6b", + "metadata": {}, + "outputs": [], + "source": [ + "df2 = df2.assign(terrabytes=tbsize)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60557552", + "metadata": {}, + "outputs": [], + "source": [ + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5056dd46", + "metadata": {}, + "outputs": [], + "source": [ + "df2.dask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a57c5033", + "metadata": {}, + "outputs": [], + "source": [ + "report=df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7f78f66", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "report = client.compute(report)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79a424a9", + "metadata": {}, + "outputs": [], + "source": [ + "report" + ] + }, + { + "cell_type": "markdown", + "id": "04a25511", + "metadata": {}, + "source": [ + "## Create final report\n", + "\n", + "Create summary format for gigabyte and terabyte columns https://stackoverflow.com/a/20937592/8928529" + ] + }, + { + "cell_type": "markdown", + "id": "9ccfac4b", + "metadata": {}, + "source": [ + "report[\"average_size\"] = report[\"sum\"]/report[\"count\"]" + ] + }, + { + "cell_type": "markdown", + "id": "708a62bf", + "metadata": {}, + "source": [ + "report[\"terabytes\"] = report[\"sum\"]/(10**12)\n", + "report[\"terabytes\"] = report[\"terabytes\"].map('{:,.2f}'.format)" + ] + }, + { + "cell_type": "markdown", + "id": "88915adb", + "metadata": {}, + "source": [ + "report[\"gigabytes\"] = report[\"sum\"]/(10**9)\n", + 
"report[\"gigabytes\"] = report[\"gigabytes\"].map('{:,.2f}'.format)" + ] + }, + { + "cell_type": "markdown", + "id": "b5472320", + "metadata": {}, + "source": [ + "## Save report as CSV" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23092d7e", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "report = report.result()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc748cf4", + "metadata": {}, + "outputs": [], + "source": [ + "report" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffc99a54", + "metadata": {}, + "outputs": [], + "source": [ + "# only create dir if there is data to pickle\n", + "if (len(report) and not os.path.isdir(reportdir)):\n", + " os.mkdir(reportdir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d0ec8cf", + "metadata": {}, + "outputs": [], + "source": [ + "reportdir" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4e836a3", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "if (verbose): print(f\"report: groupby-tld\")\n", + "report.to_csv(f\"{reportdir}/groupby-tld-dask3.csv.gz\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12d02352", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "report.to_parquet(f\"{reportdir}/groupby-tld-year-dask4.parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "7cbec7b4", + "metadata": {}, + "source": [ + "## Summarize high-level stats" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "600650db", + "metadata": {}, + "outputs": [], + "source": [ + "report" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06f17bb3", + "metadata": {}, + "outputs": [], + "source": [ + "report.reset_index()" + ] + }, + { + "cell_type": "markdown", + "id": "ab4090bd", + "metadata": {}, + "source": [ + "report[report[\"sum\"] == report[\"sum\"].max()]" + ] + }, 
+ { + "cell_type": "code", + "execution_count": null, + "id": "6d2f464a", + "metadata": {}, + "outputs": [], + "source": [ + "report[(report[\"size\"] > 5*10**13)]" + ] + }, + { + "cell_type": "markdown", + "id": "4c7358c1", + "metadata": {}, + "source": [ + "report=report.reset_index()" + ] + }, + { + "cell_type": "markdown", + "id": "f42f3b99", + "metadata": {}, + "source": [ + "summer = report.groupby(\"tld\").agg(\"sum\", \"sum\") #[report[\"sum\"] > 10**13" + ] + }, + { + "cell_type": "markdown", + "id": "9f1f801b", + "metadata": {}, + "source": [ + "summer[\"terabytes\"] = summer[\"sum\"]/(10**12)\n", + "summer[\"terabytes\"] = summer[\"terabytes\"].map('{:,.2f}'.format)" + ] + }, + { + "cell_type": "markdown", + "id": "f27fc339", + "metadata": {}, + "source": [ + "print(summer[summer[\"sum\"] > 10**13].sort_values(\"sum\", ascending=False)[['count', 'terabytes']])" + ] + }, + { + "cell_type": "markdown", + "id": "2b45d06d", + "metadata": {}, + "source": [ + "report[(report[\"sum\"] > 10**13) & (report[\"access\"] <= 2021)]" + ] + }, + { + "cell_type": "markdown", + "id": "7ef5dea0", + "metadata": {}, + "source": [ + "report[(report[\"sum\"] > 10**13) & (report[\"access\"] <= 2021)][\"sum\"].sum()" + ] + }, + { + "cell_type": "markdown", + "id": "d2d3afd3", + "metadata": {}, + "source": [ + "report[(report[\"sum\"] <= 10**13) & (report[\"access\"] <= 2021)][\"sum\"].sum()" + ] + }, + { + "cell_type": "markdown", + "id": "bc80d326", + "metadata": {}, + "source": [ + "report[(report[\"sum\"] > 10**13) & (report[\"access\"] < 2023)][\"sum\"].sum()/10**12" + ] + } + ], + "metadata": { + "language_info": { + "name": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/dask-mpi-validate-galaxy-tar-data-2024-05-03.ipynb b/dask-mpi-validate-galaxy-tar-data-2024-05-03.ipynb new file mode 100644 index 0000000..9e12565 --- /dev/null +++ b/dask-mpi-validate-galaxy-tar-data-2024-05-03.ipynb @@ -0,0 +1,1581 @@ +{ + 
"cells": [ + { + "cell_type": "markdown", + "id": "5fb66d11", + "metadata": {}, + "source": [ + "# run report on pickled list policy data\n", + "\n", + "The script reads pickled files that match the `glob_pattern` from the `pickledir` derived from `dirname` and runs the report saving it as a csv to the subdir \"`dirname`/reports\" dir by default.\n", + "\n", + "Some progress info is available via the `verbose` flag.\n", + "\n", + "The current report aggregates storage stats by top-level-dir and age (year) of data's last access. The goal of this report is to understand the distribution of lesser used data." + ] + }, + { + "cell_type": "markdown", + "id": "51c07f66", + "metadata": {}, + "source": [ + "!conda info --envs" + ] + }, + { + "cell_type": "markdown", + "id": "15997b7d", + "metadata": {}, + "source": [ + "!conda list" + ] + }, + { + "cell_type": "markdown", + "id": "c740ad5f", + "metadata": {}, + "source": [ + "!pip list --format=freeze" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5059337b", + "metadata": {}, + "outputs": [], + "source": [ + "import datetime\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from urllib.parse import unquote\n", + "import sys\n", + "import os\n", + "import pathlib\n", + "import re\n", + "import dask.dataframe as dd\n", + "import dask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2beaec9e", + "metadata": {}, + "outputs": [], + "source": [ + "from dask.diagnostics import ProgressBar" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d8afdae", + "metadata": {}, + "outputs": [], + "source": [ + "from dask.distributed import Client" + ] + }, + { + "cell_type": "markdown", + "id": "81b2e176", + "metadata": {}, + "source": [ + "Client(scheduler_file='scheduler.json')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "514ecfc1", + "metadata": {}, + "outputs": [], + "source": [ + "client = 
Client(scheduler_file='scheduler.json')" + ] + }, + { + "cell_type": "markdown", + "id": "b17e817d", + "metadata": {}, + "source": [ + "\n", + "client = Client()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a2cdaa6", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "5f4c10d1", + "metadata": {}, + "source": [ + "## input vars" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9533a4c", + "metadata": {}, + "outputs": [], + "source": [ + "dirname=\"data/list-policy_projects_2023-08-31\" # directory to find files to pickle\n", + "glob_pattern = \"*.parquet\" # file name glob pattern to match, can be file name for individual file\n", + "line_regex_filter = \".*\" # regex to match lines of interest in file\n", + "pickledir=f\"{dirname}/parquet\"\n", + "reportdir=f\"{dirname}/reports\"\n", + "tldpath=\"/data/project/ccts/galaxy\"\n", + "\n", + "verbose = True\n", + "limit = 0" + ] + }, + { + "cell_type": "markdown", + "id": "a28d0f15", + "metadata": {}, + "source": [ + "## Utilities" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed367712", + "metadata": {}, + "outputs": [], + "source": [ + "# get top level dir on which to aggregate\n", + "\n", + "def get_tld(df, dirname):\n", + " '''\n", + " df: dataframe with path column (e.g. 
from policy run)\n", + " dirname: top level dir (TLD) that contains dirs for report\n", + " \n", + " The function uses the length of dirname to locate the TLD column in the split path.\n", + " '''\n", + " dirpaths = dirname.split(\"/\")\n", + " new=df[\"path\"].str.split(\"/\", n=len(dirpaths)+1, expand=True)\n", + " #df=df.assign(tld=new[len(dirpaths)])\n", + " #df[\"tld\"] = new[len(dirpaths)]\n", + " \n", + " return new[len(dirpaths)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a057a9ec", + "metadata": {}, + "outputs": [], + "source": [ + "# get year of a datetime column on which to aggregate\n", + "\n", + "def get_year(df, column):\n", + " '''\n", + " df: dataframe with a datetime column (e.g. access or modify from policy run)\n", + " column: name of the datetime column to take the year from\n", + " \n", + " The function uses the .dt accessor to return the year of each value in column.\n", + " '''\n", + " new = df[column].dt.year\n", + " #dirpaths = dirname.split(\"/\")\n", + " #new=df[\"path\"].str.split(\"/\", n=len(dirpaths)+1, expand=True)\n", + " #df=df.assign(tld=new[len(dirpaths)])\n", + " #df[\"tld\"] = new[len(dirpaths)]\n", + " \n", + " return new" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7bc11b96", + "metadata": {}, + "outputs": [], + "source": [ + "def report_tld_year(df):\n", + " '''\n", + " Aggregate the sum and count of files by year in the top level dir (TLD)\n", + " \n", + " Uses dict parameter to pandas agg to apply sum and count function to size column\n", + " '''\n", + " report = df.groupby(['tld', df.access.dt.year]).agg({\"size\": [\"sum\", \"count\"]})\n", + " return report" + ] + }, + { + "cell_type": "markdown", + "id": "dd92dd03", + "metadata": {}, + "source": [ + "## Read and parse the files according to glob_pattern" + ] + }, + { + "cell_type": "markdown", + "id": "cdc4558d", + "metadata": {}, + "source": [ + "dask.config.set(scheduler='threads')" + ] + }, + { + "cell_type": "markdown", + "id": 
"7b970eea", + "metadata": {}, + "source": [ + "dask.config.set(scheduler='processes')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f95ccab4", + "metadata": {}, + "outputs": [], + "source": [ + "def read_policy_parquet(file, columns=['size', 'access', 'modify', 'uid', 'path'], engine=\"pyarrow\"):\n", + " \n", + " df = dd.read_parquet(file, columns=columns, engine=engine)\n", + " \n", + " df = client.persist(df)\n", + " \n", + " df=df.repartition(partition_size=\"64MB\")\n", + "\n", + " return df" + ] + }, + { + "cell_type": "markdown", + "id": "8bfe3f88", + "metadata": {}, + "source": [ + "df = dd.read_parquet(f'{pickledir}/list-*.parquet', columns=['size', 'access', 'modify', 'uid', 'path'], engine=\"pyarrow\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "060e04f9", + "metadata": {}, + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "id": "bd5bf01b", + "metadata": {}, + "source": [ + "df = client.persist(df)" + ] + }, + { + "cell_type": "markdown", + "id": "480ed8e1", + "metadata": {}, + "source": [ + "df.dask" + ] + }, + { + "cell_type": "markdown", + "id": "f0f072d6", + "metadata": {}, + "source": [ + "%%time\n", + "\n", + "df=df.repartition(partition_size=\"64MB\")" + ] + }, + { + "cell_type": "markdown", + "id": "0a5f7d5b", + "metadata": {}, + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "id": "f6b85535", + "metadata": {}, + "source": [ + "%%time\n", + "\n", + "df.map_partitions(len).compute()" + ] + }, + { + "cell_type": "markdown", + "id": "afe68a9c", + "metadata": {}, + "source": [ + "df.dask" + ] + }, + { + "cell_type": "markdown", + "id": "97e29f86", + "metadata": {}, + "source": [ + "df = df[df.path.str.startswith(\"/data/project/ccts/galaxy/\")]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5588ef0", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "maydf = read_policy_parquet(\"data/list-policy_projects_2024-05-03/parquet\")" + ] + }, + { 
+ "cell_type": "code", + "execution_count": null, + "id": "f039d3d1", + "metadata": {}, + "outputs": [], + "source": [ + "maydf = maydf[maydf.path.str.startswith(\"/data/project/ccts/galaxy/\")]" + ] + }, + { + "cell_type": "markdown", + "id": "74c4102b", + "metadata": {}, + "source": [ + "%%time\n", + "\n", + "maydf = maydf.set_index(maydf.path, npartitions=\"auto\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "082b6af9", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "augdf = read_policy_parquet(\"data/list-policy_projects_2023-08-31/parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e90c800", + "metadata": {}, + "outputs": [], + "source": [ + "augdf = augdf[augdf.path.str.startswith(\"/data/project/ccts/galaxy/\")]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a00b148a", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "augdf=augdf.repartition(partition_size=\"64MB\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0f9c5c4", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "maydf=maydf.repartition(partition_size=\"64MB\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "00a50ff5", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "len(augdf)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0196721", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "len(maydf)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49511507", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "augdf = augdf.set_index(augdf.path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "173b6b00", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "maydf = maydf.set_index(maydf.path)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "9da71216", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "joindf = maydf.join(augdf, how=\"outer\", lsuffix=\"_may\", rsuffix=\"_aug\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30e7d44a", + "metadata": {}, + "outputs": [], + "source": [ + "joindf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29880891", + "metadata": {}, + "outputs": [], + "source": [ + "joindf.dask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8274f65b", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "len(joindf)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ba51037", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "len(joindf[joindf.modify_aug.isna()])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "250824be", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "len(joindf[joindf.modify_may.isna()])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1890c403", + "metadata": {}, + "outputs": [], + "source": [ + "modify_comp = joindf.modify_may != joindf.modify_aug" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76d79094", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "len(joindf[modify_comp])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "803a863c", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "len(joindf[joindf.size_may != joindf.size_aug])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ef39703", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "len(joindf[joindf.size_may == joindf.size_aug])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b9953a9", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + 
"len(joindf[joindf.uid_may != joindf.uid_aug])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cda237a8", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "len(joindf[joindf.access_may != joindf.access_aug])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cc7f6c7", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "len(joindf[joindf.access_may == joindf.access_aug])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0467d4df", + "metadata": {}, + "outputs": [], + "source": [ + "stop" + ] + }, + { + "cell_type": "markdown", + "id": "4352f00c", + "metadata": {}, + "source": [ + "## Aggregate stats into running totals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1a345c7", + "metadata": {}, + "outputs": [], + "source": [ + "df1=get_tld(df, tldpath)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1da65bcc", + "metadata": {}, + "outputs": [], + "source": [ + "df1.dask" + ] + }, + { + "cell_type": "markdown", + "id": "4f4ac4d1", + "metadata": {}, + "source": [ + "%%time\n", + "\n", + "with ProgressBar():\n", + " display(df1.head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7706bcf8", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.assign(tld=df1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11e5c92a", + "metadata": {}, + "outputs": [], + "source": [ + "df.dask" + ] + }, + { + "cell_type": "markdown", + "id": "10590450", + "metadata": {}, + "source": [ + "df = df.drop(columns=\"path\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70213716", + "metadata": {}, + "outputs": [], + "source": [ + "df1 = get_year(df, \"access\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d777440", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.assign(year=df1)" + ] + }, + { + 
"cell_type": "markdown", + "id": "ddf01b3d", + "metadata": {}, + "source": [ + "df = df.drop(columns=[\"uid\",\"access\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "84d184c2", + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d86e507e", + "metadata": {}, + "outputs": [], + "source": [ + "df.dask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db76d5f4", + "metadata": {}, + "outputs": [], + "source": [ + "df = client.persist(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "471d5b18", + "metadata": {}, + "outputs": [], + "source": [ + "df.dask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d414a9ae", + "metadata": {}, + "outputs": [], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a859189", + "metadata": {}, + "outputs": [], + "source": [ + "def ls_path(df, path):\n", + " tmp = df[df.path.str.match(path)]\n", + " tmp = tmp.assign(tld=get_tld(tmp, path))\n", + " \n", + " return tmp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1e5fecb", + "metadata": {}, + "outputs": [], + "source": [ + "def du_by_year(df, path, time=\"access\"):\n", + " tmp = df[df.path.str.match(path)]\n", + "\n", + " tmp = tmp.assign(tld=get_tld(tmp, path))\n", + " \n", + " tmp = tmp.assign(year=get_year(tmp, time))\n", + " \n", + " tmp = tmp.drop(columns=[\"uid\", \"access\", \"path\"])\n", + " tmp = client.persist(tmp)\n", + " \n", + " tmp = tmp.groupby(['tld', 'year']).sum()\n", + " \n", + " tmp = tmp.assign(terabytes=tmp[\"size\"]/(10**12))\n", + " \n", + " return tmp\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16ef8689", + "metadata": {}, + "outputs": [], + "source": [ + "def du_by_year(df, path, time=\"access\"):\n", + " tmp = df[df.path.str.match(path)]\n", + "\n", + " tmp = 
tmp.assign(tld=get_tld(tmp, path))\n", + " \n", + " tmp = tmp.assign(year=get_year(tmp, time))\n", + " \n", + " tmp = tmp.drop(columns=[\"uid\", time, \"path\"])\n", + " tmp = client.persist(tmp)\n", + " \n", + " tmp = tmp.groupby(['tld', 'year']).sum()\n", + " \n", + " tmp = tmp.assign(terabytes=tmp[\"size\"]/(10**12))\n", + " \n", + " return tmp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3de3cb54", + "metadata": {}, + "outputs": [], + "source": [ + "def du_by_tld(df, path, time=\"access\"):\n", + " tmp = df[df.path.str.match(path)]\n", + "\n", + " \n", + " tmp = tmp.assign(tld=get_tld(tmp, path))\n", + " \n", + " #tmp = tmp.assign(year=get_year(tmp, time))\n", + " \n", + " tmp = tmp.drop(columns=[\"uid\", \"access\", \"path\", \"year\"])\n", + " tmp = client.persist(tmp)\n", + " tmp = tmp.groupby(['tld']).sum()\n", + " \n", + " tmp = tmp.assign(terabytes=tmp[\"size\"]/(10**12))\n", + " \n", + " return tmp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0ca1d03", + "metadata": {}, + "outputs": [], + "source": [ + "df.dask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e7151f6", + "metadata": {}, + "outputs": [], + "source": [ + "df.dask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1c75727", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "dudf = du_by_year(df, '/data/project/ccts', \"modify\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f9639f1", + "metadata": {}, + "outputs": [], + "source": [ + "dudf.dask" + ] + }, + { + "cell_type": "markdown", + "id": "489989ef", + "metadata": {}, + "source": [ + "%%time\n", + "\n", + "dudf = du_by_tld(df, '/data/project/ccts')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "312cfb3f", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "dudf = client.persist(dudf)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "08783886", + "metadata": {}, + "outputs": [], + "source": [ + "dudf.dask" + ] + }, + { + "cell_type": "markdown", + "id": "b8686b92", + "metadata": {}, + "source": [ + "%%time\n", + "\n", + "dudf=dudf.repartition(partition_size=\"64MB\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "932cb3e0", + "metadata": {}, + "outputs": [], + "source": [ + "dudf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55e73440", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "dudf = client.compute(dudf)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a095bd71", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "dudf = dudf.result()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79a0378d", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "dudf.sort_values([\"tld\", \"year\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89abcc33", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "tmp=dudf.reset_index()\n", + "#tmp[(tmp['tld']==\"galaxy\")]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5642e1d5", + "metadata": {}, + "outputs": [], + "source": [ + "tmp[tmp.tld=='galaxy'].sort_values('year')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "134dcdb6", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "dudf.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a801a5ce", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "dudf.groupby(\"tld\").sum()" + ] + }, + { + "cell_type": "markdown", + "id": "9379b396", + "metadata": {}, + "source": [ + "%%time\n", + "\n", + "lsdf = ls_path(df, '/data/project/ccts/galaxy')" + ] + }, + { + "cell_type": "markdown", + "id": "87b04252", + "metadata": {}, + "source": [ + 
"%%time\n", + "\n", + "lsdf = client.persist(lsdf.tld.unique())" + ] + }, + { + "cell_type": "markdown", + "id": "5599d717", + "metadata": {}, + "source": [ + "lsdf.dask" + ] + }, + { + "cell_type": "markdown", + "id": "d8d8e827", + "metadata": {}, + "source": [ + "%%time\n", + "\n", + "lsdf = client.compute(lsdf)" + ] + }, + { + "cell_type": "markdown", + "id": "e6acbccb", + "metadata": {}, + "source": [ + "lsdf.result()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce4ba931", + "metadata": {}, + "outputs": [], + "source": [ + "df.dask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a7ed7af", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "dfccts = df[df.path.str.match('/data/project/ccts')]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf656e76", + "metadata": {}, + "outputs": [], + "source": [ + "dfccts.dask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffbd1abc", + "metadata": {}, + "outputs": [], + "source": [ + "dfccts = dfccts.assign(tld=get_tld(dfccts, '/data/project/ccts/galaxy'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3748708", + "metadata": {}, + "outputs": [], + "source": [ + "dfccts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c741dfdb", + "metadata": {}, + "outputs": [], + "source": [ + "dfccts.dask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7045b7e9", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "dfccts.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20beeb75", + "metadata": {}, + "outputs": [], + "source": [ + "df1 = df" + ] + }, + { + "cell_type": "markdown", + "id": "eebf3e8f", + "metadata": {}, + "source": [ + "lru_projects = ['ICOS', 'boldlab', 'hartmanlab', 'sdtrlab', 'kinglab', 'kobielab', 'MRIPhantom', 'NCRlab', 'bridgeslab', 'hsight', 'kutschlab', 'lcdl', 
'metalsgroup', 'rowelab', 'szaflarski_mirman']\n" + ] + }, + { + "cell_type": "markdown", + "id": "9b5d64a0", + "metadata": {}, + "source": [ + "condition=df1[\"tld\"].isin(lru_projects)" + ] + }, + { + "cell_type": "markdown", + "id": "ccca62ff", + "metadata": {}, + "source": [ + "condition=df1[\"tld\"].isin([\"ccts\"])" + ] + }, + { + "cell_type": "markdown", + "id": "55aa52c5", + "metadata": {}, + "source": [ + "lru=df1[condition]" + ] + }, + { + "cell_type": "markdown", + "id": "b9e1df01", + "metadata": {}, + "source": [ + "%%time\n", + "\n", + "with ProgressBar():\n", + " display(lru.head())" + ] + }, + { + "cell_type": "markdown", + "id": "31205ddd", + "metadata": {}, + "source": [ + "df.groupby(['tld', 'year']).size.sum.visualize(node_attr={'penwidth': '6'})" + ] + }, + { + "cell_type": "markdown", + "id": "b49da8a3", + "metadata": {}, + "source": [ + "df2 = df.groupby(['tld', 'year']).agg({\"size\": [\"sum\", \"count\"]})" + ] + }, + { + "cell_type": "markdown", + "id": "dca64098", + "metadata": {}, + "source": [ + "df.groupby('name').x.mean().visualize(node_attr={'penwidth': '6'})" + ] + }, + { + "cell_type": "markdown", + "id": "9de95bf7", + "metadata": {}, + "source": [ + "%%time\n", + "\n", + "df2 = report_tld_year(lru)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c492742", + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ded14edd", + "metadata": {}, + "outputs": [], + "source": [ + "%%time \n", + "\n", + "df2 = df.groupby(['tld', 'year']).sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "113ba27b", + "metadata": {}, + "outputs": [], + "source": [ + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e87713f", + "metadata": {}, + "outputs": [], + "source": [ + "df2.dask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e65585b", + "metadata": {}, + "outputs": [], + 
"source": [ + "tbsize = df2[\"size\"]/(10**12)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4dbce6b", + "metadata": {}, + "outputs": [], + "source": [ + "df2 = df2.assign(terrabytes=tbsize)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60557552", + "metadata": {}, + "outputs": [], + "source": [ + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5056dd46", + "metadata": {}, + "outputs": [], + "source": [ + "df2.dask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a57c5033", + "metadata": {}, + "outputs": [], + "source": [ + "report=df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7f78f66", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "report = client.compute(report)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79a424a9", + "metadata": {}, + "outputs": [], + "source": [ + "report" + ] + }, + { + "cell_type": "markdown", + "id": "04a25511", + "metadata": {}, + "source": [ + "## Create final report\n", + "\n", + "Create summary format for gigabyte and terabyte columns https://stackoverflow.com/a/20937592/8928529" + ] + }, + { + "cell_type": "markdown", + "id": "9ccfac4b", + "metadata": {}, + "source": [ + "report[\"average_size\"] = report[\"sum\"]/report[\"count\"]" + ] + }, + { + "cell_type": "markdown", + "id": "708a62bf", + "metadata": {}, + "source": [ + "report[\"terabytes\"] = report[\"sum\"]/(10**12)\n", + "report[\"terabytes\"] = report[\"terabytes\"].map('{:,.2f}'.format)" + ] + }, + { + "cell_type": "markdown", + "id": "88915adb", + "metadata": {}, + "source": [ + "report[\"gigabytes\"] = report[\"sum\"]/(10**9)\n", + "report[\"gigabytes\"] = report[\"gigabytes\"].map('{:,.2f}'.format)" + ] + }, + { + "cell_type": "markdown", + "id": "b5472320", + "metadata": {}, + "source": [ + "## Save report as CSV" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "23092d7e", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "report = report.result()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc748cf4", + "metadata": {}, + "outputs": [], + "source": [ + "report" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffc99a54", + "metadata": {}, + "outputs": [], + "source": [ + "# only create dir if there is data to pickle\n", + "if (len(report) and not os.path.isdir(reportdir)):\n", + " os.mkdir(reportdir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d0ec8cf", + "metadata": {}, + "outputs": [], + "source": [ + "reportdir" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4e836a3", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "if (verbose): print(f\"report: groupby-tld\")\n", + "report.to_csv(f\"{reportdir}/groupby-tld-dask3.csv.gz\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12d02352", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "report.to_parquet(f\"{reportdir}/groupby-tld-year-dask4.parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "7cbec7b4", + "metadata": {}, + "source": [ + "## Summarize high-level stats" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "600650db", + "metadata": {}, + "outputs": [], + "source": [ + "report" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06f17bb3", + "metadata": {}, + "outputs": [], + "source": [ + "report.reset_index()" + ] + }, + { + "cell_type": "markdown", + "id": "ab4090bd", + "metadata": {}, + "source": [ + "report[report[\"sum\"] == report[\"sum\"].max()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d2f464a", + "metadata": {}, + "outputs": [], + "source": [ + "report[(report[\"size\"] > 5*10**13)]" + ] + }, + { + "cell_type": "markdown", + "id": "4c7358c1", + "metadata": {}, + 
"source": [ + "report=report.reset_index()" + ] + }, + { + "cell_type": "markdown", + "id": "f42f3b99", + "metadata": {}, + "source": [ + "summer = report.groupby(\"tld\").agg(\"sum\", \"sum\") #[report[\"sum\"] > 10**13" + ] + }, + { + "cell_type": "markdown", + "id": "9f1f801b", + "metadata": {}, + "source": [ + "summer[\"terabytes\"] = summer[\"sum\"]/(10**12)\n", + "summer[\"terabytes\"] = summer[\"terabytes\"].map('{:,.2f}'.format)" + ] + }, + { + "cell_type": "markdown", + "id": "f27fc339", + "metadata": {}, + "source": [ + "print(summer[summer[\"sum\"] > 10**13].sort_values(\"sum\", ascending=False)[['count', 'terabytes']])" + ] + }, + { + "cell_type": "markdown", + "id": "2b45d06d", + "metadata": {}, + "source": [ + "report[(report[\"sum\"] > 10**13) & (report[\"access\"] <= 2021)]" + ] + }, + { + "cell_type": "markdown", + "id": "7ef5dea0", + "metadata": {}, + "source": [ + "report[(report[\"sum\"] > 10**13) & (report[\"access\"] <= 2021)][\"sum\"].sum()" + ] + }, + { + "cell_type": "markdown", + "id": "d2d3afd3", + "metadata": {}, + "source": [ + "report[(report[\"sum\"] <= 10**13) & (report[\"access\"] <= 2021)][\"sum\"].sum()" + ] + }, + { + "cell_type": "markdown", + "id": "bc80d326", + "metadata": {}, + "source": [ + "report[(report[\"sum\"] > 10**13) & (report[\"access\"] < 2023)][\"sum\"].sum()/10**12" + ] + } + ], + "metadata": { + "language_info": { + "name": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} -- GitLab