Skip to content
Snippets Groups Projects
dask-mpi-report-grouby-tld-year-of-last-access-projects-2024-05-03.ipynb 24.7 KiB
Newer Older
   "metadata": {},
   "outputs": [],
   "source": [
    "df2 = df2.assign(terrabytes=tbsize)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "60557552",
   "metadata": {},
   "outputs": [],
   "source": [
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5056dd46",
   "metadata": {},
   "outputs": [],
   "source": [
    "df2.dask"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a57c5033",
   "metadata": {},
   "outputs": [],
   "source": [
    "report=df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b7f78f66",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "report = client.compute(report)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79a424a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "report"
   ]
  },
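  {
   "cell_type": "markdown",
   "id": "04a25511",
   "metadata": {},
   "source": [
    "## Create final report\n",
    "\n",
    "Add gigabyte and terabyte columns to the report and format them as strings with thousands separators and two decimal places (`'{:,.2f}'.format`), following [this Stack Overflow answer](https://stackoverflow.com/a/20937592/8928529)."
   ]
  },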
  {
   "cell_type": "markdown",
   "id": "9ccfac4b",
   "metadata": {},
   "source": [
    "report[\"average_size\"] = report[\"sum\"]/report[\"count\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "708a62bf",
   "metadata": {},
   "source": [
    "```python\n",
    "report[\"terabytes\"] = report[\"sum\"]/(10**12)\n",
    "report[\"terabytes\"] = report[\"terabytes\"].map('{:,.2f}'.format)\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "88915adb",
   "metadata": {},
   "source": [
    "```python\n",
    "report[\"gigabytes\"] = report[\"sum\"]/(10**9)\n",
    "report[\"gigabytes\"] = report[\"gigabytes\"].map('{:,.2f}'.format)\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b5472320",
   "metadata": {},
   "source": [
    "## Save report as CSV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "23092d7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "report = report.result()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc748cf4",
   "metadata": {},
   "outputs": [],
   "source": [
    "report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ffc99a54",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the output directory only when the report has rows to save.\n",
    "# makedirs(exist_ok=True) also creates missing parent directories and is a\n",
    "# no-op when the directory already exists, so no separate isdir() check is needed.\n",
    "if len(report):\n",
    "    os.makedirs(reportdir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d0ec8cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "reportdir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a4e836a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "# Announce the step when verbose is set, then write the report as CSV under reportdir.\n",
    "if verbose:\n",
    "    print(\"report: groupby-tld\")\n",
    "report.to_csv(f\"{reportdir}/groupby-tld-dask3.csv.gz\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12d02352",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "report.to_parquet(f\"{reportdir}/groupby-tld-year-dask4.parquet\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7cbec7b4",
   "metadata": {},
   "source": [
    "## Summarize high-level stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "600650db",
   "metadata": {},
   "outputs": [],
   "source": [
    "report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06f17bb3",
   "metadata": {},
   "outputs": [],
   "source": [
    "report.reset_index()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ab4090bd",
   "metadata": {},
   "source": [
    "report[report[\"sum\"] == report[\"sum\"].max()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d2f464a",
   "metadata": {},
   "outputs": [],
   "source": [
    "report[(report[\"size\"] > 5*10**13)]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4c7358c1",
   "metadata": {},
   "source": [
    "report=report.reset_index()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f42f3b99",
   "metadata": {},
   "source": [
    "summer = report.groupby(\"tld\").agg(\"sum\", \"sum\") #[report[\"sum\"] > 10**13"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9f1f801b",
   "metadata": {},
   "source": [
    "```python\n",
    "summer[\"terabytes\"] = summer[\"sum\"]/(10**12)\n",
    "summer[\"terabytes\"] = summer[\"terabytes\"].map('{:,.2f}'.format)\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f27fc339",
   "metadata": {},
   "source": [
    "print(summer[summer[\"sum\"] > 10**13].sort_values(\"sum\", ascending=False)[['count', 'terabytes']])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2b45d06d",
   "metadata": {},
   "source": [
    "report[(report[\"sum\"] > 10**13) & (report[\"access\"] <= 2021)]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7ef5dea0",
   "metadata": {},
   "source": [
    "report[(report[\"sum\"] > 10**13) & (report[\"access\"] <= 2021)][\"sum\"].sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d2d3afd3",
   "metadata": {},
   "source": [
    "report[(report[\"sum\"] <= 10**13) & (report[\"access\"] <= 2021)][\"sum\"].sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bc80d326",
   "metadata": {},
   "source": [
    "report[(report[\"sum\"] > 10**13) & (report[\"access\"] < 2023)][\"sum\"].sum()/10**12"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}