{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import sqlalchemy\n", "import polars as pl\n", "import numpy as np\n", "from pathlib import Path\n", "import colormaps as cm" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "hive_dir = Path('/data/rc/gpfs-policy/data/gpfs-hive/data-project/')\n", "db = Path('/data/rc/gpfs-policy/data/gpfs-hive/db/data-project.db')\n", "engine = sqlalchemy.create_engine(f\"sqlite:///{db}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pl.read_database(\"SELECT * FROM churn WHERE prior_log_dt >= '2024-11-14'\",connection=engine)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = (\n", " df.with_columns([\n", " pl.sum_horizontal('modified','deleted','created').alias('total_churn'),\n", " pl.sum_horizontal('created_bytes','deleted_bytes','modified_bytes_net').alias('total_churn_bytes'),\n", " pl.col('log_dt','prior_log_dt').str.to_datetime(),\n", " pl.col('tld').cast(pl.Categorical)\n", " ])\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tld_agg = (\n", " df\n", " .group_by('tld')\n", " .agg(\n", " pl.sum('total_churn','total_churn_bytes','accessed','accessed_bytes')\n", " )\n", " .sort('total_churn',descending=True)\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "no_churn = tld_agg.filter(pl.col('total_churn').eq(0)).select('tld')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "storage_df = (\n", " pl.scan_parquet(\n", " hive_dir.joinpath('**/*.parquet'),\n", " parallel='prefiltered',\n", " hive_partitioning=True,\n", " try_parse_hive_dates=False\n", " )\n", " .filter(\n", " pl.col('tld').is_in(no_churn),\n", " pl.col('acq').eq('2025-01-15')\n", " )\n", " .select(['tld','size','kballoc'])\n", " .with_columns(pl.col('tld').cast(pl.Categorical))\n", " .collect(engine='streaming')\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "inactive_storage = storage_df.group_by('tld').agg(pl.sum('size','kballoc'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "inactive_storage['kballoc'].sum()/(1024**3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plotting" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import plotly.graph_objects as go\n", "\n", "def convert_colormap(rgb):\n", " r, g, b = (rgb*255).astype(int)\n", " return f\"rgb({r},{g},{b})\"\n", "\n", "colormap = cm.oslo_r.colors\n", "colorscale = [[i / 255, convert_colormap(colormap[i])] for i in range(256)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Churn\n", "These plots only include the directories where at least one file was churned during the time period. This ignores directories where files were accessed but never changed" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "active = df.filter(pl.col('tld').is_in(no_churn).not_()).clone()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "order = (\n", " active\n", " .group_by('tld')\n", " .agg([\n", " pl.col('total_churn').ne(0).sum()\n", " ])\n", " .sort('total_churn', descending=True)\n", ")['tld'].to_list()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Total Churn (File Count) Timeseries" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig = go.Figure(\n", " data = go.Heatmap(\n", " z = np.log10(active['total_churn']),\n", " y = active['log_dt'],\n", " x = active['tld'],\n", " xgap=2,\n", " colorscale=colorscale,\n", " colorbar=dict(\n", " tickvals=np.arange(0,9),\n", " ticktext=[str(10**d) for d in np.arange(0,9)],\n", " tickfont=dict(\n", " size = 14\n", " ),\n", " title=dict(\n", " text='Churn (files altered)',\n", " font=dict(\n", " size = 16\n", " )\n", " )\n", " ),\n", " hovertemplate='Dir: %{x}<br>Date: %{y}<br>Churn: %{customdata}<extra></extra>',\n", " customdata=active['total_churn']\n", " )\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig = fig.update_layout(\n", " template = 'plotly_white',\n", " height = 1000,\n", " width = 2000,\n", " title_text = 'Time Course of Total Churn For Project Directories Over 2 Months',\n", " title_x = 0.5,\n", " title_xanchor = 'center',\n", " title_font_size = 30,\n", "\n", " xaxis = dict(\n", " title = dict(\n", " text = 'Directory Name',\n", " font_size = 20\n", " ),\n", " gridwidth = 2,\n", " showgrid = True,\n", " gridcolor='black'\n", " ),\n", " \n", " yaxis = dict(\n", " showgrid = False,\n", " title = dict(\n", " text = 'Policy Run Date',\n", " font_size = 20\n", " ),\n", " gridcolor = 'black',\n", " ),\n", " \n", " coloraxis_colorbar=dict(\n", " title=\"Raw Values\", # Change the title of the z-axis\n", " title_font=dict(size=20) # Increase the font size\n", " ),\n", "\n", " margin=dict(t=100, b=20, l=40, r=40)\n", ")\n", "\n", "fig = fig.update_xaxes(\n", " categoryorder='array',\n", " categoryarray=order,\n", " tickfont={'size':14},\n", " ticklabelshift = 3,\n", " tickson = 'boundaries',\n", " gridwidth=2\n", ")\n", "\n", "fig = fig.update_yaxes(\n", " tickfont={'size':16},\n", " tickformat = \"%Y-%m-%d\",\n", " tick0 = '2024-11-15',\n", " ticklabelstep=2,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Total Churn (Bytes) Timeseries" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "f2 = go.Figure(\n", " data = go.Heatmap(\n", " z = np.log2(active['total_churn_bytes']),\n", " y = active['log_dt'],\n", " x = active['tld'],\n", " xgap=2,\n", " colorscale=colorscale,\n", " colorbar=dict(\n", " tickvals=np.log2([1, 1024, 1024**2, 1024**3, 1024**4, 100 * 1024**4]),\n", " ticktext=['1 B', '1 KiB', '1 MiB', '1 GiB', '1 TiB', '100 TiB'],\n", " tickmode='array',\n", " tickfont=dict(\n", " size = 14\n", " ),\n", " title=dict(\n", " text='Churn (bytes altered)',\n", " font=dict(\n", " size = 16\n", " )\n", " )\n", " ),\n", " hovertemplate='Dir: %{x}<br>Date: %{y}<br>Churn: %{customdata}<extra></extra>',\n", " customdata=active['total_churn_bytes']\n", " )\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "f2 = f2.update_layout(\n", " template = 'plotly_white',\n", " height = 1000,\n", " width = 2000,\n", " title_text = 'Time Course of Total Churn (Bytes) For Project Directories Over 2 Months',\n", " title_x = 0.5,\n", " title_xanchor = 'center',\n", " title_font_size = 30,\n", "\n", " xaxis = dict(\n", " title = dict(\n", " text = 'Directory Name',\n", " font_size = 20\n", " ),\n", " gridwidth = 2,\n", " showgrid = True,\n", " gridcolor='black'\n", " ),\n", " \n", " yaxis = dict(\n", " showgrid = False,\n", " title = dict(\n", " text = 'Policy Run Date',\n", " font_size = 20\n", " ),\n", " gridcolor = 'black',\n", " ),\n", " \n", " coloraxis_colorbar=dict(\n", " title=\"Raw Values\", # Change the title of the z-axis\n", " title_font=dict(size=20) # Increase the font size\n", " ),\n", "\n", " margin=dict(t=100, b=20, l=40, r=40)\n", ")\n", "\n", "f2 = f2.update_xaxes(\n", " categoryorder='array',\n", " categoryarray=order,\n", " tickfont={'size':14},\n", " ticklabelshift = 3,\n", " tickson = 'boundaries',\n", " gridwidth=2\n", ")\n", "\n", "f2 = f2.update_yaxes(\n", " tickfont={'size':16},\n", " tickformat = \"%Y-%m-%d\",\n", " tick0 = '2024-11-15',\n", " ticklabelstep=2,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "f2.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Files Accessed" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "accessed_no_churn = tld_agg.filter(pl.col('total_churn').eq(0), pl.col('accessed').gt(0)).select('tld')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "accessed_no_churn_order = (\n", " df\n", " .filter(pl.col('tld').is_in(accessed_no_churn))\n", " .group_by('tld')\n", " .agg(pl.col('accessed').ne(0).sum())\n", " .sort('accessed',descending=True)\n", ")['tld'].to_list()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "accessed_order = order + accessed_no_churn_order" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tick_text = [f'<span style=\"color:black\">{project}</span>' for project in order] + \\\n", " [f'<span style=\"color:red\">{project}</span>' for project in accessed_no_churn_order]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "accessed_df = df.filter(pl.col('tld').is_in(accessed_order))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "f3 = go.Figure(\n", " data = go.Heatmap(\n", " z = np.log10(accessed_df['accessed']),\n", " y = accessed_df['log_dt'],\n", " x = accessed_df['tld'],\n", " xgap=2,\n", " colorscale=colorscale,\n", " colorbar=dict(\n", " title=\"Count\",\n", " tickvals=list(range(7)),\n", " ticktext=[10**n for n in range(7)],\n", " tickmode='array',\n", " tickfont=dict(\n", " size = 14\n", " )\n", " ),\n", " hovertemplate='Dir: %{x}<br>Date: %{y}<br>Accessed: %{customdata}<extra></extra>',\n", " customdata=accessed_df['accessed']\n", " )\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "f3 = f3.update_layout(\n", " template = 'plotly_white',\n", " height = 1000,\n", " width = 2200,\n", " title = dict(\n", " text = 'Time Course of Files Accessed But Not Churned For Projects Since 2024-11-14',\n", " x = 0.5,\n", " xanchor = 'center',\n", " font_size = 30\n", " ),\n", "\n", " xaxis = dict(\n", " title = dict(\n", " text = 'Directory Name',\n", " font_size = 20\n", " ),\n", " gridwidth = 2,\n", " showgrid = True,\n", " gridcolor='black',\n", " categoryorder='array',\n", " categoryarray=accessed_order,\n", " tickvals=list(range(0,len(tick_text))),\n", " ticktext=tick_text,\n", " tickfont={'size':14},\n", " ticklabelshift = 3,\n", " tickson = 'boundaries'\n", " ),\n", " \n", " yaxis = dict(\n", " showgrid = False,\n", " title = dict(\n", " text = 'Policy Run Date',\n", " font_size = 20\n", " ),\n", " gridcolor = 'black',\n", " tickfont={'size':16},\n", " tickformat = \"%Y-%m-%d\",\n", " tick0 = '2024-11-15',\n", " ticklabelstep=2,\n", " ),\n", "\n", " margin=dict(t=100, b=20, l=40, r=40)\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "f3.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.3" } }, "nbformat": 4, "nbformat_minor": 2 }