Skip to content
Snippets Groups Projects
churn-analysis.ipynb 14.4 KiB
Newer Older
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sqlalchemy\n",
    "import polars as pl\n",
    "import numpy as np\n",
    "from pathlib import Path\n",
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "hive_dir = Path('/data/rc/gpfs-policy/data/gpfs-hive/data-project/')\n",
    "db = Path('/data/rc/gpfs-policy/data/gpfs-hive/db/data-project.db')\n",
    "engine = sqlalchemy.create_engine(f\"sqlite:///{db}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pl.read_database(\"SELECT * FROM churn WHERE prior_log_dt >= '2024-11-14'\",connection=engine)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = (\n",
    "    df.with_columns([\n",
    "        pl.sum_horizontal('modified','deleted','created').alias('total_churn'),\n",
    "        pl.sum_horizontal('created_bytes','deleted_bytes','modified_bytes_net').alias('total_churn_bytes'),\n",
    "        pl.col('log_dt','prior_log_dt').str.to_datetime(),\n",
    "        pl.col('tld').cast(pl.Categorical)\n",
    "    ])\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tld_agg = (\n",
    "    df\n",
    "    .group_by('tld')\n",
    "    .agg(\n",
    "        pl.sum('total_churn','total_churn_bytes','accessed','accessed_bytes')\n",
    "    )\n",
    "    .sort('total_churn',descending=True)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "no_churn = tld_agg.filter(pl.col('total_churn').eq(0)).select('tld')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "storage_df = (\n",
    "    pl.scan_parquet(\n",
    "        hive_dir.joinpath('**/*.parquet'),\n",
    "        parallel='prefiltered',\n",
    "        hive_partitioning=True,\n",
    "        try_parse_hive_dates=False\n",
    "    )\n",
    "    .filter(\n",
    "        pl.col('tld').is_in(no_churn),\n",
    "        pl.col('acq').eq('2025-01-15')\n",
    "    )\n",
    "    .select(['tld','size','kballoc'])\n",
    "    .with_columns(pl.col('tld').cast(pl.Categorical))\n",
    "    .collect(engine='streaming')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "inactive_storage = storage_df.group_by('tld').agg(pl.sum('size','kballoc'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "inactive_storage['kballoc'].sum()/(1024**3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import plotly.graph_objects as go\n",
    "\n",
    "def convert_colormap(rgb):\n",
    "    r, g, b = (rgb*255).astype(int)\n",
    "    return f\"rgb({r},{g},{b})\"\n",
    "\n",
    "colormap = cm.oslo_r.colors\n",
    "colorscale = [[i / 255, convert_colormap(colormap[i])] for i in range(256)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Churn\n",
    "These plots only include the directories where at least one file was churned during the time period. This ignores directories where files were accessed but never changed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "active = df.filter(pl.col('tld').is_in(no_churn).not_()).clone()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "order = (\n",
    "    active\n",
    "    .group_by('tld')\n",
    "    .agg([\n",
    "        pl.col('total_churn').ne(0).sum()\n",
    "    ])\n",
    "    .sort('total_churn', descending=True)\n",
    ")['tld'].to_list()"
   "metadata": {},
   "source": [
    "#### Total Churn (File Count) Timeseries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "fig = go.Figure(\n",
    "    data = go.Heatmap(\n",
    "        z = np.log10(active['total_churn']),\n",
    "        y = active['log_dt'],\n",
    "        x = active['tld'],\n",
    "        xgap=2,\n",
    "        colorscale=colorscale,\n",
    "        colorbar=dict(\n",
    "            tickvals=np.arange(0,9),\n",
    "            ticktext=[str(10**d) for d in np.arange(0,9)],\n",
    "            tickfont=dict(\n",
    "                size = 14\n",
    "            ),\n",
    "            title=dict(\n",
    "                text='Churn (files altered)',\n",
    "                font=dict(\n",
    "                    size = 16\n",
    "                )\n",
    "            )\n",
    "        ),\n",
    "        hovertemplate='Dir: %{x}<br>Date: %{y}<br>Churn: %{customdata}<extra></extra>',\n",
    "        customdata=active['total_churn']\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = fig.update_layout(\n",
    "    template = 'plotly_white',\n",
    "    height = 1000,\n",
    "    width = 2000,\n",
    "    title_text = 'Time Course of Total Churn For Project Directories Over 2 Months',\n",
    "    title_x = 0.5,\n",
    "    title_xanchor = 'center',\n",
    "    title_font_size = 30,\n",
    "\n",
    "    xaxis = dict(\n",
    "        title = dict(\n",
    "            text = 'Directory Name',\n",
    "            font_size = 20\n",
    "        ),\n",
    "        gridwidth = 2,\n",
    "        showgrid = True,\n",
    "        gridcolor='black'\n",
    "    ),\n",
    "    \n",
    "    yaxis = dict(\n",
    "        showgrid = False,\n",
    "        title = dict(\n",
    "            text = 'Policy Run Date',\n",
    "            font_size = 20\n",
    "        ),\n",
    "        gridcolor = 'black',\n",
    "    ),\n",
    "    \n",
    "    coloraxis_colorbar=dict(\n",
    "        title=\"Raw Values\",  # Change the title of the z-axis\n",
    "        title_font=dict(size=20)  # Increase the font size\n",
    "    ),\n",
    "\n",
    "    margin=dict(t=100, b=20, l=40, r=40)\n",
    ")\n",
    "\n",
    "fig = fig.update_xaxes(\n",
    "    categoryorder='array',\n",
    "    categoryarray=order,\n",
    "    tickfont={'size':14},\n",
    "    ticklabelshift = 3,\n",
    "    tickson = 'boundaries',\n",
    "    gridwidth=2\n",
    ")\n",
    "\n",
    "fig = fig.update_yaxes(\n",
    "    tickfont={'size':16},\n",
    "    tickformat = \"%Y-%m-%d\",\n",
    "    tick0 = '2024-11-15',\n",
    "    ticklabelstep=2,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Total Churn (Bytes) Timeseries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "f2 = go.Figure(\n",
    "    data = go.Heatmap(\n",
    "        z = np.log2(active['total_churn_bytes']),\n",
    "        y = active['log_dt'],\n",
    "        x = active['tld'],\n",
    "        xgap=2,\n",
    "        colorscale=colorscale,\n",
    "        colorbar=dict(\n",
    "            tickvals=np.log2([1, 1024, 1024**2, 1024**3, 1024**4, 100 * 1024**4]),\n",
    "            ticktext=['1 B', '1 KiB', '1 MiB', '1 GiB', '1 TiB', '100 TiB'],\n",
    "            tickmode='array',\n",
    "            tickfont=dict(\n",
    "                size = 14\n",
    "            ),\n",
    "            title=dict(\n",
    "                text='Churn (bytes altered)',\n",
    "                font=dict(\n",
    "                    size = 16\n",
    "                )\n",
    "            )\n",
    "        ),\n",
    "        hovertemplate='Dir: %{x}<br>Date: %{y}<br>Churn: %{customdata}<extra></extra>',\n",
    "        customdata=active['total_churn_bytes']\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "f2 = f2.update_layout(\n",
    "    template = 'plotly_white',\n",
    "    height = 1000,\n",
    "    width = 2000,\n",
    "    title_text = 'Time Course of Total Churn (Bytes) For Project Directories Over 2 Months',\n",
    "    title_x = 0.5,\n",
    "    title_xanchor = 'center',\n",
    "    title_font_size = 30,\n",
    "\n",
    "    xaxis = dict(\n",
    "        title = dict(\n",
    "            text = 'Directory Name',\n",
    "            font_size = 20\n",
    "        ),\n",
    "        gridwidth = 2,\n",
    "        showgrid = True,\n",
    "        gridcolor='black'\n",
    "    ),\n",
    "    \n",
    "    yaxis = dict(\n",
    "        showgrid = False,\n",
    "        title = dict(\n",
    "            text = 'Policy Run Date',\n",
    "            font_size = 20\n",
    "        ),\n",
    "        gridcolor = 'black',\n",
    "    ),\n",
    "    \n",
    "    coloraxis_colorbar=dict(\n",
    "        title=\"Raw Values\",  # Change the title of the z-axis\n",
    "        title_font=dict(size=20)  # Increase the font size\n",
    "    ),\n",
    "\n",
    "    margin=dict(t=100, b=20, l=40, r=40)\n",
    ")\n",
    "\n",
    "f2 = f2.update_xaxes(\n",
    "    categoryorder='array',\n",
    "    categoryarray=order,\n",
    "    tickfont={'size':14},\n",
    "    ticklabelshift = 3,\n",
    "    tickson = 'boundaries',\n",
    "    gridwidth=2\n",
    ")\n",
    "\n",
    "f2 = f2.update_yaxes(\n",
    "    tickfont={'size':16},\n",
    "    tickformat = \"%Y-%m-%d\",\n",
    "    tick0 = '2024-11-15',\n",
    "    ticklabelstep=2,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "f2.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Files Accessed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "accessed_no_churn = tld_agg.filter(pl.col('total_churn').eq(0), pl.col('accessed').gt(0)).select('tld')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "accessed_no_churn_order = (\n",
    "    df\n",
    "    .filter(pl.col('tld').is_in(accessed_no_churn))\n",
    "    .group_by('tld')\n",
    "    .agg(pl.col('accessed').ne(0).sum())\n",
    "    .sort('accessed',descending=True)\n",
    ")['tld'].to_list()"
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "accessed_order = order + accessed_no_churn_order"
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tick_text = [f'<span style=\"color:black\">{project}</span>' for project in order] + \\\n",
    "            [f'<span style=\"color:red\">{project}</span>' for project in accessed_no_churn_order]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "accessed_df = df.filter(pl.col('tld').is_in(accessed_order))"
   "execution_count": null,
   "source": [
    "f3 = go.Figure(\n",
    "    data = go.Heatmap(\n",
    "        z = np.log10(accessed_df['accessed']),\n",
    "        y = accessed_df['log_dt'],\n",
    "        x = accessed_df['tld'],\n",
    "        xgap=2,\n",
    "        colorscale=colorscale,\n",
    "        colorbar=dict(\n",
    "            title=\"Count\",\n",
    "            tickvals=list(range(7)),\n",
    "            ticktext=[10**n for n in range(7)],\n",
    "            tickmode='array',\n",
    "            tickfont=dict(\n",
    "                size = 14\n",
    "            )\n",
    "        ),\n",
    "        hovertemplate='Dir: %{x}<br>Date: %{y}<br>Accessed: %{customdata}<extra></extra>',\n",
    "        customdata=accessed_df['accessed']\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "f3 = f3.update_layout(\n",
    "    template = 'plotly_white',\n",
    "    height = 1000,\n",
    "    width = 2200,\n",
    "    title = dict(\n",
    "        text = 'Time Course of Files Accessed But Not Churned For Projects Since 2024-11-14',\n",
    "        x = 0.5,\n",
    "        xanchor = 'center',\n",
    "        font_size = 30\n",
    "    ),\n",
    "\n",
    "    xaxis = dict(\n",
    "        title = dict(\n",
    "            text = 'Directory Name',\n",
    "            font_size = 20\n",
    "        ),\n",
    "        gridwidth = 2,\n",
    "        showgrid = True,\n",
    "        gridcolor='black',\n",
    "        categoryorder='array',\n",
    "        categoryarray=accessed_order,\n",
    "        tickvals=list(range(0,len(tick_text))),\n",
    "        ticktext=tick_text,\n",
    "        tickfont={'size':14},\n",
    "        ticklabelshift = 3,\n",
    "        tickson = 'boundaries'\n",
    "    ),\n",
    "    \n",
    "    yaxis = dict(\n",
    "        showgrid = False,\n",
    "        title = dict(\n",
    "            text = 'Policy Run Date',\n",
    "            font_size = 20\n",
    "        ),\n",
    "        gridcolor = 'black',\n",
    "        tickfont={'size':16},\n",
    "        tickformat = \"%Y-%m-%d\",\n",
    "        tick0 = '2024-11-15',\n",
    "        ticklabelstep=2,\n",
    "    ),\n",
    "\n",
    "    margin=dict(t=100, b=20, l=40, r=40)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}