Skip to content
Snippets Groups Projects
churn-analysis.ipynb 6.64 KiB
Newer Older
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sqlalchemy\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from pathlib import Path\n",
    "import cudf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "hive_dir = '/data/rc/gpfs-policy/data/gpfs-hive/data-project/'\n",
    "db = Path('/data/rc/gpfs-policy/data/gpfs-hive/db/data-project.db')\n",
    "engine = sqlalchemy.create_engine(f\"sqlite:///{db}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_sql(\"SELECT * FROM churn WHERE prior_log_dt >= '2024-11-14'\",engine)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['total_churn'] = df['created'] + df['deleted'] + df['modified']\n",
    "df[['log_dt','prior_log_dt']] = df[['log_dt','prior_log_dt']].apply(lambda x: pd.to_datetime(x))\n",
    "df['tld'] = df['tld'].astype('category')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "tld_agg = df.groupby('tld',observed=True)['total_churn'].sum().sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "no_churn = tld_agg.loc[tld_agg.eq(0)].index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "cdf = cudf.read_parquet(hive_dir,filters = [('tld','in',no_churn.to_list()),('acq','==','2025-01-15')],columns=['tld','size','kballoc'],categorical_partitions=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "cdf['tld'] = cdf['tld'].astype('category')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "inactive_storage = cdf.groupby('tld',observed=True)[['size','kballoc']].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "inactive_storage['kballoc'].divide(1024**3).sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Plotting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "active = df.loc[~df['tld'].isin(no_churn.to_list())].copy()\n",
    "active['tld'] = active['tld'].cat.remove_unused_categories()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# order by total churn over the whole time period\n",
    "order = active.groupby('tld',observed=True)['total_churn'].sum().sort_values(ascending=False).index.as_ordered()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# order by daily activity, percentage of days in the time period where at least one change was made\n",
    "order = active.groupby('tld',observed=True)['total_churn'].apply(lambda x: x.ne(0).sum()).sort_values(ascending=False).index.as_ordered()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "import plotly.graph_objects as go"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = go.Figure(\n",
    "    data = go.Heatmap(\n",
    "        z = np.log10(active['total_churn']),\n",
    "        y = active['log_dt'],\n",
    "        x = active['tld'],\n",
    "        xgap=1,\n",
    "        colorscale='thermal',\n",
    "        colorbar=dict(\n",
    "            tickvals=np.arange(0,9),\n",
    "            ticktext=[str(10**d) for d in np.arange(0,9)],\n",
    "            tickfont=dict(\n",
    "                size = 14\n",
    "            ),\n",
    "            title=dict(\n",
    "                text='Churn (files altered)',\n",
    "                font=dict(\n",
    "                    size = 16\n",
    "                )\n",
    "            )\n",
    "        ),\n",
    "        hovertemplate='Dir: %{x}<br>Date: %{y}<br>Churn: %{customdata}<extra></extra>',\n",
    "        customdata=active['total_churn']\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = fig.update_layout(\n",
    "    template = 'plotly_white',\n",
    "    height = 1000,\n",
    "    width = 2000,\n",
    "    title_text = 'Time Course of Total Churn For Project Directories Over 2 Months',\n",
    "    title_x = 0.5,\n",
    "    title_xanchor = 'center',\n",
    "    title_font_size = 30,\n",
    "\n",
    "    xaxis = dict(\n",
    "        title = dict(\n",
    "            text = 'Directory Name',\n",
    "            font_size = 20\n",
    "        ),\n",
    "        gridwidth = 2,\n",
    "        showgrid = True,\n",
    "        gridcolor='black'\n",
    "    ),\n",
    "    \n",
    "    yaxis = dict(\n",
    "        showgrid = False,\n",
    "        title = dict(\n",
    "            text = 'Policy Run Date',\n",
    "            font_size = 20\n",
    "        ),\n",
    "        gridcolor = 'black',\n",
    "    ),\n",
    "    \n",
    "    coloraxis_colorbar=dict(\n",
    "        title=\"Raw Values\",  # Change the title of the z-axis\n",
    "        titlefont=dict(size=20)  # Increase the font size\n",
    "    ),\n",
    "\n",
    "    margin=dict(t=100, b=20, l=40, r=40)\n",
    ")\n",
    "\n",
    "fig = fig.update_xaxes(\n",
    "    categoryorder='array',\n",
    "    categoryarray=order,\n",
    "    tickfont={'size':14},\n",
    "    ticklabelshift = 3,\n",
    "    tickson = 'boundaries',\n",
    "    gridwidth=2\n",
    ")\n",
    "\n",
    "fig = fig.update_yaxes(\n",
    "    tickfont={'size':16},\n",
    "    tickformat = \"%Y-%m-%d\",\n",
    "    tick0 = '2024-11-15',\n",
    "    ticklabelstep=2,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}