Skip to content
Snippets Groups Projects
churn-analysis.ipynb 1.84 MiB
Newer Older
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sqlalchemy\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from pathlib import Path\n",
    "import cudf\n",
    "import colormaps as cm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "hive_dir = '/data/rc/gpfs-policy/data/gpfs-hive/data-project/'\n",
    "db = Path('/data/rc/gpfs-policy/data/gpfs-hive/db/data-project.db')\n",
    "engine = sqlalchemy.create_engine(f\"sqlite:///{db}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_sql(\"SELECT * FROM churn WHERE prior_log_dt >= '2024-11-14'\",engine)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['total_churn'] = df['created'] + df['deleted'] + df['modified']\n",
    "df['total_churn_bytes'] = df['created_bytes'] + df['deleted_bytes'] + df['modified_bytes_net']\n",
    "df[['log_dt','prior_log_dt']] = df[['log_dt','prior_log_dt']].apply(lambda x: pd.to_datetime(x))\n",
    "df['tld'] = df['tld'].astype('category')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "tld_agg = df.groupby('tld',observed=True)[['total_churn','total_churn_bytes','accessed','accessed_bytes']].sum().sort_values('total_churn',ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "no_churn = tld_agg.loc[tld_agg['total_churn'].eq(0)].index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "cdf = cudf.read_parquet(hive_dir,filters = [('tld','in',no_churn.to_list()),('acq','==','2025-01-15')],columns=['tld','size','kballoc'],categorical_partitions=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "cdf['tld'] = cdf['tld'].astype('category')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "inactive_storage = cdf.groupby('tld',observed=True)[['size','kballoc']].sum()"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "np.float64(447.3787513971329)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "inactive_storage['kballoc'].divide(1024**3).sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "import plotly.graph_objects as go\n",
    "\n",
    "def convert_colormap(rgb):\n",
    "    r, g, b = (rgb*255).astype(int)\n",
    "    return f\"rgb({r},{g},{b})\"\n",
    "\n",
    "colormap = cm.oslo_r.colors\n",
    "colorscale = [[i / 255, convert_colormap(colormap[i])] for i in range(256)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Churn\n",
    "These plots only include the directories where at least one file was churned during the time period. This ignores directories where files were accessed but never changed"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "active = df.loc[~df['tld'].isin(no_churn.to_list())].copy()\n",
    "active['tld'] = active['tld'].cat.remove_unused_categories()"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [],
   "source": [
    "# order by daily activity, percentage of days in the time period where at least one change was made\n",
    "order = active.groupby('tld',observed=True)['total_churn'].apply(lambda x: x.ne(0).sum()).sort_values(ascending=False).index.as_ordered()"
   ]
  },
  {
   "metadata": {},
   "source": [
    "#### Total Churn (File Count) Timeseries"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/mdefende/.conda/envs/gpfs/lib/python3.11/site-packages/pandas/core/arraylike.py:399: RuntimeWarning:\n",
      "\n",
      "divide by zero encountered in log10\n",
      "\n"
     ]
    }
   ],
   "source": [
    "fig = go.Figure(\n",
    "    data = go.Heatmap(\n",
    "        z = np.log10(active['total_churn']),\n",
    "        y = active['log_dt'],\n",
    "        x = active['tld'],\n",
    "        xgap=2,\n",
    "        colorscale=colorscale,\n",
    "        colorbar=dict(\n",
    "            tickvals=np.arange(0,9),\n",
    "            ticktext=[str(10**d) for d in np.arange(0,9)],\n",
    "            tickfont=dict(\n",
    "                size = 14\n",
Loading
Loading full blame...