{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import sqlalchemy\n", "import pandas as pd\n", "import numpy as np\n", "from pathlib import Path\n", "import cudf" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "hive_dir = '/data/rc/gpfs-policy/data/gpfs-hive/data-project/'\n", "db = Path('/data/rc/gpfs-policy/data/gpfs-hive/db/data-project.db')\n", "engine = sqlalchemy.create_engine(f\"sqlite:///{db}\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "df = pd.read_sql(\"SELECT * FROM churn WHERE prior_log_dt >= '2024-11-14'\",engine)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "df['total_churn'] = df['created'] + df['deleted'] + df['modified']\n", "df[['log_dt','prior_log_dt']] = df[['log_dt','prior_log_dt']].apply(lambda x: pd.to_datetime(x))\n", "df['tld'] = df['tld'].astype('category')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "tld_agg = df.groupby('tld',observed=True)['total_churn'].sum().sort_values(ascending=False)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "no_churn = tld_agg.loc[tld_agg.eq(0)].index" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "cdf = cudf.read_parquet(hive_dir,filters = [('tld','in',no_churn.to_list()),('acq','==','2025-01-15')],columns=['tld','size','kballoc'],categorical_partitions=True)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "cdf['tld'] = cdf['tld'].astype('category')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "inactive_storage = cdf.groupby('tld',observed=True)[['size','kballoc']].sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "inactive_storage['kballoc'].divide(1024**3).sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Plotting" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "active = df.loc[~df['tld'].isin(no_churn.to_list())].copy()\n", "active['tld'] = active['tld'].cat.remove_unused_categories()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# order by total churn over the whole time period\n", "order = active.groupby('tld',observed=True)['total_churn'].sum().sort_values(ascending=False).index.as_ordered()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# order by daily activity, percentage of days in the time period where at least one change was made\n", "order = active.groupby('tld',observed=True)['total_churn'].apply(lambda x: x.ne(0).sum()).sort_values(ascending=False).index.as_ordered()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "import plotly.graph_objects as go" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig = go.Figure(\n", " data = go.Heatmap(\n", " z = np.log10(active['total_churn']),\n", " y = active['log_dt'],\n", " x = active['tld'],\n", " xgap=1,\n", " colorscale='thermal',\n", " colorbar=dict(\n", " tickvals=np.arange(0,9),\n", " ticktext=[str(10**d) for d in np.arange(0,9)],\n", " tickfont=dict(\n", " size = 14\n", " ),\n", " title=dict(\n", " text='Churn (files altered)',\n", " font=dict(\n", " size = 16\n", " )\n", " )\n", " ),\n", " hovertemplate='Dir: %{x}<br>Date: %{y}<br>Churn: %{customdata}<extra></extra>',\n", " customdata=active['total_churn']\n", " )\n", ")" ] }, { "cell_type": "code", "execution_count": 163, "metadata": {}, "outputs": [], "source": [ "fig = fig.update_layout(\n", " template = 'plotly_white',\n", " height = 1000,\n", " width = 2000,\n", " title_text = 'Time Course of Total Churn For Project Directories Over 2 Months',\n", " title_x = 0.5,\n", " title_xanchor = 'center',\n", " title_font_size = 30,\n", "\n", " xaxis = dict(\n", " title = dict(\n", " text = 'Directory Name',\n", " font_size = 20\n", " ),\n", " gridwidth = 2,\n", " showgrid = True,\n", " gridcolor='black'\n", " ),\n", " \n", " yaxis = dict(\n", " showgrid = False,\n", " title = dict(\n", " text = 'Policy Run Date',\n", " font_size = 20\n", " ),\n", " gridcolor = 'black',\n", " ),\n", " \n", " coloraxis_colorbar=dict(\n", " title=\"Raw Values\", # Change the title of the z-axis\n", " titlefont=dict(size=20) # Increase the font size\n", " ),\n", "\n", " margin=dict(t=100, b=20, l=40, r=40)\n", ")\n", "\n", "fig = fig.update_xaxes(\n", " categoryorder='array',\n", " categoryarray=order,\n", " tickfont={'size':14},\n", " ticklabelshift = 3,\n", " tickson = 'boundaries',\n", " gridwidth=2\n", ")\n", "\n", "fig = fig.update_yaxes(\n", " tickfont={'size':16},\n", " tickformat = \"%Y-%m-%d\",\n", " tick0 = '2024-11-15',\n", " ticklabelstep=2,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.11" } }, "nbformat": 4, "nbformat_minor": 2 }