In [1]:
import sqlalchemy
import pandas as pd
import numpy as np
from pathlib import Path
import cudf
import colormaps as cm

In [2]:
hive_dir = '/data/rc/gpfs-policy/data/gpfs-hive/data-project/'
db = Path('/data/rc/gpfs-policy/data/gpfs-hive/db/data-project.db')
engine = sqlalchemy.create_engine(f"sqlite:///{db}")

In [3]:
df = pd.read_sql("SELECT * FROM churn WHERE prior_log_dt >= '2024-11-14'",engine)

In [4]:
df['total_churn'] = df['created'] + df['deleted'] + df['modified']
df['total_churn_bytes'] = df['created_bytes'] + df['deleted_bytes'] + df['modified_bytes_net']
df[['log_dt','prior_log_dt']] = df[['log_dt','prior_log_dt']].apply(lambda x: pd.to_datetime(x))
df['tld'] = df['tld'].astype('category')

In [5]:
tld_agg = df.groupby('tld',observed=True)[['total_churn','total_churn_bytes','accessed','accessed_bytes']].sum().sort_values('total_churn',ascending=False)

In [91]:
no_churn = tld_agg.loc[tld_agg['total_churn'].eq(0)].index

In [7]:
cdf = cudf.read_parquet(hive_dir,filters = [('tld','in',no_churn.to_list()),('acq','==','2025-01-15')],columns=['tld','size','kballoc'],categorical_partitions=True)

In [8]:
cdf['tld'] = cdf['tld'].astype('category')

In [9]:
inactive_storage = cdf.groupby('tld',observed=True)[['size','kballoc']].sum()

In [10]:
inactive_storage['kballoc'].divide(1024**3).sum()

np.float64(447.3787513971329)

## Plotting

In [96]:
import plotly.graph_objects as go

def convert_colormap(rgb):
    r, g, b = (rgb*255).astype(int)
    return f"rgb({r},{g},{b})"

colormap = cm.oslo_r.colors
colorscale = [[i / 255, convert_colormap(colormap[i])] for i in range(256)]

### Churn
These plots only include the directories where at least one file was churned during the time period. This ignores directories where files were accessed but never changed

In [7]:
active = df.loc[~df['tld'].isin(no_churn.to_list())].copy()
active['tld'] = active['tld'].cat.remove_unused_categories()

In [9]:
# order by daily activity, percentage of days in the time period where at least one change was made
order = active.groupby('tld',observed=True)['total_churn'].apply(lambda x: x.ne(0).sum()).sort_values(ascending=False).index.as_ordered()

#### Total Churn (File Count) Timeseries

In [197]:
fig = go.Figure(
    data = go.Heatmap(
        z = np.log10(active['total_churn']),
        y = active['log_dt'],
        x = active['tld'],
        xgap=2,
        colorscale=colorscale,
        colorbar=dict(
            tickvals=np.arange(0,9),
            ticktext=[str(10**d) for d in np.arange(0,9)],
            tickfont=dict(
                size = 14
            ),
            title=dict(
                text='Churn (files altered)',
                font=dict(
                    size = 16
                )
            )
        ),
        hovertemplate='Dir: %{x}<br>Date: %{y}<br>Churn: %{customdata}<extra></extra>',
        customdata=active['total_churn']
    )
)


divide by zero encountered in log10



In [198]:
fig = fig.update_layout(
    template = 'plotly_white',
    height = 1000,
    width = 2000,
    title_text = 'Time Course of Total Churn For Project Directories Over 2 Months',
    title_x = 0.5,
    title_xanchor = 'center',
    title_font_size = 30,

    xaxis = dict(
        title = dict(
            text = 'Directory Name',
            font_size = 20
        ),
        gridwidth = 2,
        showgrid = True,
        gridcolor='black'
    ),
    
    yaxis = dict(
        showgrid = False,
        title = dict(
            text = 'Policy Run Date',
            font_size = 20
        ),
        gridcolor = 'black',
    ),
    
    coloraxis_colorbar=dict(
        title="Raw Values",  # Change the title of the z-axis
        titlefont=dict(size=20)  # Increase the font size
    ),

    margin=dict(t=100, b=20, l=40, r=40)
)

fig = fig.update_xaxes(
    categoryorder='array',
    categoryarray=order,
    tickfont={'size':14},
    ticklabelshift = 3,
    tickson = 'boundaries',
    gridwidth=2
)

fig = fig.update_yaxes(
    tickfont={'size':16},
    tickformat = "%Y-%m-%d",
    tick0 = '2024-11-15',
    ticklabelstep=2,
)

In [199]:
fig.show()

### Total Churn (Bytes) Timeseries

In [66]:
f2 = go.Figure(
    data = go.Heatmap(
        z = np.log2(active['total_churn_bytes']),
        y = active['log_dt'],
        x = active['tld'],
        xgap=2,
        colorscale=colorscale,
        colorbar=dict(
            tickvals=np.log2([1, 1024, 1024**2, 1024**3, 1024**4, 100 * 1024**4]),
            ticktext=['1 B', '1 KiB', '1 MiB', '1 GiB', '1 TiB', '100 TiB'],
            tickmode='array',
            tickfont=dict(
                size = 14
            ),
            title=dict(
                text='Churn (bytes altered)',
                font=dict(
                    size = 16
                )
            )
        ),
        hovertemplate='Dir: %{x}<br>Date: %{y}<br>Churn: %{customdata}<extra></extra>',
        customdata=active['total_churn_bytes']
    )
)


divide by zero encountered in log2


invalid value encountered in log2



In [67]:
f2 = f2.update_layout(
    template = 'plotly_white',
    height = 1000,
    width = 2000,
    title_text = 'Time Course of Total Churn (Bytes) For Project Directories Over 2 Months',
    title_x = 0.5,
    title_xanchor = 'center',
    title_font_size = 30,

    xaxis = dict(
        title = dict(
            text = 'Directory Name',
            font_size = 20
        ),
        gridwidth = 2,
        showgrid = True,
        gridcolor='black'
    ),
    
    yaxis = dict(
        showgrid = False,
        title = dict(
            text = 'Policy Run Date',
            font_size = 20
        ),
        gridcolor = 'black',
    ),
    
    coloraxis_colorbar=dict(
        title="Raw Values",  # Change the title of the z-axis
        titlefont=dict(size=20)  # Increase the font size
    ),

    margin=dict(t=100, b=20, l=40, r=40)
)

f2 = f2.update_xaxes(
    categoryorder='array',
    categoryarray=order,
    tickfont={'size':14},
    ticklabelshift = 3,
    tickson = 'boundaries',
    gridwidth=2
)

f2 = f2.update_yaxes(
    tickfont={'size':16},
    tickformat = "%Y-%m-%d",
    tick0 = '2024-11-15',
    ticklabelstep=2,
)

### Files Accessed

In [160]:
accessed_no_churn = tld_agg.loc[(tld_agg['total_churn'].eq(0)) & (tld_agg['accessed'].gt(0))].index
accessed_no_churn_order = (
    df
    .loc[df['tld'].isin(accessed_no_churn)]
    .groupby('tld',observed=True)['accessed']
    .apply(lambda x: x.ne(0).sum())
    .sort_values(ascending=False)
)

In [145]:
accessed_order = order.to_list() + accessed_no_churn_order

In [146]:
tick_text = [f'<span style="color:black">{project}</span>' for project in order.to_list()] + \
            [f'<span style="color:red">{project}</span>' for project in accessed_no_churn_order]

In [147]:
accessed_df = df.loc[df['tld'].isin(accessed_order)]

In [153]:
f3 = go.Figure(
    data = go.Heatmap(
        z = np.log10(accessed_df['accessed']),
        y = accessed_df['log_dt'],
        x = accessed_df['tld'],
        xgap=2,
        colorscale=colorscale,
        colorbar=dict(
            title="Count",
            tickvals=list(range(7)),
            ticktext=[10**n for n in range(7)],
            tickmode='array',
            tickfont=dict(
                size = 14
            )
        ),
        hovertemplate='Dir: %{x}<br>Date: %{y}<br>Accessed: %{customdata}<extra></extra>',
        customdata=accessed_df['accessed']
    )
)


divide by zero encountered in log10



In [158]:
f3 = f3.update_layout(
    template = 'plotly_white',
    height = 1000,
    width = 2200,
    title = dict(
        text = 'Time Course of Files Accessed But Not Churned For Projects Since 2024-11-14',
        x = 0.5,
        xanchor = 'center',
        font_size = 30
    ),

    xaxis = dict(
        title = dict(
            text = 'Directory Name',
            font_size = 20
        ),
        gridwidth = 2,
        showgrid = True,
        gridcolor='black',
        categoryorder='array',
        categoryarray=accessed_order,
        tickvals=list(range(0,len(tick_text))),
        ticktext=tick_text,
        tickfont={'size':14},
        ticklabelshift = 3,
        tickson = 'boundaries'
    ),
    
    yaxis = dict(
        showgrid = False,
        title = dict(
            text = 'Policy Run Date',
            font_size = 20
        ),
        gridcolor = 'black',
        tickfont={'size':16},
        tickformat = "%Y-%m-%d",
        tick0 = '2024-11-15',
        ticklabelstep=2,
    ),

    margin=dict(t=100, b=20, l=40, r=40)
)

In [159]:
f3.show()