Commit 2886ea60 authored by John-Paul Robinson

Clean up repo for Jupyter notebooks

Add ignore rule for notebook cache directory.
Remove outputs, execution counts, and other notebook JSON elements
that cause spurious diffs.
This is based on the automatic jq cleaning at commit described in:

http://timstaley.co.uk/posts/making-git-and-jupyter-notebooks-play-nice/
parent e4b6bf28
Merge request !38: Clean up repo for Jupyter notebooks
@@ -8,6 +8,7 @@ err/
 # Ignore cache directories
 __pycache__
+.ipynb_checkpoints/
 # Ignore quarto outputs
 quarto*
@@ -27,4 +28,4 @@ poetry.toml
 .vscode
 # Ignore random extra files
-extra/
\ No newline at end of file
+extra/
%% Cell type:code id: tags:
``` python
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
import dask
import dask.dataframe as dd
import cudf
import pandas as pd
from pandas.tseries.offsets import DateOffset
import re
from pathlib import Path
dask.config.set({"dataframe.backend": "cudf"})
```
%% Output
<dask.config.set at 0x2aaab64e3150>
%% Cell type:code id: tags:
``` python
parquet_ds = '/data/rc/gpfs-policy/data/list-policy_data-user_2024-09-18/parquet'
run_date = pd.to_datetime(re.search(r'[\d]{4}-[\d]{2}-[\d]{2}',parquet_ds).group(),format='%Y-%m-%d')
delta_vals = [i for i in range(1,19)]
delta_unit = 'M' # 'D','W','M','Y'
report_dir = Path(parquet_ds).parent.joinpath('reports')
report_dir.mkdir(exist_ok=True)
report_dir.chmod(mode = 0o2770) # user/group rwx plus the setgid bit so new files inherit the group
```
%% Cell type:markdown id: tags:
## Setup Cluster and Data
%% Cell type:code id: tags:
``` python
cluster = LocalCUDACluster(threads_per_worker=10,device_memory_limit=None)
client=Client(cluster)
```
%% Cell type:code id: tags:
``` python
ddf = dd.read_parquet(parquet_ds,split_row_groups=False)
```
%% Cell type:markdown id: tags:
### Define Cutoffs In DateTimes
%% Cell type:code id: tags:
``` python
def create_timedelta(val, unit):
    # Build a calendar-aware offset for the requested unit
    if unit == 'D':    # Days
        return DateOffset(days=val)
    elif unit == 'W':  # Weeks
        return DateOffset(weeks=val)
    elif unit == 'M':  # Months
        return DateOffset(months=val)
    elif unit == 'Y':  # Years
        return DateOffset(years=val)
    else:
        raise ValueError(f"Unknown delta unit: {unit}")
```
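%% Cell type:markdown id: tags:
`DateOffset` is calendar-aware, so subtracting months lands on the same day of the month (clipped to month end) rather than a fixed number of days. A quick sketch, not part of the original analysis:
%% Cell type:code id: tags:
``` python
# Hypothetical check: month offsets respect calendar month lengths
pd.Timestamp('2024-09-18') - DateOffset(months=6)  # Timestamp('2024-03-18 00:00:00')
pd.Timestamp('2024-03-31') - DateOffset(months=1)  # clips to Timestamp('2024-02-29 00:00:00')
```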
%% Cell type:code id: tags:
``` python
deltas = pd.Series([create_timedelta(c, delta_unit) for c in delta_vals])
cutoffs = pd.to_datetime(run_date - deltas)
cutoffs = (
    pd.concat(
        [
            cutoffs,
            pd.Series([run_date, pd.to_datetime('1970-01-01')])
        ]
    )
    .astype('int64')  # nanosecond epochs, usable directly as numeric bin edges
    .sort_values()
    .to_list()
)
```
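%% Cell type:markdown id: tags:
The cutoffs are ascending int64 nanosecond epochs bracketed by 1970-01-01 and the run date, so they can serve directly as numeric bin edges. A hypothetical way to inspect them:
%% Cell type:code id: tags:
``` python
# Convert a few numeric bin edges back to timestamps (assumes the cell above has run)
[pd.to_datetime(c) for c in cutoffs[:3]]
```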
%% Cell type:code id: tags:
``` python
def create_cutoff_group_labels(vals, unit):
    # Largest offset first, since it represents the oldest group;
    # sorted() avoids mutating the caller's list
    vals = sorted(vals, reverse=True)
    deltas = [f'{d}{unit}' for d in vals]
    groups = [f'>{deltas[0]}']
    for i in range(len(deltas) - 1):
        groups.append(f'{deltas[i+1]}-{deltas[i]}')
    groups.append(f'<{deltas[-1]}')
    return groups
```
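%% Cell type:markdown id: tags:
As a quick sanity check (a sketch, assuming the cell above has run): for three monthly offsets the labels come out oldest-first.
%% Cell type:code id: tags:
``` python
create_cutoff_group_labels([1, 2, 3], 'M')  # ['>3M', '2M-3M', '1M-2M', '<1M']
```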
%% Cell type:code id: tags:
``` python
grp_labels = create_cutoff_group_labels(delta_vals,delta_unit)
```
%% Cell type:code id: tags:
``` python
# Store access times as int64 nanosecond epochs so they can be binned numerically
ddf['access_epoch'] = ddf['access'].astype('int64')
```
%% Cell type:code id: tags:
``` python
def _cut(ser, bins, labels, with_cuda, **kwargs) -> pd.Series | cudf.Series:
    # Bin a series of access epochs into age groups on GPU (cudf) or CPU (pandas)
    right = kwargs.pop('right', False)
    if with_cuda:
        func = cudf.cut
        ser = ser.astype('int64')  # cudf.cut expects numeric input
    else:
        func = pd.cut
    grps = func(ser, bins=bins, labels=labels, right=right, **kwargs)
    if labels is not None:
        # Impose an explicit, ordered category order
        grps = grps.cat.reorder_categories(labels[::-1], ordered=True)
    return grps
```
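%% Cell type:markdown id: tags:
A small CPU-path check of `_cut` (a sketch, assuming the definitions above have run): files accessed in 1970, exactly six months before the run date, and the day before the run date should land in the oldest, `5M-6M`, and `<1M` groups.
%% Cell type:code id: tags:
``` python
# Hypothetical toy access times, converted to int64 nanosecond epochs
toy = pd.Series(pd.to_datetime(['1970-01-01', '2024-03-18', '2024-09-17'])).astype('int64')
_cut(toy, cutoffs, grp_labels, with_cuda=False)  # expected groups: >18M, 5M-6M, <1M
```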
%% Cell type:code id: tags:
``` python
ddf['dt_grp'] = ddf['access_epoch'].map_partitions(_cut, cutoffs, grp_labels, with_cuda=True)
```
%% Cell type:code id: tags:
``` python
# Pull one partition to the client to spot-check the binning
v = ddf.get_partition(0).compute()
```
%% Cell type:code id: tags:
``` python
_cut(v['access_epoch'], cutoffs, grp_labels, with_cuda=True)
```
%% Output
4743970 >18M
4744054 >18M
4744053 >18M
4744052 >18M
4744051 >18M
...
523698 4M-5M
523696 5M-6M
523629 5M-6M
523625 5M-6M
523712 4M-5M
Length: 5000000, dtype: category
Categories (19, object): ['>18M' < '17M-18M' < '16M-17M' < '15M-16M' ... '3M-4M' < '2M-3M' < '1M-2M' < '<1M']
%% Cell type:markdown id: tags:
## Breakdown by TLD and ATime Cutoffs
%% Cell type:code id: tags:
``` python
df_agg = ddf.groupby(['tld','dt_grp'],observed=True)['size'].agg(['sum','count']).compute()
df_agg = df_agg.sort_index(level=[0,1]).to_pandas().reset_index()
```
%% Cell type:code id: tags:
``` python
# as_ordered returns a new Series rather than modifying in place, so assign it back
df_agg['dt_grp'] = df_agg['dt_grp'].cat.as_ordered()
```
%% Cell type:code id: tags:
``` python
df_agg['dt_grp'] = df_agg['dt_grp'].cat.reorder_categories(grp_labels[::-1], ordered = True)
```
%% Cell type:code id: tags:
``` python
df_agg = df_agg.rename(columns={'sum':'bytes','count':'file_count'})
```
%% Cell type:code id: tags:
``` python
df_agg.to_parquet(report_dir.joinpath('tld_atime-age_agg.parquet'))
```
%% Cell type:markdown id: tags:
### TLD and File Age Plots
%% Cell type:code id: tags:
``` python
import pandas as pd
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
from plotly import colors
import plotting
import importlib
```
%% Cell type:code id: tags:
``` python
df_agg = pd.read_parquet('/data/rc/gpfs-policy/data/list-policy_data-user_2024-09-18/reports/tld_atime-age_agg.parquet')
age_agg = df_agg.groupby('dt_grp',observed=True,as_index=False)[['file_count','bytes']].sum()
```
%% Cell type:code id: tags:
``` python
exp,unit = plotting.choose_appropriate_storage_unit(age_agg['bytes'])
```
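%% Cell type:markdown id: tags:
`choose_appropriate_storage_unit` comes from the repo's own `plotting` module; judging from how it is used below, it returns a power-of-1024 exponent and a matching unit label. A hypothetical sketch of the idea, not the repo's actual implementation:
%% Cell type:code id: tags:
``` python
def _choose_storage_unit_sketch(byte_counts):
    # Pick the largest binary prefix that keeps the maximum value at or above 1
    units = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB']
    exp = 0
    max_bytes = float(byte_counts.max())
    while max_bytes >= 1024 and exp < len(units) - 1:
        max_bytes /= 1024
        exp += 1
    return exp, units[exp]
```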
%% Cell type:code id: tags:
``` python
age_agg[unit] = age_agg['bytes']/(1024**exp)
age_agg[['file_count_cum',f'{unit}_cum']] = age_agg[['file_count',unit]].cumsum()
age_agg[[unit,f'{unit}_cum']] = age_agg[[unit,f'{unit}_cum']].round(3)
```
%% Cell type:code id: tags:
``` python
importlib.reload(plotting)
storage_plot = plotting.create_bar_plot(df=age_agg,x='dt_grp',y=[unit,f'{unit}_cum'],textposition='outside',
title=f'{unit} per atime Group', xlabel='Access Time Age', ylabel=f'Storage Used ({unit})')
storage_plot.show()
```
%% Cell type:code id: tags:
``` python
importlib.reload(plotting)
legend_labels = ['Raw','Cumulative']
cols = ['file_count','file_count_cum']
file_count_plot = plotting.create_bar_plot(df=age_agg,x='dt_grp',y=cols,legend_labels=legend_labels,
textposition='outside',text_decimals=0, title='File Count per atime Group',
xlabel='Access Time Age', ylabel='File Count')
file_count_plot.show()
```
%% Cell type:code id: tags:
``` python
importlib.reload(plotting)
pareto = plotting.create_pareto_chart(df=age_agg,x='dt_grp',y=unit, title='Storage per atime Group',
xlabel='Access Time Age', ylabel=f'Storage Used {unit}',textposition_scatter='top left')
#pareto.update_layout(width = 800)
```
%% Cell type:code id: tags:
``` python
pareto.show()
```