Commit 2886ea60 authored by John-Paul Robinson

Clean up repo for Jupyter notebooks

Add ignore rule for notebook cache directory.
Remove outputs, execution counts, and other notebook JSON elements
that cause spurious diffs.
This is based on the automatic jq cleaning at commit described in:

http://timstaley.co.uk/posts/making-git-and-jupyter-notebooks-play-nice/
parent e4b6bf28
Merge request !38: Clean up repo for Jupyter notebooks
@@ -8,6 +8,7 @@ err/
 # Ignore cache directories
 __pycache__
+.ipynb_checkpoints/
 # Ignore quarto outputs
 quarto*
@@ -27,4 +28,4 @@ poetry.toml
 .vscode
 # Ignore random extra files
-extra/
\ No newline at end of file
+extra/
%% Cell type:code id: tags:
``` python
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
import dask
import dask.dataframe as dd
import cudf
import pandas as pd
from pandas.tseries.offsets import DateOffset
import re
from pathlib import Path
dask.config.set({"dataframe.backend": "cudf"})
```
%% Output
<dask.config.set at 0x2aaab64e3150>
%% Cell type:code id: tags:
``` python
parquet_ds = '/data/rc/gpfs-policy/data/list-policy_data-user_2024-09-18/parquet'
run_date = pd.to_datetime(re.search(r'[\d]{4}-[\d]{2}-[\d]{2}',parquet_ds).group(),format='%Y-%m-%d')
delta_vals = [i for i in range(1,19)]
delta_unit = 'M' # 'D','W','M','Y'
report_dir = Path(parquet_ds).parent.joinpath('reports')
report_dir.mkdir(exist_ok=True)
report_dir.chmod(mode = 0o2770) # user/group rwx plus the setgid bit so new files inherit the group
```
%% Cell type:markdown id: tags:
## Setup Cluster and Data
%% Cell type:code id: tags:
``` python
cluster = LocalCUDACluster(threads_per_worker=10,device_memory_limit=None)
client=Client(cluster)
```
%% Cell type:code id: tags:
``` python
ddf = dd.read_parquet(parquet_ds,split_row_groups=False)
```
%% Cell type:markdown id: tags:
### Define Cutoffs In DateTimes
%% Cell type:code id: tags:
``` python
def create_timedelta(val, unit):
    # Build a calendar-aware offset for the requested unit
    if unit == 'D':    # Days
        return DateOffset(days=val)
    elif unit == 'W':  # Weeks
        return DateOffset(weeks=val)
    elif unit == 'M':  # Months
        return DateOffset(months=val)
    elif unit == 'Y':  # Years
        return DateOffset(years=val)
    else:
        raise ValueError(f"Unknown delta unit: {unit}")
```
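%% Cell type:markdown id: tags:
`DateOffset` is calendar-aware, so subtracting months lands on the same day of the month (clipped to month end) rather than a fixed number of days. A quick sketch, not part of the original analysis:
%% Cell type:code id: tags:
``` python
# Hypothetical check: month offsets respect calendar month lengths
pd.Timestamp('2024-09-18') - DateOffset(months=6)  # Timestamp('2024-03-18 00:00:00')
pd.Timestamp('2024-03-31') - DateOffset(months=1)  # clips to Timestamp('2024-02-29 00:00:00')
```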
%% Cell type:code id: tags:
``` python
deltas = pd.Series([create_timedelta(c, delta_unit) for c in delta_vals])
cutoffs = pd.to_datetime(run_date - deltas)
cutoffs = (
    pd.concat(
        [
            cutoffs,
            pd.Series([run_date, pd.to_datetime('1970-01-01')])
        ]
    )
    .astype('int64')  # nanosecond epochs, usable directly as numeric bin edges
    .sort_values()
    .to_list()
)
```
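%% Cell type:markdown id: tags:
The cutoffs are ascending int64 nanosecond epochs bracketed by 1970-01-01 and the run date, so they can serve directly as numeric bin edges. A hypothetical way to inspect them:
%% Cell type:code id: tags:
``` python
# Convert a few numeric bin edges back to timestamps (assumes the cell above has run)
[pd.to_datetime(c) for c in cutoffs[:3]]
```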
%% Cell type:code id: tags:
``` python
def create_cutoff_group_labels(vals, unit):
    # Largest offset first, since it represents the oldest group;
    # sorted() avoids mutating the caller's list
    vals = sorted(vals, reverse=True)
    deltas = [f'{d}{unit}' for d in vals]
    groups = [f'>{deltas[0]}']
    for i in range(len(deltas) - 1):
        groups.append(f'{deltas[i+1]}-{deltas[i]}')
    groups.append(f'<{deltas[-1]}')
    return groups
```
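%% Cell type:markdown id: tags:
As a quick sanity check (a sketch, assuming the cell above has run): for three monthly offsets the labels come out oldest-first.
%% Cell type:code id: tags:
``` python
create_cutoff_group_labels([1, 2, 3], 'M')  # ['>3M', '2M-3M', '1M-2M', '<1M']
```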
%% Cell type:code id: tags:
``` python
grp_labels = create_cutoff_group_labels(delta_vals,delta_unit)
```
%% Cell type:code id: tags:
``` python
# Store access times as int64 nanosecond epochs so they can be binned numerically
ddf['access_epoch'] = ddf['access'].astype('int64')
```
%% Cell type:code id: tags:
``` python
def _cut(ser, bins, labels, with_cuda, **kwargs) -> pd.Series | cudf.Series:
    # Bin a series of access epochs into age groups on GPU (cudf) or CPU (pandas)
    right = kwargs.pop('right', False)
    if with_cuda:
        func = cudf.cut
        ser = ser.astype('int64')  # cudf.cut expects numeric input
    else:
        func = pd.cut
    grps = func(ser, bins=bins, labels=labels, right=right, **kwargs)
    if labels is not None:
        # Impose an explicit, ordered category order
        grps = grps.cat.reorder_categories(labels[::-1], ordered=True)
    return grps
```
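%% Cell type:markdown id: tags:
A small CPU-path check of `_cut` (a sketch, assuming the definitions above have run): files accessed in 1970, exactly six months before the run date, and the day before the run date should land in the oldest, `5M-6M`, and `<1M` groups.
%% Cell type:code id: tags:
``` python
# Hypothetical toy access times, converted to int64 nanosecond epochs
toy = pd.Series(pd.to_datetime(['1970-01-01', '2024-03-18', '2024-09-17'])).astype('int64')
_cut(toy, cutoffs, grp_labels, with_cuda=False)  # expected groups: >18M, 5M-6M, <1M
```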
%% Cell type:code id: tags:
``` python
ddf['dt_grp'] = ddf['access_epoch'].map_partitions(_cut, cutoffs, grp_labels, with_cuda=True)
```
%% Cell type:code id: tags:
``` python
# Pull one partition to the client to spot-check the binning
v = ddf.get_partition(0).compute()
```
%% Cell type:code id: tags:
``` python
_cut(v['access_epoch'], cutoffs, grp_labels, with_cuda=True)
```
%% Output
4743970 >18M
4744054 >18M
4744053 >18M
4744052 >18M
4744051 >18M
...
523698 4M-5M
523696 5M-6M
523629 5M-6M
523625 5M-6M
523712 4M-5M
Length: 5000000, dtype: category
Categories (19, object): ['>18M' < '17M-18M' < '16M-17M' < '15M-16M' ... '3M-4M' < '2M-3M' < '1M-2M' < '<1M']
%% Cell type:markdown id: tags:
## Breakdown by TLD and ATime Cutoffs
%% Cell type:code id: tags:
``` python
df_agg = ddf.groupby(['tld','dt_grp'],observed=True)['size'].agg(['sum','count']).compute()
df_agg = df_agg.sort_index(level=[0,1]).to_pandas().reset_index()
```
%% Cell type:code id: tags:
``` python
# as_ordered returns a new Series rather than modifying in place, so assign it back
df_agg['dt_grp'] = df_agg['dt_grp'].cat.as_ordered()
```
%% Cell type:code id: tags:
``` python
df_agg['dt_grp'] = df_agg['dt_grp'].cat.reorder_categories(grp_labels[::-1], ordered = True)
```
%% Cell type:code id: tags:
``` python
df_agg = df_agg.rename(columns={'sum':'bytes','count':'file_count'})
```
%% Cell type:code id: tags:
``` python
df_agg.to_parquet(report_dir.joinpath('tld_atime-age_agg.parquet'))
```
%% Cell type:markdown id: tags:
### TLD and File Age Plots
%% Cell type:code id: tags:
``` python
import pandas as pd
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
from plotly import colors
import plotting
import importlib
```
%% Cell type:code id: tags:
``` python
df_agg = pd.read_parquet('/data/rc/gpfs-policy/data/list-policy_data-user_2024-09-18/reports/tld_atime-age_agg.parquet')
age_agg = df_agg.groupby('dt_grp',observed=True,as_index=False)[['file_count','bytes']].sum()
```
%% Cell type:code id: tags:
``` python
exp,unit = plotting.choose_appropriate_storage_unit(age_agg['bytes'])
```
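%% Cell type:markdown id: tags:
`choose_appropriate_storage_unit` comes from the repo's own `plotting` module; judging from how it is used below, it returns a power-of-1024 exponent and a matching unit label. A hypothetical sketch of the idea, not the repo's actual implementation:
%% Cell type:code id: tags:
``` python
def _choose_storage_unit_sketch(byte_counts):
    # Pick the largest binary prefix that keeps the maximum value at or above 1
    units = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB']
    exp = 0
    max_bytes = float(byte_counts.max())
    while max_bytes >= 1024 and exp < len(units) - 1:
        max_bytes /= 1024
        exp += 1
    return exp, units[exp]
```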
%% Cell type:code id: tags:
``` python
age_agg[unit] = age_agg['bytes']/(1024**exp)
age_agg[['file_count_cum',f'{unit}_cum']] = age_agg[['file_count',unit]].cumsum()
age_agg[[unit,f'{unit}_cum']] = age_agg[[unit,f'{unit}_cum']].round(3)
```
%% Cell type:code id: tags:
``` python
importlib.reload(plotting)
storage_plot = plotting.create_bar_plot(df=age_agg,x='dt_grp',y=[unit,f'{unit}_cum'],textposition='outside',
title=f'{unit} per atime Group', xlabel='Access Time Age', ylabel=f'Storage Used ({unit})')
storage_plot.show()
```
%% Cell type:code id: tags:
``` python
importlib.reload(plotting)
legend_labels = ['Raw','Cumulative']
cols = ['file_count','file_count_cum']
file_count_plot = plotting.create_bar_plot(df=age_agg,x='dt_grp',y=cols,legend_labels=legend_labels,
textposition='outside',text_decimals=0, title='File Count per atime Group',
xlabel='Access Time Age', ylabel='File Count')
file_count_plot.show()
```
%% Cell type:code id: tags:
``` python
importlib.reload(plotting)
pareto = plotting.create_pareto_chart(df=age_agg,x='dt_grp',y=unit, title='Storage per atime Group',
xlabel='Access Time Age', ylabel=f'Storage Used {unit}',textposition_scatter='top left')
#pareto.update_layout(width = 800)
```
%% Cell type:code id: tags:
``` python
pareto.show()
```