# run report on pickled list policy data

The script reads the pickled files matching `glob_pattern` from `pickledir` (derived from `dirname`), runs the report, and saves it as a csv in the "`dirname`/reports" subdirectory by default.

Some progress info is available via the `verbose` flag.

The current report aggregates storage stats by top-level directory and by the year in which the data was last accessed. The goal of this report is to understand the distribution of less frequently used data.

!conda info --envs

!conda list

!pip list -freeze

In [None]:
import datetime
import pandas as pd
import matplotlib.pyplot as plt
from urllib.parse import unquote
import sys
import os
import pathlib
import re

## input vars

In [None]:
# Default run parameters; later cells in this file reassign the same names.
dirname = ""  # base directory; the pickle and report directories are derived from it
glob_pattern = "*.gz"  # file name glob pattern to match, can be file name for individual file
line_regex_filter = ".*"  # regex to match lines of interest in file
pickledir = f"{dirname}/pickles"
reportdir = f"{dirname}/reports"
tldpath = "/"

verbose = False
# The aggregation loop reads `limit`, so it needs a default here as well.
limit = 0  # maximum number of files to process; 0 means no limit

In [None]:
# Settings for the list-17404604 dataset.
verbose = True  # print progress while processing
limit = 0  # 0 = process every matching file

dirname = "data/list-17404604.list.gather-info.d"  # dataset base directory
pickledir = dirname + "/pickles"  # where the pickled inputs are read from
reportdir = dirname + "/reports"  # where the report files are written
tldpath = "/data/projects"  # directory whose immediate children are the aggregation buckets

glob_pattern = "*.gz"  # glob for the pickle files to read (may be a single file name)
# NOTE(review): not referenced by the code visible in this file.
line_regex_filter = ".*"  # regex for lines of interest

In [None]:
# Settings for the list-20859348 dataset.
verbose = True  # print progress while processing
limit = 0  # 0 = process every matching file

dirname = "data/list-20859348.list.gather-info.d"  # dataset base directory
pickledir = dirname + "/pickles"  # where the pickled inputs are read from
reportdir = dirname + "/reports"  # where the report files are written
tldpath = "/data/project/datascienceteam"  # directory whose immediate children are the aggregation buckets

glob_pattern = "*.gz"  # glob for the pickle files to read (may be a single file name)
# NOTE(review): not referenced by the code visible in this file.
line_regex_filter = ".*"  # regex for lines of interest

In [None]:
# Settings for the list-20191520 dataset.
verbose = True  # print progress while processing
limit = 0  # 0 = process every matching file

dirname = "data/list-20191520.list.gather-info.d"  # dataset base directory
pickledir = dirname + "/pickles"  # where the pickled inputs are read from
reportdir = dirname + "/reports"  # where the report files are written
tldpath = "/data/project/thymelab"  # directory whose immediate children are the aggregation buckets

glob_pattern = "*.gz"  # glob for the pickle files to read (may be a single file name)
# NOTE(review): not referenced by the code visible in this file.
line_regex_filter = ".*"  # regex for lines of interest

## Utilities

In [None]:
# get top level dir on which to aggregate

def get_tld(df, dirname):
    """Add a ``tld`` column naming the directory directly below ``dirname``.

    Args:
        df: DataFrame with a string ``path`` column of "/"-separated paths.
            The paths are assumed to lie under ``dirname``; the prefix
            itself is not checked, only its depth is used.
        dirname: Parent directory whose immediate children are the
            aggregation buckets, e.g. ``"/data/projects"``. ``"/"`` and
            trailing slashes are accepted.

    Returns:
        The same ``df`` object, modified in place, with ``tld`` set to the
        path component that follows ``dirname``. Rows whose path has no
        such component get NaN.
    """
    # Strip trailing slashes so "/" and "/a/b/" do not contribute an empty
    # last component, which would select a level one directory too deep.
    depth = len(dirname.rstrip("/").split("/"))
    # Cap the number of splits: nothing past the component at `depth` is used.
    parts = df["path"].str.split("/", n=depth + 1)
    # .str.get yields NaN for paths that are too short instead of raising.
    df["tld"] = parts.str.get(depth)

    return df

In [None]:
def report_tld_year(df):
    """Summarise ``size`` per ``tld`` and calendar year of ``access``.

    Args:
        df: DataFrame with ``tld``, ``size`` and datetime ``access`` columns.

    Returns:
        DataFrame indexed by (``tld``, ``access`` year) whose two-level
        columns are ``("size", "sum")`` and ``("size", "count")``.
    """
    access_year = df["access"].dt.year
    grouped = df.groupby(["tld", access_year])
    return grouped.agg({"size": ["sum", "count"]})

## Read and parse the files according to glob_pattern

In [None]:
# Collect the pickle files to process as plain path strings. They are sorted
# so the processing order (and therefore which files a non-zero `limit`
# keeps) is reproducible instead of depending on directory listing order.
dirpath = pathlib.Path(pickledir)

files = sorted(str(path) for path in dirpath.glob(glob_pattern))

## Aggregate stats into running totals

In [None]:
# Build one small (tld, access-year) summary per pickle and stack them; the
# stacked per-file rows still contain duplicate (tld, year) keys at this point.
# NOTE(review): pd.read_pickle can execute arbitrary code while loading, so
# pickledir must only contain pickles produced by a trusted pipeline.
reports = []

for count, file in enumerate(files):
    # Stop before loading anything once `limit` files have been summarised
    # (limit == 0 disables the cap).
    if limit and count == limit:
        break
    if verbose:
        print(f"parse: {file}")
    summary = report_tld_year(get_tld(pd.read_pickle(file), tldpath))
    # roll up into running total https://stackoverflow.com/a/55828762/8928529
    reports.append(summary)

if not reports:
    # pd.concat([]) would otherwise fail with "No objects to concatenate".
    raise ValueError(
        f"no pickle files matching {glob_pattern!r} found in {pickledir}"
    )

report = pd.concat(reports)

In [None]:
# Collapse the stacked per-file rows into one row per (tld, access year).
report = report.groupby(level=["tld", "access"]).sum()

## Create final report

Create summary format for gigabyte and terabyte columns https://stackoverflow.com/a/20937592/8928529

In [None]:
# Keep only the second element of each column label tuple as the column name.
report.columns = [labels[1] for labels in report.columns.to_list()]

In [None]:
# Mean size per entry: summed size divided by the number of entries counted.
report["average_size"] = report["sum"].div(report["count"])

In [None]:
# Summed size scaled by 10**12, stored as text with thousands separators and
# two decimals (the column holds strings after this cell).
report["terabytes"] = (report["sum"] / 10**12).map("{:,.2f}".format)

In [None]:
# Summed size scaled by 10**9, stored as text with thousands separators and
# two decimals (the column holds strings after this cell).
report["gigabytes"] = (report["sum"] / 10**9).map("{:,.2f}".format)

In [None]:
# Bare expression: evaluates the report table so it is shown as the cell output.
report

In [None]:
# Echo the report table to stdout when progress output is enabled.
if verbose:
    print(report)

## Save report as CSV

In [None]:
# Only create the report directory if there is data to report.
# makedirs(exist_ok=True) also creates missing parent directories and does
# not fail when the directory already exists.
if len(report):
    os.makedirs(reportdir, exist_ok=True)

In [None]:
# Bare expression: evaluates the report directory path so it is shown as the cell output.
reportdir

In [None]:
# Write the report under reportdir, once as CSV and once as a pickle.
if verbose:
    print("report: groupby-tld")
report.to_csv(f"{reportdir}/groupby-tld.csv.gz")
report.to_pickle(f"{reportdir}/groupby-tld-year.pkl.gz")

In [None]:
# Row(s) whose summed size equals the largest summed size in the report.
report[report["sum"].eq(report["sum"].max())]

In [None]:
# Rows whose summed size exceeds 5 * 10**13.
report[report["sum"].gt(5 * 10**13)]

In [None]:
# Move the index levels into ordinary columns; the cells below group on the
# "tld" column and filter on the "access" column.
report=report.reset_index()

In [None]:
# Total size and entry count per top-level directory across all access years.
# Only the additive columns are selected: summing access years or per-group
# averages is meaningless, and the terabytes/gigabytes columns hold formatted
# strings. A second positional argument to agg() is forwarded to the
# aggregation function, so it must not be passed here.
summer = report.groupby("tld")[["sum", "count"]].sum()

In [None]:
# Per-directory total scaled by 10**12, stored as text with thousands
# separators and two decimals.
summer["terabytes"] = (summer["sum"] / 10**12).map("{:,.2f}".format)

In [None]:
# Directories whose total exceeds 10**13, largest first, showing only the
# entry count and the formatted terabytes.
print(
    summer.loc[summer["sum"] > 10**13]
    .sort_values("sum", ascending=False)
    .loc[:, ["count", "terabytes"]]
)

In [None]:
# Rows over 10**13 in summed size whose access year is 2021 or earlier.
report[report["sum"].gt(10**13) & report["access"].le(2021)]

In [None]:
# Combined size of the rows over 10**13 whose access year is 2021 or earlier.
report.loc[report["sum"].gt(10**13) & report["access"].le(2021), "sum"].sum()

In [None]:
# Combined size of the rows at or below 10**13 whose access year is 2021 or earlier.
report.loc[report["sum"].le(10**13) & report["access"].le(2021), "sum"].sum()

In [None]:
# Combined size, scaled by 10**12, of the rows over 10**13 whose access year is before 2023.
report.loc[report["sum"].gt(10**13) & report["access"].lt(2023), "sum"].sum() / 10**12