# Notebook to explore parsing of the gpfs policy outputs

This is a collection of cells to understand data.
No particular endpoint in mind.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

This is the format of each line in the policy output:

 5001:000fffffffffffff:0000000000004741:4b8f012b:0:2c172b:10002:0:40!basedir/path/to/file:13!scratch_tier1;253!|size=444|kballoc=0|access=2022-01-01 06:58:37.177440|create=2022-01-01 06:21:33.356110|modify=2022-01-01 06:23:47.011273|uid=10973|gid=10973|heat=+0.00000000000000E+000|pool=scratch_tier1|path=/rootdir/basedir/path/to/file|misc=FAu|

In [None]:
file="data/mmapplypolicy.61746.962D9400.list.no_extern_list_list-30day-with-excludes_slurm-12551165_2022-03-03-04:00:09"

In [None]:
file="data/mmapplypolicy.54197.413B7AB5.list.no_extern_list_list-only-temporary-scratch_slurm-12790116_2022-03-14-18:47:51"

In [None]:
file="data/mmapplypolicy.120904.9DBFF7E6.list.no_extern_list_list-30day-with-excludes_slurm-13113652_2022-04-05-04:00:28"

In [None]:
file="data/mmapplypolicy.35838.667249E1.list.no_extern_list_list-30day-with-excludes_slurm-15685457_2022-08-23-04:00:23"

In [None]:
# Each earlier cell also assigns `file`; on a top-to-bottom run this last assignment is the one in effect.
file="data/mmapplypolicy.41557.67790FB6.list.no_extern_list_list-path_slurm-15844227_2022-08-29-13:24:52"

## Parser functions

First we define the structure of the file, then the columns we want to use.

In [None]:
# Column names for the "|"-separated fields of one policy-output line.
# The first field is the record prefix that precedes the first "|".
fields = [
    'ignore',
    'size', 'kballoc',
    'atime', 'ctime', 'mtime',
    'uid', 'gid',
    'heat', 'pool', 'path', 'misc',
]

# Load everything except the leading 'ignore' prefix.
usecols = fields[1:]

In [None]:
def splitter(x):
    '''
    Split a single name=value field on the first "=" and return the value.

    Only the first "=" acts as the separator, so a value that itself
    contains "=" (for example a path) is returned intact.  A field with no
    "=" at all raises IndexError.
    '''
    # No per-field print here: this runs once per field of every row.
    return x.split("=", 1)[1]

Set up a splitters dictionary to process all the used fields with the splitter function.
https://realpython.com/python-defaultdict/

In [None]:
# One converter per loaded column; every column uses the same name=value splitter.
splitters = dict.fromkeys(usecols, splitter)

In [None]:
%%time
df = pd.read_csv(file,
 lineterminator='\n',
 sep="|", header=0, 
 #on_bad_lines="warn", 
 index_col=False,
 #nrows=1000000,
 names=fields,
 usecols=usecols,
 converters=splitters,
 parse_dates=['atime', 'ctime', 'mtime'],
 )

In [None]:
df.info()

Clean up data types for numeric values

In [None]:
# The converters leave these columns as strings; cast them to integers together.
df = df.astype({column: "int" for column in ('size', 'kballoc', 'uid', 'gid')})

In [None]:
df.head(3)

Quick summary of the total storage allocated to and used by 30+ day files

In [None]:
# Total of the kballoc column divided by 1024 (MiB if kballoc is in KiB — TODO confirm units).
df["kballoc"].sum()/1024

In [None]:
# Total of the size column divided by 1024*1024 (MiB if size is in bytes — TODO confirm units).
df["size"].sum()/1024/1024

In [None]:
df["atime"].min()

In [None]:
df[["atime","uid"]].sort_values(by="atime")

In [None]:
df[["uid","size"]].groupby("uid").sum()/1000/1000/1000/1000

In [None]:
(df[["uid","size"]].groupby("uid").sum()/1000/1000/1000/1000).sum()

In [None]:
df["atime"].sort_values().head()

In [None]:
df["uid"].head()

In [None]:
df["misc"].unique()

In [None]:
# True where the misc flag string contains 'F' (presumably the marker for regular files — TODO confirm).
df["isfile"]=df["misc"].str.contains('F')

In [None]:
# Number of distinct uids in the listing.
df["uid"].nunique()

In [None]:
df["uid"].unique()

Get usernames from uid values via the pwd password db iteration module https://stackoverflow.com/a/421670/8928529

In [None]:
import pwd

In [None]:
pwd.getpwuid(12137)[0].split(":")

In [None]:
def getuser(uid):
    '''
    Resolve a numeric uid to its login name via the password database.

    uid may be an int or any value int() accepts.  When the uid has no
    passwd entry, the uid itself is returned as a string instead of
    raising KeyError, so one unknown owner does not abort a bulk lookup.
    '''
    try:
        return pwd.getpwuid(int(uid)).pw_name
    except KeyError:
        return str(uid)

In [None]:
getuser(10973)

In [None]:
# add new column for resolved uids
df["uname"]=""

In [None]:
# set uname for uid
# Resolve each distinct uid once, then fill the column in a single pass
# instead of building one boolean mask over the whole frame per uid.
uid_names = {}
for uid in sorted(df["uid"].unique()):
    try:
        uname = pwd.getpwuid(int(uid)).pw_name
    except KeyError:
        # No passwd entry for this uid: keep the number so the loop can finish.
        uname = str(uid)
    print("uid: {} name: {}".format(uid, uname))
    uid_names[uid] = uname
df["uname"] = df["uid"].map(uid_names)

In [None]:
df[df["uid"]==10005]

In [None]:
sorted(df["heat"].unique())

In [None]:
df["path"] = df["path"].astype("str")

In [None]:
# Split each path on "/" at most 4 times and keep pieces 1, 3 and 4 as the
# fs, scratchdir and filename columns.  `n` must be passed by keyword:
# pandas 2.x rejects it as a positional argument to str.split.
path_parts = (
    df["path"].str.split("/", n=4, expand=True)[[1, 3, 4]]
    .rename(columns={1: "fs", 3: "scratchdir", 4: "filename"})
)
# Drop columns left by an earlier run so re-executing this cell does not duplicate them.
df = pd.concat(
    [df.drop(columns=list(path_parts.columns), errors="ignore"), path_parts],
    axis=1,
)

In [None]:
# Corrects a misspelled "sratchdir" column name; rename() ignores missing labels by default,
# so this is a no-op when the frame has no column with that spelling.
df = df.rename(columns={"sratchdir": "scratchdir"})

In [None]:
df.columns

In [None]:
# Per-scratchdir totals; summing the boolean isfile column counts the flagged rows.
userdata = df.groupby("scratchdir")[["size", "kballoc", "isfile"]].sum()

In [None]:
userdata

In [None]:
userdata["size"]/1000/1000/1000

In [None]:
# Pieces 3 and 4 of the path split; `n` is keyword-only in pandas 2.x.
df["path"].str.split("/", n=4, expand=True)[[3, 4]]

In [None]:
# All pieces of the path split; `n` is keyword-only in pandas 2.x.
df["path"].str.split("/", n=4, expand=True)

In [None]:
bytesdays=df[["atime","size"]]

In [None]:
bd=bytesdays.set_index("atime")

In [None]:
# Aggregate to one row per calendar day of the atime index, summing the sizes in each day.
bd=bd.resample('D').sum()

In [None]:
# Running total of the daily sizes.  The column is selected explicitly so the
# cell still works when re-run after bd has gained further columns.
bd["sum"] = bd["size"].cumsum()

In [None]:
bd[:"2022-02-15"]

In [None]:
# Sum the daily size and the running-total column over non-empty days up to 2022-01-01.
# The two columns are selected explicitly so the two-value unpacking keeps
# working once bd has more than two columns.
# NOTE(review): `gb` receives the sum of the cumulative "sum" column, not a value
# scaled to gigabytes — confirm this is the intended quantity.
size, gb = bd.loc[bd["size"] > 0, ["size", "sum"]].loc[:"2022-01-01"].sum()

In [None]:
gb

In [None]:
bd.loc[:"2021-12-31"].sum()

In [None]:
bd.loc[:"2022-01-01"].sum()

In [None]:
bd.loc["2022-01-01":]

In [None]:
bd[bd["size"]>0]/1024/1024/1024 #.plot()

In [None]:
bd["gb"] = bd["sum"]/1024/1024/1024

In [None]:
bd["gb"]

In [None]:
# Keep only the rows from 2021-10-01 onward.
b2d = bd.loc["2021-10-01":]

In [None]:
1024*1024*1024*1024

In [None]:
# Centered 7-row rolling sum of the gb column; windows with fewer than 7 rows (the edges) yield NaN.
bd7=b2d[["gb"]].rolling(7, center=True).sum()

In [None]:
# Plot the cumulative size and its 7-day rolling sum.
fig, ax = plt.subplots()
# b2d["gb"] is the running total of daily sizes scaled by 1024**3, so label it as cumulative.
ax.plot(b2d["gb"], color='brown', linewidth=2, label='Cumulative Size')
# bd7 is computed with rolling(7).sum(), so the legend says sum rather than average.
ax.plot(bd7["gb"], color='black', linewidth=1, label='Trend (7 day Rolling Sum)')
ax.legend()
ax.set_ylabel('Size (GBytes)')
ax.set_title('Cheaha Trends in Scratch Usage');