Skip to content
Snippets Groups Projects
Commit 104dc621 authored by John-Paul Robinson's avatar John-Paul Robinson
Browse files

Create notebook to merge multiple atime reports into single report

Takes a directory of atime reports generated by an array job and
merges them into a single per-user report.
parent df8ade2c
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id:0f3842e0 tags:
```
import datetime
import pandas as pd
import matplotlib.pyplot as plt
from urllib.parse import unquote
import sys
import os
import glob
# Label printed in the header of the final report.
report_name = "temporary-scratch"
# Directory containing the report files to merge.
# NOTE(review): `dir` shadows the builtin of the same name; kept as-is in case
# other cells rely on it.
dir = "data/list-17075953.list.gather-info.d"
#report_name = "temporary-scratch"
#dir = 'data/list-17094088.list.gather-info.d'

# Scratch variables; not referenced by any of the cells visible here.
recs = []
count = 0
progress = 0

# Every later cell uses paths relative to the report directory.
os.chdir(dir)
```
%% Cell type:code id:3b8c0042 tags:
```
#os.chdir('../..')
```
%% Cell type:code id:b379845f tags:
```
!pwd
```
%% Cell type:code id:d3d9f107 tags:
```
!which python
```
%% Cell type:code id:c2a94675 tags:
```
# Combine the pickled per-file dataframes into one dataframe.
# https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html
# NOTE: pd.read_pickle can execute arbitrary code while unpickling; only point
# this at report files produced by our own jobs, never at untrusted data.
frames = []
# sorted() makes the processing order reproducible; glob's order is arbitrary.
for file in sorted(glob.glob("*.gz")):
    print(f"processing: {file}")
    frames.append(pd.read_pickle(file))

if not frames:
    # pd.concat([]) would otherwise fail with "No objects to concatenate",
    # which does not say where the files were expected.
    raise FileNotFoundError(f"no *.gz report files found in {os.getcwd()}")

df = pd.concat(frames)
# Drop the list so the per-file frames are not kept alive next to the merged copy.
del frames
```
%% Cell type:code id:a6a4e9bf tags:
```
# Print a summary of the merged dataframe as a sanity check on the load.
df.info()
```
%% Cell type:code id:4ca6e641 tags:
```
# Most recent access time across every record in the merged data.
newest_access = df["atime"].max()
print(f"max atime: {newest_access}\n")
```
%% Cell type:code id:cc576380 tags:
```
# Latest access time per uid, shown in ascending order of that time.
(
    df.groupby(["uid"], sort=False)["atime"]
    .max()
    .sort_values()
)
```
%% Cell type:markdown id:1a482ac6 tags:
Look up user names from numeric uids; see https://stackoverflow.com/a/421670/8928529
%% Cell type:code id:7c4e531e tags:
```
import pwd
def getuser(uid):
    """Return the login name for a numeric user id.

    Args:
        uid: User id; anything ``int()`` accepts (plain int, numpy integer,
            digit string).

    Returns:
        The login name from the password database, or the uid rendered as a
        decimal string when the database has no entry for it.
    """
    uid_num = int(uid)
    try:
        # pw_name is the same field the original read via index 0.
        return pwd.getpwuid(uid_num).pw_name
    except KeyError:
        # No passwd entry for this uid (e.g. a removed account): fall back to
        # the number so one stale uid does not abort the whole report.
        return str(uid_num)
```
%% Cell type:code id:6980fc30 tags:
```
# set uname for uid
# Resolve each distinct uid once, then fill the column in a single vectorized
# pass instead of one boolean-mask assignment per uid.
uid_to_uname = {}
for uid in sorted(df["uid"].unique()):
    try:
        uname = getuser(uid)
    except KeyError:
        # No passwd entry for this uid: report it under its number rather
        # than failing the whole cell.
        uname = str(uid)
    print("uid: {} name: {}".format(uid, uname))
    uid_to_uname[uid] = uname
df["uname"] = df["uid"].map(uid_to_uname)
```
%% Cell type:code id:29af2c16 tags:
```
# Total of the "size" column per user, smallest total first.
space = (
    df.groupby(["uname"], sort=False)["size"]
    .sum()
    .sort_values()
    .to_frame()
)
```
%% Cell type:code id:77b361b1 tags:
```
# Derive decimal (GB, 1000-based) and binary (GiB, 1024-based) columns.
# Presumably "size" is in bytes -- confirm against the report generator.
for unit, step in (("GB", 1000), ("GiB", 1024)):
    space[unit] = space["size"] / step / step / step
```
%% Cell type:code id:90b897a5 tags:
```
# Rebuild the per-user size totals, smallest total first.
space = df.groupby(["uname"], sort=False)["size"].sum().sort_values().to_frame()
# Rebuilding `space` discards any columns derived from it, so re-derive the
# unit columns here; the report cells select "GB" and "GiB" and would raise
# KeyError without them.
space["GB"] = space["size"] / 1000 / 1000 / 1000
space["GiB"] = space["size"] / 1024 / 1024 / 1024
```
%% Cell type:code id:5065d98e tags:
```
# Display the per-user size totals.
space
```
%% Cell type:code id:1bf8adb7 tags:
```
# Latest access time per user, in ascending order of that time.
atime = (
    df.groupby(["uname"], sort=False)["atime"]
    .max()
    .sort_values()
    .to_frame()
)
```
%% Cell type:code id:fb33c1de tags:
```
# The column holds each user's latest access time, so label it "newest".
atime = atime.rename(columns={"atime": "newest"})
```
%% Cell type:code id:ce376ec2 tags:
```
# Earliest access time per user. Column assignment aligns on the uname index,
# so the result needs neither sorting nor conversion to a DataFrame.
atime["oldest"] = df.groupby(["uname"], sort=False)["atime"].min()
```
%% Cell type:code id:6fd5346a tags:
```
# Add date-only versions of both timestamps by round-tripping through a
# YYYY-MM-DD string, which drops the time of day.
for label in ("newest", "oldest"):
    day_text = atime[label].dt.strftime('%Y-%m-%d')
    atime[f"{label}_date"] = pd.to_datetime(day_text)
```
%% Cell type:code id:422ecc0c tags:
```
# Put the access-time and size tables side by side, matched on the user index.
report = pd.concat((atime, space), axis="columns")
```
%% Cell type:code id:1c979329 tags:
```
# Remove the row display limit so every row of the report is shown.
pd.options.display.max_rows = None
```
%% Cell type:code id:32139106 tags:
```
# Render floats with thousands separators and two decimals.
# https://stackoverflow.com/a/20937592/8928529
pd.set_option("display.float_format", '{:,.2f}'.format)
```
%% Cell type:code id:8d40d60f tags:
```
# Show only the columns that make up the per-user summary.
report.loc[:, ["newest_date", "oldest_date", "GB", "GiB"]]
```
%% Cell type:code id:871995b1 tags:
```
# Plain-text version of the report: header, per-user table, then grand totals.
summary_columns = ["newest_date", "oldest_date", "GB", "GiB"]
print(f"report: {report_name}")
print(report[summary_columns])
total_gb = report['GB'].sum()
total_gib = report['GiB'].sum()
print("--------\ntotals: {0:,.2f}GB {1:,.2f}GiB".format(total_gb, total_gib))
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment