Skip to content
Snippets Groups Projects
Commit 84b2dfa4 authored by John-Paul Robinson's avatar John-Paul Robinson
Browse files

Merge branch 'feat-create-last-access-reports' into main

parents 28617e9f 104dc621
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python
import datetime
import pandas as pd
import matplotlib.pyplot as plt
from urllib.parse import unquote
import sys
# Print a progress line every this many parsed records.
PROGRESS_INTERVAL = 20000000
# Stop reading once more than this many records have been parsed.
MAX_RECORDS = 200000000


def parse_record(line):
    """Parse one URL-quoted list line into ``(props, fname)``.

    The line is expected to have the shape
    ``<field> <inode> <field> |key=value|...|key=value| -- <path>``.

    Args:
        line: One raw line from the list file (URL-quoted text).

    Returns:
        A tuple ``(props, fname)``. ``props`` maps each attribute name to
        its string value, plus ``"inode"``, with the ``heat``, ``pool``,
        ``mode`` and ``misc`` attributes removed. ``fname`` is the
        unquoted, stripped path after the ``" -- "`` separator.

    Raises:
        ValueError: If the line does not have the expected shape.
    """
    left, right = unquote(line).split(" -- ", 1)
    fname = right.strip()
    ident, meta = left.split('|', 1)
    _, inode, _ = ident.split()
    # Drop whatever follows the final '|' of the attribute section.
    meta, _ = meta.rsplit('|', 1)
    # maxsplit=1 keeps values that themselves contain '=' intact.
    props = dict(prop.split('=', 1) for prop in meta.split('|'))
    props["inode"] = inode
    # These attributes are not needed for the report; tolerate records
    # that do not carry them.
    for key in ("heat", "pool", "mode", "misc"):
        props.pop(key, None)
    return props, fname


def main(argv=None):
    """Parse the list file named on the command line and pickle the result.

    Reads ``argv[1]``, builds a DataFrame with one row per record, prints
    the newest access time per uid and writes the frame to
    ``<file>.gz`` with ``DataFrame.to_pickle``.

    Args:
        argv: Argument vector; defaults to ``sys.argv``.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit(f"usage: {argv[0]} <list-file>")
    file = argv[1]

    recs = []
    count = 0
    progress = 0
    with open(file) as gpfs_data:
        for line in gpfs_data:
            props, fname = parse_record(line)
            recs.append(props)
            count += 1
            progress += 1
            if progress >= PROGRESS_INTERVAL:
                print(f"{datetime.datetime.now()} progress: {count}: {fname}")
                progress = 0
            if count > MAX_RECORDS:
                break

    if not recs:
        # Nothing to convert or report; exit non-zero without writing output.
        sys.exit(f"{file}: no records found")

    df = pd.DataFrame(recs)
    df = df.rename(columns={'access': 'atime', 'create': 'ctime', 'modify': 'mtime'})
    # Every value was parsed as text; convert the numeric and time columns.
    for intcol in ['size', 'kballoc', 'uid', 'gid']:
        df[intcol] = df[intcol].astype("int")
    for timecol in ['atime', 'ctime', 'mtime']:
        df[timecol] = df[timecol].astype('datetime64[ns]')
    print(df.groupby(["uid"], sort=False)["atime"].max())
    df.to_pickle(f"{file}.gz")


if __name__ == "__main__":
    main()
%% Cell type:code id:0f3842e0 tags:
```
import datetime
import pandas as pd
import matplotlib.pyplot as plt
from urllib.parse import unquote
import sys
import os
import glob
# Record list and counters; the aggregation cells that follow do not read them.
recs = []
count = 0
progress = 0
# Label printed at the top of the final report.
report_name = "temporary-scratch"
# Directory holding the pickled chunks. Named data_dir rather than dir so the
# builtin dir() stays usable for the rest of the notebook session.
data_dir = "data/list-17075953.list.gather-info.d"
#report_name = "temporary-scratch"
#data_dir = 'data/list-17094088.list.gather-info.d'
os.chdir(data_dir)
```
%% Cell type:code id:3b8c0042 tags:
```
#os.chdir('../..')
```
%% Cell type:code id:b379845f tags:
```
# Show the current working directory (an earlier cell calls os.chdir).
!pwd
```
%% Cell type:code id:d3d9f107 tags:
```
# Print the path of the `python` executable found on the shell's PATH.
!which python
```
%% Cell type:code id:c2a94675 tags:
```
# Combine the pickled DataFrames (*.gz) in the current directory into one df.
# https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html
# NOTE: unpickling can execute arbitrary code; only load files you trust.
frames = []
# glob returns names in arbitrary order; sorting keeps the row order of the
# combined frame reproducible between runs.
for file in sorted(glob.glob("*.gz")):
    print(f"processing: {file}")
    frames.append(pd.read_pickle(file))
if not frames:
    # pd.concat([]) would fail with a less helpful "No objects to concatenate".
    raise FileNotFoundError(f"no *.gz pickles found in {os.getcwd()}")
df = pd.concat(frames)
# Drop the list so the per-file frames can be freed.
del frames
```
%% Cell type:code id:a6a4e9bf tags:
```
# Summarise the combined frame: columns, dtypes, non-null counts, memory use.
df.info()
```
%% Cell type:code id:4ca6e641 tags:
```
# Most recent access time across all records.
print(f"max atime: {df['atime'].max()}\n")
```
%% Cell type:code id:cc576380 tags:
```
# Newest access time per uid, listed from least to most recently active.
df.groupby(["uid"], sort=False)["atime"].max().sort_values()
```
%% Cell type:markdown id:1a482ac6 tags:
Look up the user name for each uid (approach from https://stackoverflow.com/a/421670/8928529).
%% Cell type:code id:7c4e531e tags:
```
import pwd
def getuser(uid):
    """Return the login name for *uid*, or the uid as a string if unknown.

    Args:
        uid: Numeric user id; anything ``int()`` accepts (int, numpy
            integer, digit string).

    Returns:
        The ``pw_name`` field of the password-database entry, or
        ``str(int(uid))`` when the database has no entry for that uid.
    """
    uid = int(uid)
    try:
        return pwd.getpwuid(uid).pw_name
    except KeyError:
        # pwd.getpwuid raises KeyError when the uid has no entry.
        return str(uid)
```
%% Cell type:code id:6980fc30 tags:
```
# set uname for uid
# Resolve each distinct uid once, then fill the column with a single map()
# instead of one boolean scan of the whole frame per uid.
uid_names = {}
for uid in sorted(df["uid"].unique()):
    try:
        uname = pwd.getpwuid(int(uid)).pw_name
    except KeyError:
        # No password-database entry for this uid; fall back to the number
        # so its files still appear in the report.
        uname = str(uid)
    print("uid: {} name: {}".format(uid, uname))
    uid_names[uid] = uname
df["uname"] = df["uid"].map(uid_names)
```
%% Cell type:code id:29af2c16 tags:
```
# Sum of the size column per user, smallest total first, as a one-column frame.
space = (
    df.groupby(["uname"], sort=False)["size"]
    .sum()
    .sort_values()
    .to_frame()
)
```
%% Cell type:code id:77b361b1 tags:
```
# Per-user totals in decimal gigabytes (divide by 1000 three times);
# assumes the size column is in bytes - TODO confirm.
space["GB"]=space["size"]/1000/1000/1000
# The same totals in binary gibibytes (divide by 1024 three times).
space["GiB"]=space["size"]/1024/1024/1024
```
%% Cell type:code id:90b897a5 tags:
```
# Sum of the size column per user, smallest total first.
space=df.groupby(["uname"], sort=False)["size"].sum().sort_values().to_frame()
# Rebuilding the frame discards any columns added to the previous one, so
# derive the unit columns again here; the report cells select "GB" and "GiB".
space["GB"]=space["size"]/1000/1000/1000
space["GiB"]=space["size"]/1024/1024/1024
```
%% Cell type:code id:5065d98e tags:
```
# Display the per-user size table.
space
```
%% Cell type:code id:1bf8adb7 tags:
```
# Newest access time per user, least recently active first, as a frame.
atime = (
    df.groupby(["uname"], sort=False)["atime"]
    .max()
    .sort_values()
    .to_frame()
)
```
%% Cell type:code id:fb33c1de tags:
```
# The per-user maximum of atime is that user's newest access.
atime = atime.rename(columns={"atime": "newest"})
```
%% Cell type:code id:ce376ec2 tags:
```
# Oldest access time per user. Assigning the Series lets pandas align it to
# atime's uname index, so no sort or to_frame() conversion is needed.
atime["oldest"] = df.groupby(["uname"], sort=False)["atime"].min()
```
%% Cell type:code id:6fd5346a tags:
```
# Truncate the timestamps to midnight so the report shows dates only.
# normalize() does this directly, without formatting to text and re-parsing.
atime["newest_date"] = atime["newest"].dt.normalize()
atime["oldest_date"] = atime["oldest"].dt.normalize()
```
%% Cell type:code id:422ecc0c tags:
```
# Put the access-time and size tables side by side, aligned on their index.
report=pd.concat([atime, space], axis=1)
```
%% Cell type:code id:1c979329 tags:
```
# Show every row when a frame is displayed instead of truncating the output.
pd.options.display.max_rows = None
```
%% Cell type:code id:32139106 tags:
```
# Render floats with thousands separators and two decimals.
# https://stackoverflow.com/a/20937592/8928529
pd.set_option('display.float_format', '{:,.2f}'.format)
```
%% Cell type:code id:8d40d60f tags:
```
# Display only the date and size-unit columns of the report.
report[["newest_date", "oldest_date", "GB", "GiB"]]
```
%% Cell type:code id:871995b1 tags:
```
# Plain-text version of the report: title, per-user table, then grand totals.
print(f"report: {report_name}")
print(report[["newest_date", "oldest_date", "GB", "GiB"]])
# Totals are the column sums over all users.
print("--------\ntotals: {0:,.2f}GB {1:,.2f}GiB".format(report['GB'].sum(), report['GiB'].sum()))
```
%% Cell type:code id:63ee8026 tags:
```
import datetime
import pandas as pd
import matplotlib.pyplot as plt
from urllib.parse import unquote
import sys
# Parse one list chunk into recs, a list of per-file attribute dicts.
# Each line is expected to have the shape
#   "<field> <inode> <field> |key=value|...|key=value| -- <url-quoted path>"
recs = []
count = 0
progress = 0
# NOTE(review): relative path; an earlier cell calls os.chdir(), so confirm
# the working directory before running this cell.
file = "data/list-17075953.list.gather-info.d/list-000"
with open(file) as gpfs_data:
    for line in gpfs_data:
        left, right = unquote(line).split(" -- ", 1)
        fname = right.strip()
        inode, meta = left.split('|', 1)
        _, inode, _ = inode.split()
        # Drop whatever follows the final '|' of the attribute section.
        meta, _ = meta.rsplit('|', 1)
        # maxsplit=1 keeps values that themselves contain '=' intact.
        props = dict(prop.split('=', 1) for prop in meta.split('|'))
        props["inode"] = inode
        # Not needed for the report; tolerate records that lack them.
        for key in ("heat", "pool", "mode", "misc"):
            props.pop(key, None)
        recs.append(props)
        count += 1
        progress += 1
        # Emit a progress line every 20 million records.
        if progress >= 20000000:
            print(f"{datetime.datetime.now()} progress: {count}: {fname}")
            progress = 0
        # Stop once more than 200 million records have been read.
        if count > 200000000:
            break
```
%% Cell type:code id:aa0f2ba7 tags:
```
# One row per parsed record; every column still holds strings at this point.
df = pd.DataFrame(recs)
```
%% Cell type:code id:d6308a64 tags:
```
# Give the three timestamp columns their short stat-style names.
df = df.rename(
    columns={
        'access': 'atime',
        'create': 'ctime',
        'modify': 'mtime',
    }
)
```
%% Cell type:code id:c60d8868 tags:
```
# The parsed values are text: convert the numeric columns to integers ...
for name in ('size', 'kballoc', 'uid', 'gid'):
    df[name] = df[name].astype("int")
# ... and the timestamp columns to datetime64.
for name in ('atime', 'ctime', 'mtime'):
    df[name] = df[name].astype('datetime64[ns]')
```
%% Cell type:code id:4ca6e641 tags:
```
# Most recent access time in this chunk.
print(f"max atime: {df['atime'].max()}\n")
```
%% Cell type:code id:71fd3b6f tags:
```
# Check the converted dtypes and the memory footprint of the frame.
df.info()
```
%% Cell type:code id:cc576380 tags:
```
# Newest access time per uid, in order of first appearance (sort=False).
print(df.groupby(["uid"], sort=False)["atime"].max())
```
#!/bin/bash
# Array-task wrapper: run last-access-per-user.py on the list chunk whose
# three-digit suffix matches SLURM_ARRAY_TASK_ID.

# Zero-pad the task id to three digits (7 -> 007). An unset id falls back to
# 0, which matches what printf produced before when given no argument.
suffix=$(printf '%.3d' "${SLURM_ARRAY_TASK_ID:-0}")

#cd data/list-16144464.list.gather-info.d
#cd data/list-17075953.list.gather-info.d
# Abort if the data directory is missing instead of running in the wrong place.
cd data/list-17094088.list.gather-info.d || exit 1

echo -n "list-$suffix "
../../last-access-per-user.py "list-$suffix"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment