# Skip to content
# Snippets Groups Projects
# last-access-per-user.py 1.41 KiB
# Newer Older
#!/usr/bin/env python

import datetime
import pandas as pd
import matplotlib.pyplot as plt
from urllib.parse import unquote
import sys

# Metadata keys that are parsed from every line but not kept in the records.
DROPPED_FIELDS = ("heat", "pool", "mode", "misc")
# A progress line is printed each time this many records have been read.
PROGRESS_INTERVAL = 20000000
# Reading stops as soon as the record count exceeds this value.
MAX_RECORDS = 200000000


def parse_record(line):
    """Parse one line of the listing into ``(record, fname)``.

    After percent-decoding, the code expects the line to look like::

        <tok> <tok> <tok> |key=value|key=value|...|<ignored> -- <path>

    Args:
        line: One raw (percent-encoded) line of the input file.

    Returns:
        A ``(record, fname)`` tuple. ``record`` maps every ``key`` to its
        ``value`` (both strings), has an extra ``"inode"`` entry, and has the
        keys in ``DROPPED_FIELDS`` removed. ``fname`` is the stripped text
        that follows the first ``" -- "``.

    Raises:
        ValueError: If the line does not match the layout above.
        KeyError: If one of ``DROPPED_FIELDS`` is missing from the line.
    """
    # NOTE(review): decoding happens before splitting, so a percent-encoded
    # '|' or ' -- ' inside a field value would be treated as a delimiter --
    # confirm the producer never emits those.
    left, right = unquote(line).split(" -- ", 1)
    fname = right.strip()

    header, meta = left.split("|", 1)
    # NOTE(review): the header must hold exactly three whitespace-separated
    # tokens and the middle one is stored as "inode" -- confirm this matches
    # the column order of the listing being parsed.
    _, inode, _ = header.split()

    # Whatever follows the last '|' is discarded.
    meta, _ = meta.rsplit("|", 1)

    # Split on the first '=' only, so a value that itself contains '=' stays
    # intact instead of making dict() fail on a 3-element sequence.
    record = dict(prop.split("=", 1) for prop in meta.split("|"))
    record["inode"] = inode
    for key in DROPPED_FIELDS:
        del record[key]
    return record, fname


def load_records(path):
    """Read *path* line by line and return the list of parsed records.

    Prints a timestamped progress line every ``PROGRESS_INTERVAL`` records
    and stops reading once more than ``MAX_RECORDS`` records were collected.
    """
    records = []
    count = 0
    with open(path) as gpfs_data:
        for line in gpfs_data:
            record, fname = parse_record(line)
            records.append(record)
            count += 1
            if count % PROGRESS_INTERVAL == 0:
                print(f"{datetime.datetime.now()} progress: {count}: {fname}")
            if count > MAX_RECORDS:
                break
    return records


def build_frame(records):
    """Turn parsed records into a typed ``pandas.DataFrame``.

    The ``access``/``create``/``modify`` columns are renamed to
    ``atime``/``ctime``/``mtime`` and cast to ``datetime64[ns]``; ``size``,
    ``kballoc``, ``uid`` and ``gid`` are cast to integers.
    """
    df = pd.DataFrame(records)
    df = df.rename(columns={'access': 'atime', 'create': 'ctime', 'modify': 'mtime'})

    for intcol in ['size', 'kballoc', 'uid', 'gid']:
        df[intcol] = df[intcol].astype("int")

    for timecol in ['atime', 'ctime', 'mtime']:
        df[timecol] = df[timecol].astype('datetime64[ns]')

    return df


def main(argv=None):
    """Report the latest access time per uid and pickle the full table.

    Args:
        argv: Argument vector; defaults to ``sys.argv``. ``argv[1]`` is the
            path of the listing file to read.
    """
    args = sys.argv if argv is None else argv
    path = args[1]

    # Passing the list straight through lets it be released once the
    # DataFrame has been built.
    df = build_frame(load_records(path))

    print(df.groupby(["uid"], sort=False)["atime"].max())

    # The output is written next to the input, with ".gz" appended.
    df.to_pickle(f"{path}.gz")


if __name__ == "__main__":
    main()