#!/usr/bin/env python
"""Convert a policy-scan list file into a gzip-pickled pandas DataFrame.

Usage: pass the list file as the first command-line argument.

Each input line is expected to look like

    <tok> <inode> <tok> |key=value|key=value|...| -- <percent-encoded path>

(presumably a GPFS policy list file, judging by the original handle name
``gpfs_data`` -- confirm against the policy that produces it).  The script
prints the newest access time per uid and writes the frame to
``<input>.gz``.
"""
import datetime
import sys
from urllib.parse import unquote

import pandas as pd

try:
    # plt is not referenced anywhere in this script, so a missing matplotlib
    # must not prevent the conversion from running.
    import matplotlib.pyplot as plt  # noqa: F401
except ImportError:
    plt = None

# Attributes that are parsed but not carried into the DataFrame.
DROPPED_KEYS = ("heat", "pool", "mode", "misc")
# Scan attribute name -> DataFrame column name.
COLUMN_RENAMES = {'access': 'atime', 'create': 'ctime', 'modify': 'mtime'}
INT_COLUMNS = ("size", "kballoc", "uid", "gid")
TIME_COLUMNS = ("atime", "ctime", "mtime")
# A progress line is printed each time this many records have been parsed.
PROGRESS_EVERY = 20000000
# Parsing stops as soon as the record count exceeds this value.
RECORD_LIMIT = 200000000


def parse_record(line):
    """Parse one list-file line into ``(attributes, path)``.

    The line is percent-decoded, split at the first ``" -- "`` into a
    metadata part and a path, and the ``|``-separated ``key=value`` pairs
    are collected into a dict of strings.  The keys in ``DROPPED_KEYS`` are
    removed when present and an ``"inode"`` entry is added.

    Raises:
        ValueError: if the line does not have the expected layout.
    """
    left, right = unquote(line).split(" -- ", 1)
    head, meta = left.split('|', 1)
    # NOTE(review): the *second* of the three leading tokens is used as the
    # inode.  GPFS list files usually start with inode, generation, snapshot
    # id -- confirm that the second token really is the inode number.
    _, inode, _ = head.split()
    # Discard whatever follows the final '|' (empty for a well-formed line).
    meta, _ = meta.rsplit('|', 1)

    attrs = {}
    for field in meta.split('|'):
        # Split on the first '=' only so values may themselves contain '='.
        key, sep, value = field.partition('=')
        if not sep:
            raise ValueError(f"malformed attribute {field!r} in record {line!r}")
        attrs[key] = value
    attrs["inode"] = inode
    for key in DROPPED_KEYS:
        # Tolerate scans that did not emit one of the discarded attributes.
        attrs.pop(key, None)
    return attrs, right.strip()


def load_records(lines, limit=RECORD_LIMIT, progress_every=PROGRESS_EVERY):
    """Parse an iterable of list-file lines into a list of attribute dicts.

    Whitespace-only lines are skipped.  A timestamped progress message is
    printed every ``progress_every`` records, and parsing stops once more
    than ``limit`` records have been collected.
    """
    records = []
    for line in lines:
        if not line.strip():
            continue
        attrs, fname = parse_record(line)
        records.append(attrs)
        count = len(records)
        if count % progress_every == 0:
            print(f"{datetime.datetime.now()} progress: {count}: {fname}")
        if count > limit:
            break
    return records


def build_frame(records):
    """Build the typed DataFrame from a non-empty list of parsed records."""
    df = pd.DataFrame(records).rename(columns=COLUMN_RENAMES)
    dtypes = {col: "int64" for col in INT_COLUMNS}
    dtypes.update({col: "datetime64[ns]" for col in TIME_COLUMNS})
    return df.astype(dtypes)


def main(argv=None):
    """Parse the file named by ``argv[1]``, report per-uid atime, pickle it."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        prog = argv[0] if argv else "script"
        sys.exit(f"usage: {prog} LIST_FILE")
    file = argv[1]

    with open(file) as gpfs_data:
        recs = load_records(gpfs_data)
    if not recs:
        sys.exit(f"{file}: no records found")

    df = build_frame(recs)
    # The raw dicts are no longer needed; drop them before pickling.
    del recs

    print(df.groupby(["uid"], sort=False)["atime"].max())
    # pandas infers gzip compression from the ".gz" suffix.
    df.to_pickle(f"{file}.gz")


if __name__ == "__main__":
    main()