Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/usr/bin/env python
import datetime
import pandas as pd
import matplotlib.pyplot as plt
from urllib.parse import unquote
import sys
# Attributes removed from every record before it is stored.
DROPPED_KEYS = ("heat", "pool", "mode", "misc")
# Listing attribute names mapped to the column names used in the DataFrame.
COLUMN_RENAMES = {'access': 'atime', 'create': 'ctime', 'modify': 'mtime'}
# Columns (named after the rename) converted to integers / timestamps.
INT_COLUMNS = ('size', 'kballoc', 'uid', 'gid')
TIME_COLUMNS = ('atime', 'ctime', 'mtime')
# A progress line is printed every PROGRESS_INTERVAL records.
PROGRESS_INTERVAL = 20000000
# Reading stops once the record count exceeds MAX_RECORDS.
MAX_RECORDS = 200000000


def parse_record(line):
    """Parse one listing line into ``(props, fname)``.

    After percent-decoding, the line is expected to look like::

        <tok> <inode> <tok> |key=value|key=value|...|<tail> -- <path>

    The text before the first ``|`` must consist of exactly three
    whitespace-separated tokens, the second of which is kept as ``inode``.
    Whatever follows the last ``|`` of the left-hand side is discarded.
    NOTE(review): presumably this is a GPFS policy LIST output written with
    percent escaping -- confirm against the job that produces the file.

    Returns:
        props: dict of attribute name -> string value, plus ``"inode"``,
            with the keys in DROPPED_KEYS removed.
        fname: the decoded path with surrounding whitespace stripped.

    Raises:
        ValueError: if the line does not have the layout described above.
    """
    # The whole line is decoded before splitting (as the original did):
    # the delimiters themselves may arrive percent-encoded.
    left, right = unquote(line).split(" -- ", 1)
    fname = right.strip()
    ident, meta = left.split('|', 1)
    _, inode, _ = ident.split()
    meta, _ = meta.rsplit('|', 1)
    # maxsplit=1 keeps values that themselves contain '=' intact; without
    # it dict() rejects the resulting 3-element list.
    props = dict(prop.split('=', 1) for prop in meta.split('|'))
    props["inode"] = inode
    for key in DROPPED_KEYS:
        # pop() instead of del: a listing without one of these attributes
        # should not abort the scan.
        props.pop(key, None)
    return props, fname


def load_records(lines, progress_every=PROGRESS_INTERVAL, limit=MAX_RECORDS):
    """Parse an iterable of listing lines into a list of property dicts.

    Blank lines are skipped. A progress line is printed each time another
    ``progress_every`` records have been parsed. Reading stops as soon as
    the number of parsed records exceeds ``limit``, so at most
    ``limit + 1`` records are returned (this matches the original cutoff).
    """
    recs = []
    count = 0
    progress = 0
    for line in lines:
        if not line.strip():
            continue
        props, fname = parse_record(line)
        recs.append(props)
        count += 1
        progress += 1
        if progress >= progress_every:
            print(f"{datetime.datetime.now()} progress: {count}: {fname}")
            progress = 0
        if count > limit:
            break
    return recs


def build_frame(recs):
    """Build the typed DataFrame from the parsed records.

    Renames access/create/modify to atime/ctime/mtime, converts the columns
    in INT_COLUMNS to integers and the columns in TIME_COLUMNS to
    ``datetime64[ns]``.
    """
    df = pd.DataFrame(recs)
    df = df.rename(columns=COLUMN_RENAMES)
    for intcol in INT_COLUMNS:
        df[intcol] = df[intcol].astype("int")
    for timecol in TIME_COLUMNS:
        df[timecol] = df[timecol].astype('datetime64[ns]')
    return df


def main():
    """Read the listing named on the command line, report, and pickle it.

    Prints the latest access time per uid and writes the DataFrame to
    ``<input>.gz``.
    """
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <listing-file>")
    file = sys.argv[1]
    with open(file) as gpfs_data:
        recs = load_records(gpfs_data)
    if not recs:
        # Without any record the expected columns do not exist and the
        # conversions in build_frame() would fail with an opaque KeyError.
        sys.exit(f"{file}: no records found")
    df = build_frame(recs)
    print(df.groupby(["uid"], sort=False)["atime"].max())
    # The output is a pickle; pandas infers gzip compression from the
    # ".gz" suffix.
    df.to_pickle(f"{file}.gz")


if __name__ == "__main__":
    main()