From df8ade2c7c8d408dc4dccc8c0ce1856982d655f9 Mon Sep 17 00:00:00 2001
From: John-Paul Robinson <jpr@uab.edu>
Date: Fri, 2 Dec 2022 10:52:32 -0600
Subject: [PATCH] Create per-user last atime report generator and wrapper batch
 script

The atime report generator reads the input file given on the command
line and generates a per-user last-atime report.
The sbatch wrapper runs the script as an array job
so that large data sets split into many files can be processed at scale.
---
 last-access-per-user.py | 55 +++++++++++++++++++++++++++++++++++++++++
 run-max-atime-per-user  | 10 ++++++++
 2 files changed, 65 insertions(+)
 create mode 100755 last-access-per-user.py
 create mode 100755 run-max-atime-per-user

diff --git a/last-access-per-user.py b/last-access-per-user.py
new file mode 100755
index 0000000..c28167e
--- /dev/null
+++ b/last-access-per-user.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+
+import datetime
+import pandas as pd
+import matplotlib.pyplot as plt
+from urllib.parse import unquote
+import sys
+
+recs = []
+count=0
+progress=0
+
+file = sys.argv[1]
+
+with open(file) as gpfs_data:
+    for line in gpfs_data:
+        #print(unquote(line))
+        left, right = unquote(line).split(" -- ", 1)
+        fname = right.strip()
+        inode, meta = left.split('|', 1)
+        _, inode, _ = inode.split() 
+        #print(meta)
+        meta, _  = meta.rsplit('|', 1)
+        #print(meta)
+        props = []
+        for prop in meta.split('|'):
+            props.append(prop.split('='))
+        #props.append(['path', fname])
+        #print(props)
+        props = dict(props)
+        props["inode"] = inode
+        for key in ["heat", "pool", "mode", "misc"]:
+            del props[key] 
+        recs.append(props)
+        count += 1
+        progress += 1
+        if (progress // 20000000):
+            print(f"{datetime.datetime.now()} progress: {count}: {fname}")
+            progress = 0
+        if (count > 200000000):
+            break
+
+df = pd.DataFrame(recs)
+
+df = df.rename(columns={'access': 'atime', 'create': 'ctime', 'modify': 'mtime'})
+
+for intcol in ['size', 'kballoc', 'uid', 'gid']:
+    df[intcol] = df[intcol].astype("int")
+
+for intcol in ['atime', 'ctime', 'mtime']:
+    df[intcol] = df[intcol].astype('datetime64[ns]')
+
+print(df.groupby(["uid"], sort=False)["atime"].max())
+
+df.to_pickle(f"{file}.gz")
diff --git a/run-max-atime-per-user b/run-max-atime-per-user
new file mode 100755
index 0000000..812698e
--- /dev/null
+++ b/run-max-atime-per-user
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+suffix=`printf %.3d $SLURM_ARRAY_TASK_ID`
+
+#cd data/list-16144464.list.gather-info.d
+#cd data/list-17075953.list.gather-info.d
+cd data/list-17094088.list.gather-info.d
+
+echo -n "list-$suffix "
+../../last-access-per-user.py list-$suffix
-- 
GitLab