From df8ade2c7c8d408dc4dccc8c0ce1856982d655f9 Mon Sep 17 00:00:00 2001
From: John-Paul Robinson <jpr@uab.edu>
Date: Fri, 2 Dec 2022 10:52:32 -0600
Subject: [PATCH] Create per-user last atime report generator and wrapper batch script

The report atime generator reads a provided input and generates a per-user
atime report. The wrapper sbatch allows running the script in an array job
to support scaling across large data sets split into many files.
---
Note: the file bodies below were revised after the original commit, so the
abbreviated blob ids on the "index" lines no longer match the contents.

 last-access-per-user.py | 82 +++++++++++++++++++++++++++++++++++++++++
 run-max-atime-per-user  | 14 +++++++
 2 files changed, 96 insertions(+)
 create mode 100755 last-access-per-user.py
 create mode 100755 run-max-atime-per-user

diff --git a/last-access-per-user.py b/last-access-per-user.py
new file mode 100755
index 0000000..c28167e
--- /dev/null
+++ b/last-access-per-user.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+"""Report the most recent access time (atime) per user from a file list.
+
+Usage: last-access-per-user.py LISTFILE
+
+Each line of LISTFILE is URL-quoted.  Once unquoted it must have the form
+
+    FIELD INODE FIELD |key=value|key=value|...|REST -- PATH
+
+where the key=value attributes include access, create, modify, size,
+kballoc, uid and gid.  The per-uid maximum atime is printed to stdout and
+the parsed records are saved with DataFrame.to_pickle to LISTFILE.gz.
+"""
+
+import datetime
+import sys
+from urllib.parse import unquote
+
+import matplotlib.pyplot as plt
+import pandas as pd
+
+# Attributes present in the input that the report does not use.
+DROP_KEYS = ("heat", "pool", "mode", "misc")
+# Emit a progress line each time this many records have been read.
+PROGRESS_EVERY = 20000000
+# Stop reading once the record count exceeds this limit.
+MAX_RECORDS = 200000000
+
+
+def parse_line(line):
+    """Parse one URL-quoted list line into (attribute dict, path).
+
+    Raises ValueError if the line does not match the expected layout.
+    """
+    left, right = unquote(line).split(" -- ", 1)
+    head, meta = left.split('|', 1)
+    _, inode, _ = head.split()
+    # Discard whatever follows the last '|' of the attribute section.
+    meta, _ = meta.rsplit('|', 1)
+    # Split on the first '=' only so a value containing '=' stays intact.
+    props = dict(prop.split('=', 1) for prop in meta.split('|'))
+    props["inode"] = inode
+    for key in DROP_KEYS:
+        # pop() tolerates records that lack one of the unused attributes.
+        props.pop(key, None)
+    return props, right.strip()
+
+
+def main(argv):
+    """Build the per-user atime report for the list file named in argv[1]."""
+    if len(argv) < 2:
+        sys.exit(f"usage: {argv[0]} LISTFILE")
+    file = argv[1]
+
+    recs = []
+    with open(file) as gpfs_data:
+        for count, line in enumerate(gpfs_data, 1):
+            props, fname = parse_line(line)
+            recs.append(props)
+            if count % PROGRESS_EVERY == 0:
+                print(f"{datetime.datetime.now()} progress: {count}: {fname}")
+            if count > MAX_RECORDS:
+                break
+    if not recs:
+        # Without records the DataFrame has no columns to convert below.
+        sys.exit(f"{file}: no records found")
+
+    df = pd.DataFrame(recs)
+    df = df.rename(columns={'access': 'atime', 'create': 'ctime', 'modify': 'mtime'})
+
+    for intcol in ['size', 'kballoc', 'uid', 'gid']:
+        df[intcol] = df[intcol].astype("int")
+    for timecol in ['atime', 'ctime', 'mtime']:
+        df[timecol] = df[timecol].astype('datetime64[ns]')
+
+    print(df.groupby(["uid"], sort=False)["atime"].max())
+
+    df.to_pickle(f"{file}.gz")
+
+
+if __name__ == "__main__":
+    main(sys.argv)
diff --git a/run-max-atime-per-user b/run-max-atime-per-user
new file mode 100755
index 0000000..812698e
--- /dev/null
+++ b/run-max-atime-per-user
@@ -0,0 +1,14 @@
+#!/bin/bash
+#
+# Slurm array-job wrapper: run last-access-per-user.py on the list file
+# whose three-digit suffix matches this task's SLURM_ARRAY_TASK_ID.
+
+suffix=$(printf %.3d "${SLURM_ARRAY_TASK_ID:-0}")
+
+#cd data/list-16144464.list.gather-info.d
+#cd data/list-17075953.list.gather-info.d
+# Stop if the data directory is missing rather than run in the wrong place.
+cd data/list-17094088.list.gather-info.d || exit 1
+
+echo -n "list-$suffix "
+../../last-access-per-user.py "list-$suffix"
-- 
GitLab