From 2b954a9cf38ea9d7f8a18073b24668461894d17e Mon Sep 17 00:00:00 2001
From: John-Paul Robinson <jpr@uab.edu>
Date: Fri, 2 Dec 2022 10:48:10 -0600
Subject: [PATCH] Exploratory notebook to generate logic for report

Figure out logic to parse list files and produce atime summaries.
---
 max-access-per-user.ipynb | 124 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 124 insertions(+)
 create mode 100644 max-access-per-user.ipynb

diff --git a/max-access-per-user.ipynb b/max-access-per-user.ipynb
new file mode 100644
index 0000000..8424774
--- /dev/null
+++ b/max-access-per-user.ipynb
@@ -0,0 +1,124 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "63ee8026",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import datetime\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "from urllib.parse import unquote\n",
+    "import sys\n",
+    "\n",
+    "# Parse the list file into one dict of properties per record (collected in recs).\n",
+    "recs = []\n",
+    "count = 0\n",
+    "progress = 0\n",
+    "\n",
+    "file = \"data/list-17075953.list.gather-info.d/list-000\"\n",
+    "\n",
+    "with open(file) as gpfs_data:\n",
+    "    for line in gpfs_data:\n",
+    "        # Expected record shape: '<a> <inode> <b> |key=value|...|<tail> -- <url-quoted path>'\n",
+    "        left, right = unquote(line).split(\" -- \", 1)\n",
+    "        # fname is only used in the progress message below.\n",
+    "        fname = right.strip()\n",
+    "        inode, meta = left.split('|', 1)\n",
+    "        _, inode, _ = inode.split()\n",
+    "        # Drop whatever follows the last '|'; only the key=value fields are kept.\n",
+    "        meta, _ = meta.rsplit('|', 1)\n",
+    "        # Split on the first '=' only, so a value containing '=' still yields a pair.\n",
+    "        props = dict(prop.split('=', 1) for prop in meta.split('|'))\n",
+    "        props[\"inode\"] = inode\n",
+    "        # Discard fields the report does not use; tolerate records that lack them.\n",
+    "        for key in [\"heat\", \"pool\", \"mode\", \"misc\"]:\n",
+    "            props.pop(key, None)\n",
+    "        recs.append(props)\n",
+    "        count += 1\n",
+    "        progress += 1\n",
+    "        # Print a timestamped progress line every 20 million records.\n",
+    "        if progress >= 20000000:\n",
+    "            print(f\"{datetime.datetime.now()} progress: {count}: {fname}\")\n",
+    "            progress = 0\n",
+    "        # Safety cap: stop once more than 200 million records have been read.\n",
+    "        if count > 200000000:\n",
+    "            break"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aa0f2ba7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.DataFrame(recs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d6308a64",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = df.rename(columns={'access': 'atime', 'create': 'ctime', 'modify': 'mtime'})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c60d8868",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for intcol in ['size', 'kballoc', 'uid', 'gid']:  # parsed values arrive as strings\n",
+    "    df[intcol] = df[intcol].astype(\"int64\")  # explicit 64-bit: plain 'int' is 32-bit on some platforms\n",
+    "# Timestamp columns (renamed from access/create/modify) become datetime64 so max() compares times.\n",
+    "for timecol in ['atime', 'ctime', 'mtime']:\n",
+    "    df[timecol] = df[timecol].astype('datetime64[ns]')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4ca6e641",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(f\"max atime: {df['atime'].max()}\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "71fd3b6f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cc576380",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(df.groupby([\"uid\"], sort=False)[\"atime\"].max())"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
-- 
GitLab