From 2b954a9cf38ea9d7f8a18073b24668461894d17e Mon Sep 17 00:00:00 2001
From: John-Paul Robinson <jpr@uab.edu>
Date: Fri, 2 Dec 2022 10:48:10 -0600
Subject: [PATCH] Exploratory notebook to generate logic for report

Figure out logic to parse list files and produce atime summaries.
---
 max-access-per-user.ipynb | 128 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 128 insertions(+)
 create mode 100644 max-access-per-user.ipynb

diff --git a/max-access-per-user.ipynb b/max-access-per-user.ipynb
new file mode 100644
index 0000000..8424774
--- /dev/null
+++ b/max-access-per-user.ipynb
@@ -0,0 +1,128 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "63ee8026",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import datetime\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "from urllib.parse import unquote\n",
+    "import sys\n",
+    "\n",
+    "# Parse one GPFS list file into recs: one dict of attributes per record.\n",
+    "# The parsing below expects every record to be laid out as\n",
+    "#   <field> <inode> <field> |key=value|key=value|...|<ignored> -- <path>\n",
+    "recs = []\n",
+    "count = 0\n",
+    "progress = 0\n",
+    "\n",
+    "PROGRESS_INTERVAL = 20000000  # print a progress line after this many records\n",
+    "MAX_RECORDS = 200000000  # stop reading once this many records are held\n",
+    "UNUSED_KEYS = [\"heat\", \"pool\", \"mode\", \"misc\"]  # attributes dropped from each record\n",
+    "\n",
+    "file = \"data/list-17075953.list.gather-info.d/list-000\"\n",
+    "\n",
+    "with open(file) as gpfs_data:\n",
+    "    for line in gpfs_data:\n",
+    "        if not line.strip():\n",
+    "            continue  # a blank line would otherwise fail the unpacking below\n",
+    "        # Percent-decode the whole record, then cut the path off at the first ' -- '.\n",
+    "        left, right = unquote(line).split(\" -- \", 1)\n",
+    "        fname = right.strip()\n",
+    "        inode, meta = left.split('|', 1)\n",
+    "        _, inode, _ = inode.split()\n",
+    "        # Whatever follows the last '|' is not a key=value item; discard it.\n",
+    "        meta, _ = meta.rsplit('|', 1)\n",
+    "        # maxsplit=1 keeps a value that itself contains '=' in one piece.\n",
+    "        props = dict(prop.split('=', 1) for prop in meta.split('|'))\n",
+    "        props[\"inode\"] = inode\n",
+    "        for key in UNUSED_KEYS:\n",
+    "            props.pop(key, None)  # an absent attribute must not abort the load\n",
+    "        recs.append(props)\n",
+    "        count += 1\n",
+    "        progress += 1\n",
+    "        if progress >= PROGRESS_INTERVAL:\n",
+    "            print(f\"{datetime.datetime.now()} progress: {count}: {fname}\")\n",
+    "            progress = 0\n",
+    "        if count >= MAX_RECORDS:\n",
+    "            break"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aa0f2ba7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.DataFrame(recs)  # one row per parsed record"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d6308a64",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = df.rename(columns={'access': 'atime', 'create': 'ctime', 'modify': 'mtime'})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c60d8868",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for intcol in ['size', 'kballoc', 'uid', 'gid']:\n",
+    "    df[intcol] = df[intcol].astype(\"int64\")  # explicit width: plain int is platform-dependent\n",
+    "\n",
+    "for timecol in ['atime', 'ctime', 'mtime']:\n",
+    "    df[timecol] = df[timecol].astype('datetime64[ns]')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4ca6e641",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(f\"max atime: {df['atime'].max()}\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "71fd3b6f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cc576380",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(df.groupby([\"uid\"], sort=False)[\"atime\"].max())  # latest atime for each uid"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
-- 
GitLab