Merge branch 'feat-create-last-access-reports' into main

84b2dfa4 · John-Paul Robinson · 28617e9f · 104dc621 · 84b2dfa4 · 84b2dfa4
Commit 84b2dfa4 authored 2 years ago by John-Paul Robinson
--- a/last-access-per-user.py
+++ b/last-access-per-user.py
+#!/usr/bin/env python
+
+import datetime
+import pandas as pd
+import matplotlib.pyplot as plt
+from urllib.parse import unquote
+import sys
+
+# Usage: last-access-per-user.py <list-file>
+#
+# Parses one list file (named gpfs_data below) into a pandas DataFrame,
+# prints the most recent access time per uid, and pickles the frame to
+# "<list-file>.gz".
+
+# Print a progress line every PROGRESS_INTERVAL records; stop reading once
+# more than MAX_RECORDS records have been collected.
+PROGRESS_INTERVAL = 20000000
+MAX_RECORDS = 200000000
+
+# Attributes that are parsed but never kept in the resulting frame.
+UNUSED_KEYS = ("heat", "pool", "mode", "misc")
+
+if len(sys.argv) < 2:
+    sys.exit(f"usage: {sys.argv[0]} <list-file>")
+
+file = sys.argv[1]
+
+recs = []
+count = 0
+progress = 0
+
+with open(file) as gpfs_data:
+    for line in gpfs_data:
+        # Record layout after URL-unquoting, as implied by the splits below:
+        #   "<field> <inode> <field> |key=value|...|key=value|<tail> -- <path>"
+        left, right = unquote(line).split(" -- ", 1)
+        fname = right.strip()
+        inode, meta = left.split('|', 1)
+        _, inode, _ = inode.split()
+        # Discard whatever follows the last '|' of the attribute section.
+        meta, _ = meta.rsplit('|', 1)
+        # Split on the first '=' only, so a value that itself contains '='
+        # still yields a (key, value) pair instead of breaking dict().
+        props = dict(prop.split('=', 1) for prop in meta.split('|'))
+        props["inode"] = inode
+        for key in UNUSED_KEYS:
+            # pop() tolerates records that lack one of these attributes.
+            props.pop(key, None)
+        recs.append(props)
+        count += 1
+        progress += 1
+        if progress >= PROGRESS_INTERVAL:
+            print(f"{datetime.datetime.now()} progress: {count}: {fname}")
+            progress = 0
+        if count > MAX_RECORDS:
+            break
+
+# An empty input would otherwise fail below with an opaque KeyError on 'size'.
+if not recs:
+    sys.exit(f"{file}: no records found")
+
+df = pd.DataFrame(recs)
+
+df = df.rename(columns={'access': 'atime', 'create': 'ctime', 'modify': 'mtime'})
+
+for intcol in ['size', 'kballoc', 'uid', 'gid']:
+    df[intcol] = df[intcol].astype("int")
+
+for timecol in ['atime', 'ctime', 'mtime']:
+    df[timecol] = df[timecol].astype('datetime64[ns]')
+
+print(df.groupby(["uid"], sort=False)["atime"].max())
+
+# pandas infers gzip compression from the ".gz" suffix.
+df.to_pickle(f"{file}.gz")
--- a/max-access-per-user-merged.ipynb
+++ b/max-access-per-user-merged.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0f3842e0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import datetime\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "from urllib.parse import unquote\n",
+    "import sys\n",
+    "import os\n",
+    "import glob\n",
+    "\n",
+    "recs = []\n",
+    "count=0\n",
+    "progress=0\n",
+    "\n",
+    "report_name = \"temporary-scratch\"\n",
+    "dir = \"data/list-17075953.list.gather-info.d\"\n",
+    "#report_name = \"temporary-scratch\"\n",
+    "#dir = 'data/list-17094088.list.gather-info.d'\n",
+    "\n",
+    "os.chdir(dir)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3b8c0042",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#os.chdir('../..')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b379845f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pwd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d3d9f107",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!which python"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c2a94675",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "frames = []\n",
+    "\n",
+    "# sorted so the concatenation order is the same on every run\n",
+    "pickles = sorted(glob.glob(\"*.gz\"))\n",
+    "if not pickles:\n",
+    "    raise FileNotFoundError(\"no *.gz pickles found in the current directory\")\n",
+    "\n",
+    "for file in pickles:\n",
+    "    print(f\"processing: {file}\")\n",
+    "    # combine pickled dfs into one df\n",
+    "    # https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html\n",
+    "    # NOTE: unpickling can execute arbitrary code; only load trusted files\n",
+    "    df = pd.read_pickle(file)\n",
+    "    frames.append(df)\n",
+    "\n",
+    "df = pd.concat(frames)\n",
+    "del frames"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a6a4e9bf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4ca6e641",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(f\"max atime: {df['atime'].max()}\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cc576380",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.groupby([\"uid\"], sort=False)[\"atime\"].max().sort_values()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1a482ac6",
+   "metadata": {},
+   "source": [
+    "Get user names https://stackoverflow.com/a/421670/8928529"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7c4e531e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pwd\n",
+    "def getuser(uid):\n",
+    "    \"\"\"Return the login name for uid, or str(uid) if it has no passwd entry.\"\"\"\n",
+    "    try:\n",
+    "        return pwd.getpwuid(int(uid)).pw_name\n",
+    "    except KeyError:\n",
+    "        return str(uid)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6980fc30",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# set uname for uid\n",
+    "unames = {}\n",
+    "for uid in sorted(df[\"uid\"].unique()):\n",
+    "    try:\n",
+    "        uname = pwd.getpwuid(int(uid)).pw_name\n",
+    "    except KeyError:\n",
+    "        # uid has no passwd entry (e.g. a removed account); keep the number\n",
+    "        uname = str(uid)\n",
+    "    print(\"uid: {} name: {}\".format(uid, uname))\n",
+    "    unames[uid] = uname\n",
+    "# one vectorized lookup instead of a boolean-mask assignment per uid\n",
+    "df[\"uname\"] = df[\"uid\"].map(unames)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "29af2c16",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "space=df.groupby([\"uname\"], sort=False)[\"size\"].sum().sort_values().to_frame()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "77b361b1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "space[\"GB\"]=space[\"size\"]/1000/1000/1000\n",
+    "space[\"GiB\"]=space[\"size\"]/1024/1024/1024"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "90b897a5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "space=df.groupby([\"uname\"], sort=False)[\"size\"].sum().sort_values().to_frame()\n",
+    "# rebuilding space drops the GB/GiB columns the report cells select, so derive them again\n",
+    "space[\"GB\"]=space[\"size\"]/1000/1000/1000\n",
+    "space[\"GiB\"]=space[\"size\"]/1024/1024/1024"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5065d98e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "space"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1bf8adb7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "atime=df.groupby([\"uname\"], sort=False)[\"atime\"].max().sort_values().to_frame()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fb33c1de",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "atime.rename(columns={\"atime\": \"newest\"}, inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ce376ec2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "atime[\"oldest\"]=df.groupby([\"uname\"], sort=False)[\"atime\"].min().sort_values().to_frame()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6fd5346a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "atime[\"newest_date\"] = pd.to_datetime(atime[\"newest\"].dt.strftime('%Y-%m-%d'))\n",
+    "atime[\"oldest_date\"] = pd.to_datetime(atime[\"oldest\"].dt.strftime('%Y-%m-%d'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "422ecc0c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report=pd.concat([atime, space], axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1c979329",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.set_option('display.max_rows', None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "32139106",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# https://stackoverflow.com/a/20937592/8928529\n",
+    "pd.options.display.float_format = '{:,.2f}'.format"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8d40d60f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report[[\"newest_date\", \"oldest_date\", \"GB\", \"GiB\"]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "871995b1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(f\"report: {report_name}\")\n",
+    "print(report[[\"newest_date\", \"oldest_date\", \"GB\", \"GiB\"]])\n",
+    "print(\"--------\\ntotals: {0:,.2f}GB     {1:,.2f}GiB\".format(report['GB'].sum(), report['GiB'].sum()))"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
+%% Cell type:code id:0f3842e0 tags:
+
+``` 
+import datetime
+import pandas as pd
+import matplotlib.pyplot as plt
+from urllib.parse import unquote
+import sys
+import os
+import glob
+
+recs = []
+count=0
+progress=0
+
+report_name = "temporary-scratch"
+dir = "data/list-17075953.list.gather-info.d"
+#report_name = "temporary-scratch"
+#dir = 'data/list-17094088.list.gather-info.d'
+
+os.chdir(dir)
+```
+
+%% Cell type:code id:3b8c0042 tags:
+
+``` 
+#os.chdir('../..')
+```
+
+%% Cell type:code id:b379845f tags:
+
+``` 
+!pwd
+```
+
+%% Cell type:code id:d3d9f107 tags:
+
+``` 
+!which python
+```
+
+%% Cell type:code id:c2a94675 tags:
+
+``` 
+frames = []
+
+# sorted so the concatenation order is the same on every run
+pickles = sorted(glob.glob("*.gz"))
+if not pickles:
+    raise FileNotFoundError("no *.gz pickles found in the current directory")
+
+for file in pickles:
+    print(f"processing: {file}")
+    # combine pickled dfs into one df
+    # https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html
+    # NOTE: unpickling can execute arbitrary code; only load trusted files
+    df = pd.read_pickle(file)
+    frames.append(df)
+
+df = pd.concat(frames)
+del frames
+```
+
+%% Cell type:code id:a6a4e9bf tags:
+
+``` 
+df.info()
+```
+
+%% Cell type:code id:4ca6e641 tags:
+
+``` 
+print(f"max atime: {df['atime'].max()}\n")
+```
+
+%% Cell type:code id:cc576380 tags:
+
+``` 
+df.groupby(["uid"], sort=False)["atime"].max().sort_values()
+```
+
+%% Cell type:markdown id:1a482ac6 tags:
+
+Get user names https://stackoverflow.com/a/421670/8928529
+
+%% Cell type:code id:7c4e531e tags:
+
+``` 
+import pwd
+def getuser(uid):
+    """Return the login name for uid, or str(uid) if it has no passwd entry."""
+    try:
+        return pwd.getpwuid(int(uid)).pw_name
+    except KeyError:
+        return str(uid)
+```
+
+%% Cell type:code id:6980fc30 tags:
+
+``` 
+# set uname for uid
+unames = {}
+for uid in sorted(df["uid"].unique()):
+    try:
+        uname = pwd.getpwuid(int(uid)).pw_name
+    except KeyError:
+        # uid has no passwd entry (e.g. a removed account); keep the number
+        uname = str(uid)
+    print("uid: {} name: {}".format(uid, uname))
+    unames[uid] = uname
+# one vectorized lookup instead of a boolean-mask assignment per uid
+df["uname"] = df["uid"].map(unames)
+```
+
+%% Cell type:code id:29af2c16 tags:
+
+``` 
+space=df.groupby(["uname"], sort=False)["size"].sum().sort_values().to_frame()
+```
+
+%% Cell type:code id:77b361b1 tags:
+
+``` 
+space["GB"]=space["size"]/1000/1000/1000
+space["GiB"]=space["size"]/1024/1024/1024
+```
+
+%% Cell type:code id:90b897a5 tags:
+
+``` 
+space=df.groupby(["uname"], sort=False)["size"].sum().sort_values().to_frame()
+# rebuilding space drops the GB/GiB columns the report cells select, so derive them again
+space["GB"]=space["size"]/1000/1000/1000
+space["GiB"]=space["size"]/1024/1024/1024
+```
+
+%% Cell type:code id:5065d98e tags:
+
+``` 
+space
+```
+
+%% Cell type:code id:1bf8adb7 tags:
+
+``` 
+atime=df.groupby(["uname"], sort=False)["atime"].max().sort_values().to_frame()
+```
+
+%% Cell type:code id:fb33c1de tags:
+
+``` 
+atime.rename(columns={"atime": "newest"}, inplace=True)
+```
+
+%% Cell type:code id:ce376ec2 tags:
+
+``` 
+atime["oldest"]=df.groupby(["uname"], sort=False)["atime"].min().sort_values().to_frame()
+```
+
+%% Cell type:code id:6fd5346a tags:
+
+``` 
+atime["newest_date"] = pd.to_datetime(atime["newest"].dt.strftime('%Y-%m-%d'))
+atime["oldest_date"] = pd.to_datetime(atime["oldest"].dt.strftime('%Y-%m-%d'))
+```
+
+%% Cell type:code id:422ecc0c tags:
+
+``` 
+report=pd.concat([atime, space], axis=1)
+```
+
+%% Cell type:code id:1c979329 tags:
+
+``` 
+pd.set_option('display.max_rows', None)
+```
+
+%% Cell type:code id:32139106 tags:
+
+``` 
+# https://stackoverflow.com/a/20937592/8928529
+pd.options.display.float_format = '{:,.2f}'.format
+```
+
+%% Cell type:code id:8d40d60f tags:
+
+``` 
+report[["newest_date", "oldest_date", "GB", "GiB"]]
+```
+
+%% Cell type:code id:871995b1 tags:
+
+``` 
+print(f"report: {report_name}")
+print(report[["newest_date", "oldest_date", "GB", "GiB"]])
+print("--------\ntotals: {0:,.2f}GB     {1:,.2f}GiB".format(report['GB'].sum(), report['GiB'].sum()))
+```
--- a/max-access-per-user.ipynb
+++ b/max-access-per-user.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "63ee8026",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import datetime\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "from urllib.parse import unquote\n",
+    "import sys\n",
+    "\n",
+    "# print a progress line every PROGRESS_INTERVAL records; stop after MAX_RECORDS\n",
+    "PROGRESS_INTERVAL = 20000000\n",
+    "MAX_RECORDS = 200000000\n",
+    "# attributes that are parsed but never kept in the resulting frame\n",
+    "UNUSED_KEYS = (\"heat\", \"pool\", \"mode\", \"misc\")\n",
+    "\n",
+    "recs = []\n",
+    "count = 0\n",
+    "progress = 0\n",
+    "\n",
+    "file = \"data/list-17075953.list.gather-info.d/list-000\"\n",
+    "\n",
+    "with open(file) as gpfs_data:\n",
+    "    for line in gpfs_data:\n",
+    "        # layout after unquoting, as implied by the splits below:\n",
+    "        #   \"<field> <inode> <field> |key=value|...|key=value|<tail> -- <path>\"\n",
+    "        left, right = unquote(line).split(\" -- \", 1)\n",
+    "        fname = right.strip()\n",
+    "        inode, meta = left.split('|', 1)\n",
+    "        _, inode, _ = inode.split()\n",
+    "        # discard whatever follows the last '|' of the attribute section\n",
+    "        meta, _ = meta.rsplit('|', 1)\n",
+    "        # split on the first '=' only so values containing '=' stay intact\n",
+    "        props = dict(prop.split('=', 1) for prop in meta.split('|'))\n",
+    "        props[\"inode\"] = inode\n",
+    "        for key in UNUSED_KEYS:\n",
+    "            # pop() tolerates records that lack one of these attributes\n",
+    "            props.pop(key, None)\n",
+    "        recs.append(props)\n",
+    "        count += 1\n",
+    "        progress += 1\n",
+    "        if progress >= PROGRESS_INTERVAL:\n",
+    "            print(f\"{datetime.datetime.now()} progress: {count}: {fname}\")\n",
+    "            progress = 0\n",
+    "        if count > MAX_RECORDS:\n",
+    "            break"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aa0f2ba7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.DataFrame(recs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d6308a64",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = df.rename(columns={'access': 'atime', 'create': 'ctime', 'modify': 'mtime'})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c60d8868",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for intcol in ['size', 'kballoc', 'uid', 'gid']:\n",
+    "    df[intcol] = df[intcol].astype(\"int\")\n",
+    "\n",
+    "for intcol in ['atime', 'ctime', 'mtime']:\n",
+    "    df[intcol] = df[intcol].astype('datetime64[ns]')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4ca6e641",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(f\"max atime: {df['atime'].max()}\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "71fd3b6f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cc576380",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(df.groupby([\"uid\"], sort=False)[\"atime\"].max())"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
+%% Cell type:code id:63ee8026 tags:
+
+``` 
+import datetime
+import pandas as pd
+import matplotlib.pyplot as plt
+from urllib.parse import unquote
+import sys
+
+# print a progress line every PROGRESS_INTERVAL records; stop after MAX_RECORDS
+PROGRESS_INTERVAL = 20000000
+MAX_RECORDS = 200000000
+# attributes that are parsed but never kept in the resulting frame
+UNUSED_KEYS = ("heat", "pool", "mode", "misc")
+
+recs = []
+count = 0
+progress = 0
+
+file = "data/list-17075953.list.gather-info.d/list-000"
+
+with open(file) as gpfs_data:
+    for line in gpfs_data:
+        # layout after unquoting, as implied by the splits below:
+        #   "<field> <inode> <field> |key=value|...|key=value|<tail> -- <path>"
+        left, right = unquote(line).split(" -- ", 1)
+        fname = right.strip()
+        inode, meta = left.split('|', 1)
+        _, inode, _ = inode.split()
+        # discard whatever follows the last '|' of the attribute section
+        meta, _ = meta.rsplit('|', 1)
+        # split on the first '=' only so values containing '=' stay intact
+        props = dict(prop.split('=', 1) for prop in meta.split('|'))
+        props["inode"] = inode
+        for key in UNUSED_KEYS:
+            # pop() tolerates records that lack one of these attributes
+            props.pop(key, None)
+        recs.append(props)
+        count += 1
+        progress += 1
+        if progress >= PROGRESS_INTERVAL:
+            print(f"{datetime.datetime.now()} progress: {count}: {fname}")
+            progress = 0
+        if count > MAX_RECORDS:
+            break
+```
+
+%% Cell type:code id:aa0f2ba7 tags:
+
+``` 
+df = pd.DataFrame(recs)
+```
+
+%% Cell type:code id:d6308a64 tags:
+
+``` 
+df = df.rename(columns={'access': 'atime', 'create': 'ctime', 'modify': 'mtime'})
+```
+
+%% Cell type:code id:c60d8868 tags:
+
+``` 
+for intcol in ['size', 'kballoc', 'uid', 'gid']:
+    df[intcol] = df[intcol].astype("int")
+
+for intcol in ['atime', 'ctime', 'mtime']:
+    df[intcol] = df[intcol].astype('datetime64[ns]')
+```
+
+%% Cell type:code id:4ca6e641 tags:
+
+``` 
+print(f"max atime: {df['atime'].max()}\n")
+```
+
+%% Cell type:code id:71fd3b6f tags:
+
+``` 
+df.info()
+```
+
+%% Cell type:code id:cc576380 tags:
+
+``` 
+print(df.groupby(["uid"], sort=False)["atime"].max())
+```
--- a/run-max-atime-per-user
+++ b/run-max-atime-per-user
+#!/bin/bash
+#
+# Run last-access-per-user.py on the list chunk selected by the Slurm array
+# task index (list-000, list-001, ...).
+
+# Zero-pad the task index to three digits. The explicit :-0 default keeps the
+# previous behaviour of falling back to list-000 when the variable is unset.
+suffix=$(printf '%.3d' "${SLURM_ARRAY_TASK_ID:-0}")
+
+#cd data/list-16144464.list.gather-info.d
+#cd data/list-17075953.list.gather-info.d
+# Stop if the data directory is missing so the relative script path below is
+# never resolved against the wrong directory.
+cd data/list-17094088.list.gather-info.d || exit 1
+
+echo -n "list-$suffix "
+../../last-access-per-user.py "list-$suffix"