diff --git a/last-access-per-user.py b/last-access-per-user.py
new file mode 100755
index 0000000000000000000000000000000000000000..c28167e969abe7dadea3af517d6eb3d7e5160b31
--- /dev/null
+++ b/last-access-per-user.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+"""Report the newest access time per uid found in one list file.
+
+Usage: last-access-per-user.py LISTFILE
+
+Each line of LISTFILE is expected to look like
+
+    <field> <inode> <field> |key=value|key=value|...| -- <url-quoted path>
+
+The records are loaded into a pandas DataFrame, the newest atime per uid is
+printed, and the DataFrame is pickled to LISTFILE.gz.
+"""
+
+import datetime
+import pandas as pd
+import matplotlib.pyplot as plt
+from urllib.parse import unquote
+import sys
+
+# Attributes carried by every record that the report does not need.
+DROPPED_KEYS = ("heat", "pool", "mode", "misc")
+# Print one progress line per this many records.
+PROGRESS_EVERY = 20000000
+# Stop reading once more than this many records have been collected.
+MAX_RECORDS = 200000000
+
+
+def parse_record(line):
+    """Return (attribute dict, file name) parsed from one list-file line."""
+    left, right = unquote(line).split(" -- ", 1)
+    fname = right.strip()
+    head, meta = left.split('|', 1)
+    _, inode, _ = head.split()
+    # Discard whatever follows the last '|' of the attribute section.
+    meta, _ = meta.rsplit('|', 1)
+    # maxsplit=1 keeps a value that itself contains '=' in one piece.
+    props = dict(prop.split('=', 1) for prop in meta.split('|'))
+    props["inode"] = inode
+    for key in DROPPED_KEYS:
+        # A record that lacks one of these attributes is still usable.
+        props.pop(key, None)
+    return props, fname
+
+
+def main(file):
+    """Parse *file*, print max atime per uid and pickle the DataFrame."""
+    recs = []
+    count = 0
+    with open(file) as gpfs_data:
+        for line in gpfs_data:
+            if not line.strip():
+                # Skip blank lines instead of failing to unpack them.
+                continue
+            props, fname = parse_record(line)
+            recs.append(props)
+            count += 1
+            if count % PROGRESS_EVERY == 0:
+                print(f"{datetime.datetime.now()} progress: {count}: {fname}")
+            if count > MAX_RECORDS:
+                break
+
+    if not recs:
+        # An empty frame has none of the columns converted below.
+        sys.exit(f"{file}: no records found")
+
+    df = pd.DataFrame(recs)
+    df = df.rename(columns={'access': 'atime', 'create': 'ctime', 'modify': 'mtime'})
+
+    # Explicit 64-bit width rather than the platform-dependent "int".
+    for intcol in ['size', 'kballoc', 'uid', 'gid']:
+        df[intcol] = df[intcol].astype("int64")
+
+    for timecol in ['atime', 'ctime', 'mtime']:
+        df[timecol] = df[timecol].astype('datetime64[ns]')
+
+    print(df.groupby(["uid"], sort=False)["atime"].max())
+
+    # pandas infers gzip compression from the ".gz" suffix.
+    df.to_pickle(f"{file}.gz")
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        sys.exit(f"usage: {sys.argv[0]} LISTFILE")
+    main(sys.argv[1])
diff --git a/max-access-per-user-merged.ipynb b/max-access-per-user-merged.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..aa4e14a9fba804baba783b8fe14900201c766992
--- /dev/null
+++ b/max-access-per-user-merged.ipynb
@@ -0,0 +1,288 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0f3842e0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import datetime\n",
+    "import pandas as pd\n",
+    "import
matplotlib.pyplot as plt\n", + "from urllib.parse import unquote\n", + "import sys\n", + "import os\n", + "import glob\n", + "\n", + "recs = []\n", + "count=0\n", + "progress=0\n", + "\n", + "report_name = \"temporary-scratch\"\n", + "dir = \"data/list-17075953.list.gather-info.d\"\n", + "#report_name = \"temporary-scratch\"\n", + "#dir = 'data/list-17094088.list.gather-info.d'\n", + "\n", + "os.chdir(dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b8c0042", + "metadata": {}, + "outputs": [], + "source": [ + "#os.chdir('../..')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b379845f", + "metadata": {}, + "outputs": [], + "source": [ + "!pwd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3d9f107", + "metadata": {}, + "outputs": [], + "source": [ + "!which python" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2a94675", + "metadata": {}, + "outputs": [], + "source": [ + "frames = []\n", + "\n", + "for file in glob.glob(\"*.gz\"):\n", + " print(f\"processing: {file}\")\n", + " # combine pickled dfs into one df\n", + " # https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html\n", + " df = pd.read_pickle(file)\n", + " frames.append(df)\n", + "\n", + "df = pd.concat(frames)\n", + "del(frames)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6a4e9bf", + "metadata": {}, + "outputs": [], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ca6e641", + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"max atime: {df['atime'].max()}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc576380", + "metadata": {}, + "outputs": [], + "source": [ + "df.groupby([\"uid\"], sort=False)[\"atime\"].max().sort_values()" + ] + }, + { + "cell_type": "markdown", + "id": "1a482ac6", + "metadata": {}, + "source": [ + "Get user names
https://stackoverflow.com/a/421670/8928529"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7c4e531e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pwd\n",
+    "def getuser(uid):\n",
+    "    return pwd.getpwuid(int(uid))[0].split(\":\")[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6980fc30",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# set uname for uid\n",
+    "for uid in sorted(df[\"uid\"].unique()):\n",
+    "    uname = getuser(uid)\n",
+    "    print(\"uid: {} name: {}\".format(uid, uname))\n",
+    "    df.loc[df[\"uid\"]==uid, [\"uname\"]] = uname"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "29af2c16",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "space=df.groupby([\"uname\"], sort=False)[\"size\"].sum().sort_values().to_frame()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "90b897a5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "space=df.groupby([\"uname\"], sort=False)[\"size\"].sum().sort_values().to_frame()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "77b361b1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "space[\"GB\"]=space[\"size\"]/1000/1000/1000\n",
+    "space[\"GiB\"]=space[\"size\"]/1024/1024/1024"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5065d98e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "space"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1bf8adb7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "atime=df.groupby([\"uname\"], sort=False)[\"atime\"].max().sort_values().to_frame()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fb33c1de",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "atime.rename(columns={\"atime\": \"newest\"}, inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ce376ec2",
+   "metadata": {},
+   "outputs": [],
+
"source": [ + "atime[\"oldest\"]=df.groupby([\"uname\"], sort=False)[\"atime\"].min().sort_values().to_frame()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6fd5346a", + "metadata": {}, + "outputs": [], + "source": [ + "atime[\"newest_date\"] = pd.to_datetime(atime[\"newest\"].dt.strftime('%Y-%m-%d'))\n", + "atime[\"oldest_date\"] = pd.to_datetime(atime[\"oldest\"].dt.strftime('%Y-%m-%d'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "422ecc0c", + "metadata": {}, + "outputs": [], + "source": [ + "report=pd.concat([atime, space], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c979329", + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option('display.max_rows', None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32139106", + "metadata": {}, + "outputs": [], + "source": [ + "# https://stackoverflow.com/a/20937592/8928529\n", + "pd.options.display.float_format = '{:,.2f}'.format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d40d60f", + "metadata": {}, + "outputs": [], + "source": [ + "report[[\"newest_date\", \"oldest_date\", \"GB\", \"GiB\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "871995b1", + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"report: {report_name}\")\n", + "print(report[[\"newest_date\", \"oldest_date\", \"GB\", \"GiB\"]])\n", + "print(\"--------\\ntotals: {0:,.2f}GB {1:,.2f}GiB\".format(report['GB'].sum(), report['GiB'].sum()))" + ] + } + ], + "metadata": { + "language_info": { + "name": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/max-access-per-user.ipynb b/max-access-per-user.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..8424774058f4359aa0938867834e03e1b250673a --- /dev/null +++ b/max-access-per-user.ipynb @@ -0,0 +1,124 @@ +{ + "cells": [ + { + "cell_type": "code", 
+   "execution_count": null,
+   "id": "63ee8026",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import datetime\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "from urllib.parse import unquote\n",
+    "import sys\n",
+    "\n",
+    "recs = []\n",
+    "count=0\n",
+    "progress=0\n",
+    "\n",
+    "file = \"data/list-17075953.list.gather-info.d/list-000\"\n",
+    "\n",
+    "with open(file) as gpfs_data:\n",
+    "    for line in gpfs_data:\n",
+    "        #print(unquote(line))\n",
+    "        left, right = unquote(line).split(\" -- \", 1)\n",
+    "        fname = right.strip()\n",
+    "        inode, meta = left.split('|', 1)\n",
+    "        _, inode, _ = inode.split()\n",
+    "        #print(meta)\n",
+    "        meta, _ = meta.rsplit('|', 1)\n",
+    "        #print(meta)\n",
+    "        props = []\n",
+    "        for prop in meta.split('|'):\n",
+    "            props.append(prop.split('=', 1))\n",
+    "        #props.append(['path', fname])\n",
+    "        #print(props)\n",
+    "        props = dict(props)\n",
+    "        props[\"inode\"] = inode\n",
+    "        for key in [\"heat\", \"pool\", \"mode\", \"misc\"]:\n",
+    "            props.pop(key, None)\n",
+    "        recs.append(props)\n",
+    "        count += 1\n",
+    "        progress += 1\n",
+    "        if (progress // 20000000):\n",
+    "            print(f\"{datetime.datetime.now()} progress: {count}: {fname}\")\n",
+    "            progress = 0\n",
+    "        if (count > 200000000):\n",
+    "            break"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aa0f2ba7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.DataFrame(recs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d6308a64",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = df.rename(columns={'access': 'atime', 'create': 'ctime', 'modify': 'mtime'})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c60d8868",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for intcol in ['size', 'kballoc', 'uid', 'gid']:\n",
+    "    df[intcol] = df[intcol].astype(\"int64\")\n",
+    "\n",
+    "for intcol in ['atime', 'ctime', 'mtime']:\n",
+    "    df[intcol] = df[intcol].astype('datetime64[ns]')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4ca6e641",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(f\"max atime: {df['atime'].max()}\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "71fd3b6f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cc576380",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(df.groupby([\"uid\"], sort=False)[\"atime\"].max())"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/run-max-atime-per-user b/run-max-atime-per-user
new file mode 100755
index 0000000000000000000000000000000000000000..812698e6805c4f9d5ed1b19ff68e322c41655196
--- /dev/null
+++ b/run-max-atime-per-user
@@ -0,0 +1,14 @@
+#!/bin/bash
+# Slurm array task: run last-access-per-user.py on the list-NNN file whose
+# number is this task's SLURM_ARRAY_TASK_ID.
+
+# Abort when the id is unset or empty; printf would otherwise format it as 000.
+suffix=$(printf %.3d "${SLURM_ARRAY_TASK_ID:?must be run as a Slurm array task}")
+
+#cd data/list-16144464.list.gather-info.d
+#cd data/list-17075953.list.gather-info.d
+# Stop if the data directory is missing so the relative path below stays valid.
+cd data/list-17094088.list.gather-info.d || exit 1
+
+echo -n "list-$suffix "
+../../last-access-per-user.py "list-$suffix"