From 104dc621a00a9229c01f929a780d72fa9dda80a1 Mon Sep 17 00:00:00 2001 From: John-Paul Robinson <jpr@uab.edu> Date: Fri, 2 Dec 2022 10:57:27 -0600 Subject: [PATCH] Create notebook to merge multiple atime reports into single report Takes a directory of atime reports generated by array job and merges into a single per-user report. --- max-access-per-user-merged.ipynb | 288 +++++++++++++++++++++++++++++++ 1 file changed, 288 insertions(+) create mode 100644 max-access-per-user-merged.ipynb diff --git a/max-access-per-user-merged.ipynb b/max-access-per-user-merged.ipynb new file mode 100644 index 0000000..aa4e14a --- /dev/null +++ b/max-access-per-user-merged.ipynb @@ -0,0 +1,288 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "0f3842e0", + "metadata": {}, + "outputs": [], + "source": [ + "import datetime\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from urllib.parse import unquote\n", + "import sys\n", + "import os\n", + "import glob\n", + "\n", + "recs = []\n", + "count=0\n", + "progress=0\n", + "\n", + "report_name = \"temporary-scratch\"\n", + "data_dir = \"data/list-17075953.list.gather-info.d\"  # named data_dir so the builtin dir() is not shadowed\n", + "#report_name = \"temporary-scratch\"\n", + "#data_dir = 'data/list-17094088.list.gather-info.d'\n", + "\n", + "os.chdir(data_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b8c0042", + "metadata": {}, + "outputs": [], + "source": [ + "#os.chdir('../..')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b379845f", + "metadata": {}, + "outputs": [], + "source": [ + "!pwd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3d9f107", + "metadata": {}, + "outputs": [], + "source": [ + "!which python" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2a94675", + "metadata": {}, + "outputs": [], + "source": [ + "frames = []\n", + "\n", + "for file in glob.glob(\"*.gz\"):\n", + " print(f\"processing: {file}\")\n",
+ " # combine pickled dfs into one df (read_pickle can execute code on load: only read trusted reports)\n", + " # https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html\n", + " df = pd.read_pickle(file)\n", + " frames.append(df)\n", + "\n", + "df = pd.concat(frames)\n", + "del frames" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6a4e9bf", + "metadata": {}, + "outputs": [], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ca6e641", + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"max atime: {df['atime'].max()}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc576380", + "metadata": {}, + "outputs": [], + "source": [ + "df.groupby([\"uid\"], sort=False)[\"atime\"].max().sort_values()" + ] + }, + { + "cell_type": "markdown", + "id": "1a482ac6", + "metadata": {}, + "source": [ + "Get user names https://stackoverflow.com/a/421670/8928529" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c4e531e", + "metadata": {}, + "outputs": [], + "source": [ + "import pwd\n", + "def getuser(uid):  # resolve a numeric uid to its login name via the passwd database\n", + " return pwd.getpwuid(int(uid)).pw_name" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6980fc30", + "metadata": {}, + "outputs": [], + "source": [ + "# set uname for uid\n", + "for uid in sorted(df[\"uid\"].unique()):\n", + " uname = getuser(uid)\n", + " print(\"uid: {} name: {}\".format(uid, uname))\n", + " df.loc[df[\"uid\"]==uid, [\"uname\"]] = uname" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29af2c16", + "metadata": {}, + "outputs": [], + "source": [ + "space=df.groupby([\"uname\"], sort=False)[\"size\"].sum().sort_values().to_frame()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77b361b1", + "metadata": {}, + "outputs": [], + "source": [ + "space[\"GB\"]=space[\"size\"]/1000/1000/1000\n", + "space[\"GiB\"]=space[\"size\"]/1024/1024/1024" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "90b897a5", + "metadata": {}, + "outputs": [], + "source": [ + "space=df.groupby([\"uname\"], sort=False)[\"size\"].sum().sort_values().to_frame().assign(GB=lambda s: s[\"size\"]/1000/1000/1000, GiB=lambda s: s[\"size\"]/1024/1024/1024)  # rebuilt frame must keep GB/GiB: the report cells below select them" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5065d98e", + "metadata": {}, + "outputs": [], + "source": [ + "space" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1bf8adb7", + "metadata": {}, + "outputs": [], + "source": [ + "atime=df.groupby([\"uname\"], sort=False)[\"atime\"].max().sort_values().to_frame()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb33c1de", + "metadata": {}, + "outputs": [], + "source": [ + "atime.rename(columns={\"atime\": \"newest\"}, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce376ec2", + "metadata": {}, + "outputs": [], + "source": [ + "atime[\"oldest\"]=df.groupby([\"uname\"], sort=False)[\"atime\"].min()  # Series is aligned to atime's uname index on assignment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6fd5346a", + "metadata": {}, + "outputs": [], + "source": [ + "atime[\"newest_date\"] = pd.to_datetime(atime[\"newest\"].dt.strftime('%Y-%m-%d'))\n", + "atime[\"oldest_date\"] = pd.to_datetime(atime[\"oldest\"].dt.strftime('%Y-%m-%d'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "422ecc0c", + "metadata": {}, + "outputs": [], + "source": [ + "report=pd.concat([atime, space], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c979329", + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option('display.max_rows', None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32139106", + "metadata": {}, + "outputs": [], + "source": [ + "# https://stackoverflow.com/a/20937592/8928529\n", + "pd.options.display.float_format = '{:,.2f}'.format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d40d60f", + "metadata": {}, + "outputs": [], + 
"source": [ + "report[[\"newest_date\", \"oldest_date\", \"GB\", \"GiB\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "871995b1", + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"report: {report_name}\")\n", + "print(report[[\"newest_date\", \"oldest_date\", \"GB\", \"GiB\"]])\n", + "print(\"--------\\ntotals: {0:,.2f}GB {1:,.2f}GiB\".format(report['GB'].sum(), report['GiB'].sum()))" + ] + } + ], + "metadata": { + "language_info": { + "name": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} -- GitLab