max-access-per-user-merged.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f3842e0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from urllib.parse import unquote\n",
    "import sys\n",
    "import os\n",
    "import glob\n",
    "\n",
    "# Report configuration: exactly one data set is active at a time.\n",
    "# Previously used alternatives (swap in by hand):\n",
    "#   report_name  = \"temporary-scratch\"\n",
    "#   data_dir     = \"data/list-17075953.list.gather-info.d\"\n",
    "#   data_dir     = \"data/list-17094088.list.gather-info.d\"\n",
    "#   glob_pattern = \"*.gz\"\n",
    "#   data_dir     = \"data/list-16144464.list.gather-info.d\"  (old-scratch)\n",
    "report_name = \"old-scratch\"\n",
    "data_dir = \"data/list-policy_old-scratch_2022-09-15/pickles\"  # named data_dir so the builtin dir() stays usable\n",
    "glob_pattern = \"list-*.gz\"\n",
    "\n",
    "# data_dir is relative to the directory the kernel started in. Remember that\n",
    "# directory the first time this cell runs so that re-running the cell does not\n",
    "# try to chdir relative to the data directory it already moved into.\n",
    "if \"NOTEBOOK_ROOT\" not in globals():\n",
    "    NOTEBOOK_ROOT = os.getcwd()\n",
    "os.chdir(os.path.join(NOTEBOOK_ROOT, data_dir))"
   ]
  },
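  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3b8c0042",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Manual undo for the os.chdir() in the setup cell; uncomment to use.\n",
    "# NOTE: '../..' only climbs two levels, but the currently configured data\n",
    "# directory is three levels deep, so adjust the path before relying on it.\n",
    "#os.chdir('../..')"
   ]
  },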
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b379845f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: print the current working directory (the setup cell chdir()s into the data directory).\n",
    "!pwd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3d9f107",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show which `python` the shell resolves on PATH.\n",
    "# NOTE(review): this runs in a subshell and may differ from the kernel's own interpreter (sys.executable).\n",
    "!which python"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2a94675",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load every pickled DataFrame matching glob_pattern in the current directory\n",
    "# and merge them into a single frame, df.\n",
    "# NOTE: pd.read_pickle can execute arbitrary code while unpickling; only load\n",
    "# files that were produced by a trusted process.\n",
    "frames = []\n",
    "for file in sorted(glob.glob(glob_pattern)):  # sorted: glob order is arbitrary\n",
    "    print(f\"processing: {file}\")\n",
    "    frames.append(pd.read_pickle(file))\n",
    "\n",
    "# Combine the pickled dfs in one concat call; concatenating inside the loop\n",
    "# re-copies every previously loaded row for each new file.\n",
    "# https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html\n",
    "# pd.concat([]) raises ValueError, so fall back to an empty frame when no file matched.\n",
    "df = pd.concat(frames) if frames else pd.DataFrame()\n",
    "del frames  # drop the extra references to the per-file frames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6a4e9bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summarise the merged frame (columns, dtypes, non-null counts, memory usage).\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4ca6e641",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Largest value of the \"access\" column over all loaded files.\n",
    "# NOTE(review): later cells aggregate an \"atime\" column instead of \"access\";\n",
    "# confirm that the pickled frames really carry both columns.\n",
    "newest_access = df[\"access\"].max()\n",
    "print(f\"max atime: {newest_access}\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc576380",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Maximum \"access\" value per uid, displayed in ascending order.\n",
    "latest_by_uid = df.groupby([\"uid\"], sort=False)[\"access\"].max()\n",
    "latest_by_uid.sort_values()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1a482ac6",
   "metadata": {},
   "source": [
    "Look up the user name for each numeric uid (approach from https://stackoverflow.com/a/421670/8928529)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c4e531e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pwd\n",
    "\n",
    "\n",
    "def getuser(uid):\n",
    "    \"\"\"Return the login name for a numeric uid.\n",
    "\n",
    "    Args:\n",
    "        uid: Anything int() accepts (int, numpy integer, numeric string).\n",
    "\n",
    "    Returns:\n",
    "        The passwd login name, or the uid as a decimal string when the\n",
    "        passwd database has no entry for it (pwd.getpwuid raises KeyError).\n",
    "    \"\"\"\n",
    "    uid = int(uid)\n",
    "    try:\n",
    "        return pwd.getpwuid(uid).pw_name\n",
    "    except KeyError:\n",
    "        # Unknown uid: keep the report running and show the number instead.\n",
    "        return str(uid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6980fc30",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add a \"uname\" column holding the login name for each row's uid.\n",
    "# Each distinct uid is resolved once and the column is then filled in a single\n",
    "# map() pass, instead of building a full-frame boolean mask per uid.\n",
    "unames = {}\n",
    "for uid in sorted(df[\"uid\"].unique()):\n",
    "    try:\n",
    "        unames[uid] = getuser(uid)\n",
    "    except KeyError:\n",
    "        # uid has no passwd entry: fall back to the numeric id so the\n",
    "        # report still covers that user's files.\n",
    "        unames[uid] = str(int(uid))\n",
    "    print(\"uid: {} name: {}\".format(uid, unames[uid]))\n",
    "df[\"uname\"] = df[\"uid\"].map(unames)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29af2c16",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sum of the \"size\" column per user, smallest total first, as a one-column frame.\n",
    "size_by_user = df.groupby([\"uname\"], sort=False)[\"size\"].sum()\n",
    "space = size_by_user.sort_values().to_frame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "77b361b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Express each total in decimal (GB, powers of 1000) and binary (GiB, powers of 1024) units.\n",
    "# Presumably \"size\" is in bytes; verify against the code that wrote the pickles.\n",
    "space = space.assign(\n",
    "    GB=space[\"size\"] / 1000 / 1000 / 1000,\n",
    "    GiB=space[\"size\"] / 1024 / 1024 / 1024,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "90b897a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rebuild the per-user totals together with their GB/GiB columns.\n",
    "# Recomputing only \"size\" here would discard the GB/GiB columns that the\n",
    "# report cells select, making them fail with KeyError on Run All.\n",
    "space = df.groupby([\"uname\"], sort=False)[\"size\"].sum().sort_values().to_frame()\n",
    "space[\"GB\"] = space[\"size\"] / 1000 / 1000 / 1000\n",
    "space[\"GiB\"] = space[\"size\"] / 1024 / 1024 / 1024"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5065d98e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the per-user size table.\n",
    "space"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1bf8adb7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Largest \"atime\" per user, in ascending order, as a one-column frame.\n",
    "newest_by_user = df.groupby([\"uname\"], sort=False)[\"atime\"].max()\n",
    "atime = newest_by_user.sort_values().to_frame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fb33c1de",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The column holds each user's maximum atime, so call it \"newest\".\n",
    "atime = atime.rename(columns={\"atime\": \"newest\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce376ec2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Smallest \"atime\" per user, stored next to \"newest\".\n",
    "# Assign the Series directly: column assignment aligns on the uname index,\n",
    "# so sorting first or wrapping the result in a DataFrame adds nothing.\n",
    "atime[\"oldest\"] = df.groupby([\"uname\"], sort=False)[\"atime\"].min()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6fd5346a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reduce each timestamp to its calendar day by formatting it as YYYY-MM-DD\n",
    "# and parsing that text back into a datetime.\n",
    "for bound in (\"newest\", \"oldest\"):\n",
    "    day_text = atime[bound].dt.strftime('%Y-%m-%d')\n",
    "    atime[f\"{bound}_date\"] = pd.to_datetime(day_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "422ecc0c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Put the access-time columns and the size columns side by side, aligned on user name.\n",
    "report_parts = [atime, space]\n",
    "report = pd.concat(report_parts, axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1c979329",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Never truncate displayed frames: the report should list every user.\n",
    "pd.options.display.max_rows = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "32139106",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show floats with thousands separators and two decimals.\n",
    "# https://stackoverflow.com/a/20937592/8928529\n",
    "pd.set_option(\"display.float_format\", \"{:,.2f}\".format)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d40d60f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rich display of the report, limited to the date and size columns.\n",
    "report_columns = [\"newest_date\", \"oldest_date\", \"GB\", \"GiB\"]\n",
    "report[report_columns]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "871995b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plain-text version of the report (for pasting elsewhere), followed by the grand totals.\n",
    "summary = report[[\"newest_date\", \"oldest_date\", \"GB\", \"GiB\"]]\n",
    "total_gb = report[\"GB\"].sum()\n",
    "total_gib = report[\"GiB\"].sum()\n",
    "\n",
    "print(f\"report: {report_name}\")\n",
    "print(summary)\n",
    "print(\"--------\\ntotals: {0:,.2f}GB     {1:,.2f}GiB\".format(total_gb, total_gib))"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}