{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "63ee8026",
"metadata": {},
"outputs": [],
"source": [
"import datetime\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from urllib.parse import unquote\n",
"import sys\n",
"\n",
"recs = []\n",
"count=0\n",
"progress=0\n",
"\n",
"file = \"data/list-17075953.list.gather-info.d/list-000\"\n",
"\n",
"with open(file) as gpfs_data:\n",
" for line in gpfs_data:\n",
" #print(unquote(line))\n",
" left, right = unquote(line).split(\" -- \", 1)\n",
" fname = right.strip()\n",
" inode, meta = left.split('|', 1)\n",
" _, inode, _ = inode.split() \n",
" #print(meta)\n",
" meta, _ = meta.rsplit('|', 1)\n",
" #print(meta)\n",
" props = []\n",
" for prop in meta.split('|'):\n",
" props.append(prop.split('='))\n",
" #props.append(['path', fname])\n",
" #print(props)\n",
" props = dict(props)\n",
" props[\"inode\"] = inode\n",
" for key in [\"heat\", \"pool\", \"mode\", \"misc\"]:\n",
" del props[key] \n",
" recs.append(props)\n",
" count += 1\n",
" progress += 1\n",
" if (progress // 20000000):\n",
" print(f\"{datetime.datetime.now()} progress: {count}: {fname}\")\n",
" progress = 0\n",
" if (count > 200000000):\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa0f2ba7",
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(recs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d6308a64",
"metadata": {},
"outputs": [],
"source": [
"df = df.rename(columns={'access': 'atime', 'create': 'ctime', 'modify': 'mtime'})"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c60d8868",
"metadata": {},
"outputs": [],
"source": [
"for intcol in ['size', 'kballoc', 'uid', 'gid']:\n",
" df[intcol] = df[intcol].astype(\"int\")\n",
"\n",
"for intcol in ['atime', 'ctime', 'mtime']:\n",
" df[intcol] = df[intcol].astype('datetime64[ns]')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4ca6e641",
"metadata": {},
"outputs": [],
"source": [
"print(f\"max atime: {df['atime'].max()}\\n\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "71fd3b6f",
"metadata": {},
"outputs": [],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cc576380",
"metadata": {},
"outputs": [],
"source": [
"print(df.groupby([\"uid\"], sort=False)[\"atime\"].max())"
]
}
],
"metadata": {
"language_info": {
"name": "python",
"pygments_lexer": "ipython3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}