added documentation

26c21463 · Ryan Randles Jones · 384ff64c · 26c21463
Commit 26c21463 authored 4 years ago by Ryan Randles Jones
--- a/Runtime-and-CoreCount.ipynb
+++ b/Runtime-and-CoreCount.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Notebook Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# must run\n",
+    "\n",
+    "import sqlite3\n",
+    "import slurm2sql\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "%matplotlib inline\n",
+    "import seaborn as sns\n",
+    "import seaborn as sb\n",
+    "import plotly.express as px\n",
+    "import matplotlib.ticker as ticker\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# must run\n",
+    "\n",
+    "from RC_styles import rc_styles as style"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# must run\n",
+    "\n",
+    "from sklearn.cluster import KMeans"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# must run\n",
+    "\n",
+    "# connects to the sqlite3 database of info from March 2020\n",
+    "db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# must run\n",
+    "\n",
+    "# df is the starting dataframe: every row and column of the slurm table\n",
+    "df = pd.read_sql('SELECT * FROM slurm', db)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# voluntary\n",
+    "\n",
+    "# for displaying all available column options\n",
+    "pd.set_option('display.max_columns', None)\n",
+    "df.head(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# must run\n",
+    "\n",
+    "# converts units in ReqMemCPU column from bytes to gigs\n",
+    "df['ReqMemCPU'] = df['ReqMemCPU'].div(1024**3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# must run\n",
+    "\n",
+    "# converts Elapsed time to hours (from seconds)\n",
+    "df['Elapsed'] = df['Elapsed'].div(3600)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# must run\n",
+    "\n",
+    "# df_completed is dataframe of all completed jobs\n",
+    "df_completed = df[df.State.str.contains('COMPLETED')]\n",
+    "#df_completed.head(5)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# ReqMemCPU,Corecount,Runtime"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# must run\n",
+    "\n",
+    "# sets min and max parameters for ReqMemCPU\n",
+    "UpperlimitGB = 50\n",
+    "LowerlimitGB = 0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# must run\n",
+    "\n",
+    "# sets min and max parameters for AllocCPUS\n",
+    "UpperlimitAllocCPU = 20\n",
+    "LowerlimitAllocCPU = 0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# must run\n",
+    "\n",
+    "# dataset of needed columns for all graphs below\n",
+    "df_1 = df_completed.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']]\n",
+    "df_1.head(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# must run\n",
+    "\n",
+    "# rounds ReqMemCPU up to nearest whole number\n",
+    "df_1['ReqMemCPU'] = df_1['ReqMemCPU'].apply(np.ceil)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# must run\n",
+    "\n",
+    "# rounds Elapsed to 2 decimal places\n",
+    "df_1['Elapsed'] = df_1['Elapsed'].round(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# must run\n",
+    "\n",
+    "# sorts dataset by AllocCPUS for easy visualization\n",
+    "df_1_sorted = df_1.sort_values(by='AllocCPUS', ascending=True)\n",
+    "df_1_sorted.head(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# must run\n",
+    "\n",
+    "# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the min and max parameters created above\n",
+    "df_runtime = df_1_sorted[(df_1_sorted['ReqMemCPU'] <= UpperlimitGB) & (df_1_sorted['ReqMemCPU'] >= LowerlimitGB) & (df_1_sorted['AllocCPUS'] <= UpperlimitAllocCPU) & (df_1_sorted['AllocCPUS'] >= LowerlimitAllocCPU)]\n",
+    "df_runtime.head(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# must run\n",
+    "\n",
+    "# creates a facet grid from df_runtime dataset\n",
+    "# Elapsed time in hours and ReqMemCPU in gigs\n",
+    "style.default_axes_and_ticks()\n",
+    "style.figsize()\n",
+    "\n",
+    "full_facet = sb.PairGrid(df_runtime)\n",
+    "full_facet.map(plt.scatter);\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "style.default_axes_and_ticks()\n",
+    "style.figsize()\n",
+    "\n",
+    "runtime_graph = sns.scatterplot(x=\"ReqMemCPU\", y=\"AllocCPUS\",data=df_runtime)\n",
+    "\n",
+    "plt.title('Number of Cores used by Requested RAM %i gigs or less'%UpperlimitGB)\n",
+    "\n",
+    "plt.xlabel('ReqMemCPU(gigs)')\n",
+    "plt.ylabel('AllocCPUS')\n",
+    "#plt.yscale(\"log\")\n",
+    "\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# must run\n",
+    "\n",
+    "# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the min and max parameters created above for clustering\n",
+    "df_runtime_cluster = df_1_sorted[(df_1_sorted['ReqMemCPU'] <= UpperlimitGB) & (df_1_sorted['ReqMemCPU'] >= LowerlimitGB) & (df_1_sorted['AllocCPUS'] <= UpperlimitAllocCPU) & (df_1_sorted['AllocCPUS'] >= LowerlimitAllocCPU)]\n",
+    "df_runtime_cluster.tail(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# must run\n",
+    "\n",
+    "# collects the KMeans inertia (sum of squared distances) for k = 1..9 on the df_runtime_cluster dataset\n",
+    "# random_state is fixed (same seed as the KMeans fit later in this notebook) so the elbow curve is reproducible\n",
+    "Sum_of_squared_distances = []\n",
+    "K = range(1,10)\n",
+    "for k in K:\n",
+    "    km = KMeans(n_clusters=k, random_state=111).fit(df_runtime_cluster)\n",
+    "    Sum_of_squared_distances.append(km.inertia_)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# must run\n",
+    "\n",
+    "# the bend in the graph is the optimal number of clusters for graphs using the df_runtime_cluster dataset\n",
+    "plt.plot(K, Sum_of_squared_distances, 'bx-')\n",
+    "plt.xlabel('k')\n",
+    "plt.ylabel('Sum_of_squared_distances')\n",
+    "plt.title('Elbow Method For Optimal k')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# must run\n",
+    "\n",
+    "# fits KMeans with 3 clusters and prints the cluster centers\n",
+    "kmeans = KMeans(n_clusters=3, random_state=111)\n",
+    "kmeans.fit(df_runtime_cluster)\n",
+    "print(kmeans.cluster_centers_)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# must run\n",
+    "\n",
+    "# facet grid of the two graphs being clustered using df_runtime_cluster dataset\n",
+    "style.default_axes_and_ticks()\n",
+    "style.figsize()\n",
+    "\n",
+    "reqmem_alloc = sns.PairGrid(df_runtime_cluster, y_vars=[\"Elapsed\"], x_vars=[\"ReqMemCPU\", \"AllocCPUS\"], height=4)\n",
+    "reqmem_alloc.map(sns.regplot, color=\"blue\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# must run\n",
+    "\n",
+    "# clustered graph\n",
+    "style.default_axes_and_ticks()\n",
+    "style.figsize()\n",
+    "\n",
+    "runtime_cluster_graph = plt.scatter(df_runtime_cluster['ReqMemCPU'],df_runtime_cluster['Elapsed'], c=kmeans.labels_, cmap='rainbow')\n",
+    "plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='black')\n",
+    "\n",
+    "plt.xlabel('ReqMemCPU(gigs)')\n",
+    "plt.ylabel('Elapsed(hours)')\n",
+    "plt.title('Runtime per Requested gigs of RAM %i gigs or less'%UpperlimitGB)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# must run\n",
+    "\n",
+    "# clustered graph: Elapsed against AllocCPUS, colored by KMeans cluster label\n",
+    "style.default_axes_and_ticks()\n",
+    "style.figsize()\n",
+    "\n",
+    "# cluster_centers_ columns follow the column order of the frame KMeans was fit on,\n",
+    "# so look the positions up by name (column 0 is ReqMemCPU, not AllocCPUS)\n",
+    "alloc_col = df_runtime_cluster.columns.get_loc('AllocCPUS')\n",
+    "elapsed_col = df_runtime_cluster.columns.get_loc('Elapsed')\n",
+    "\n",
+    "alloc_cluster_graph = plt.scatter(df_runtime_cluster['AllocCPUS'],df_runtime_cluster['Elapsed'], c=kmeans.labels_, cmap='rainbow')\n",
+    "plt.scatter(kmeans.cluster_centers_[:,alloc_col] ,kmeans.cluster_centers_[:,elapsed_col], color='black')\n",
+    "\n",
+    "plt.xlabel('AllocCPUS')\n",
+    "plt.ylabel('Elapsed(hours)')\n",
+    "plt.title('Runtime per Core %i cores or less'%UpperlimitAllocCPU)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
+%% Cell type:markdown id: tags:
+
+# Notebook Setup
+
+%% Cell type:code id: tags:
+
+``` 
+# must run
+
+import sqlite3
+import slurm2sql
+import pandas as pd
+import matplotlib.pyplot as plt
+%matplotlib inline
+import seaborn as sns
+import seaborn as sb
+import plotly.express as px
+import matplotlib.ticker as ticker
+import numpy as np
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# must run
+
+from RC_styles import rc_styles as style
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# must run
+
+from sklearn.cluster import KMeans
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# must run
+
+# creates database of info from March 2020 using sqlite 3
+db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# must run
+
+# df is starting database
+df = pd.read_sql('SELECT * FROM slurm', db)
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# voluntary
+
+# for displaying all available column options
+pd.set_option('display.max_columns', None)
+df.head(5)
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# must run
+
+# converts units in ReqMemCPU column from bytes to gigs
+df['ReqMemCPU'] = df['ReqMemCPU'].div(1024**3)
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# must run
+
+# converts Elapsed time to hours (from seconds)
+df['Elapsed'] = df['Elapsed'].div(3600)
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# must run
+
+# df_completed is dataframe of all completed jobs
+df_completed = df[df.State.str.contains('COMPLETED')]
+#df_completed.head(5)
+```
+
+%% Cell type:markdown id: tags:
+
+# ReqMemCPU,Corecount,Runtime
+
+%% Cell type:code id: tags:
+
+``` 
+# must run
+
+# sets min and max parameters for ReqMemCPU
+UpperlimitGB = 50
+LowerlimitGB = 0
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# must run
+
+# sets min and max parameters for AllocCPUS
+UpperlimitAllocCPU = 20
+LowerlimitAllocCPU = 0
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# must run
+
+# dataset of needed columns for all graphs below
+df_1 = df_completed.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']]
+df_1.head(5)
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# must run
+
+# rounds ReqMemCPU up to nearest whole number
+df_1['ReqMemCPU'] = df_1['ReqMemCPU'].apply(np.ceil)
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# must run
+
+# rounds Elapsed up to nearest 2 decimal places
+df_1['Elapsed'] = df_1['Elapsed'].round(2)
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# must run
+
+# sorts dataset by AllocCPUS for easy visualization
+df_1_sorted = df_1.sort_values(by='AllocCPUS', ascending=True)
+df_1_sorted.head(5)
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# must run
+
+# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the min and max parameters created above
+df_runtime = df_1_sorted[(df_1_sorted['ReqMemCPU'] <= UpperlimitGB) & (df_1_sorted['ReqMemCPU'] >= LowerlimitGB) & (df_1_sorted['AllocCPUS'] <= UpperlimitAllocCPU) & (df_1_sorted['AllocCPUS'] >= LowerlimitAllocCPU)]
+df_runtime.head(5)
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# must run
+
+# creates a facet grid from df_runtime dataset
+# Elapsed time in hours and ReqMemCPU in gigs
+style.default_axes_and_ticks()
+style.figsize()
+
+full_facet = sb.PairGrid(df_runtime)
+full_facet.map(plt.scatter);
+plt.show()
+```
+
+%% Cell type:code id: tags:
+
+``` 
+
+style.default_axes_and_ticks()
+style.figsize()
+
+runtime_graph = sns.scatterplot(x="ReqMemCPU", y="AllocCPUS",data=df_runtime)
+
+plt.title('Number of Cores used by Requested RAM %i gigs or less'%UpperlimitGB)
+
+plt.xlabel('ReqMemCPU(gigs)')
+plt.ylabel('AllocCPUS')
+#plt.yscale("log")
+
+plt.show()
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# must run
+
+# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the min and max parameters created above for clustering
+df_runtime_cluster = df_1_sorted[(df_1_sorted['ReqMemCPU'] <= UpperlimitGB) & (df_1_sorted['ReqMemCPU'] >= LowerlimitGB) & (df_1_sorted['AllocCPUS'] <= UpperlimitAllocCPU) & (df_1_sorted['AllocCPUS'] >= LowerlimitAllocCPU)]
+df_runtime_cluster.tail(5)
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# must run
+
+# sets up info for plotting the optimal number of clusters - uses df_runtime_cluster datasaet
+Sum_of_squared_distances = []
+K = range(1,10)
+for k in K:
+    km = KMeans(n_clusters=k)
+    km = km.fit(df_runtime_cluster)
+    Sum_of_squared_distances.append(km.inertia_)
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# must run
+
+# the bend in the graph is the optimal number of clusters for graphs using the df_runtime_cluster dataset
+plt.plot(K, Sum_of_squared_distances, 'bx-')
+plt.xlabel('k')
+plt.ylabel('Sum_of_squared_distances')
+plt.title('Elbow Method For Optimal k')
+plt.show()
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# must run
+
+# sets to clusters and returns the cluster points
+kmeans = KMeans(n_clusters=3, random_state=111)
+kmeans.fit(df_runtime_cluster)
+print(kmeans.cluster_centers_)
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# must run
+
+# facet grid of the two graphs being clustered using df_runtime_cluster dataset
+style.default_axes_and_ticks()
+style.figsize()
+
+reqmem_alloc = sns.PairGrid(df_runtime_cluster, y_vars=["Elapsed"], x_vars=["ReqMemCPU", "AllocCPUS"], height=4)
+reqmem_alloc.map(sns.regplot, color="blue")
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# must run
+
+# clustered graph
+style.default_axes_and_ticks()
+style.figsize()
+
+runtime_cluster_graph = plt.scatter(df_runtime_cluster['ReqMemCPU'],df_runtime_cluster['Elapsed'], c=kmeans.labels_, cmap='rainbow')
+plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='black')
+
+plt.xlabel('ReqMemCPU(gigs)')
+plt.ylabel('Elapsed(hours)')
+plt.title('Runtime per Requested gigs of RAM %i gigs or less'%UpperlimitGB)
+plt.show()
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# must run
+
+# clustered graph
+style.default_axes_and_ticks()
+style.figsize()
+
+alloc_cluster_graph = plt.scatter(df_runtime_cluster['AllocCPUS'],df_runtime_cluster['Elapsed'], c=kmeans.labels_, cmap='rainbow')
+plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='black')
+
+plt.xlabel('AllocCPUS')
+plt.ylabel('Elapsed(hours)')
+plt.title('Runtime per Core %i cores or less'%UpperlimitAllocCPU)
+plt.show()
+```
+
+%% Cell type:code id: tags:
+
+``` 
+```