From 8ca3de4d25619613fcdff18e138dfdf351b89215 Mon Sep 17 00:00:00 2001 From: Ryan Randles Jones <rrand11@login001.cm.cluster> Date: Wed, 30 Sep 2020 11:04:02 -0500 Subject: [PATCH] finalized graphs with normalization options --- Runtime-and-CoreCount.ipynb | 312 ++++++++++++++++++++++++++++++++---- 1 file changed, 283 insertions(+), 29 deletions(-) diff --git a/Runtime-and-CoreCount.ipynb b/Runtime-and-CoreCount.ipynb index 8448720..5c421e1 100644 --- a/Runtime-and-CoreCount.ipynb +++ b/Runtime-and-CoreCount.ipynb @@ -7,6 +7,16 @@ "# Notebook Setup" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# year-date-month\n", + "#start_date = '2020-10-09'" + ] + }, { "cell_type": "code", "execution_count": null, @@ -25,7 +35,8 @@ "import plotly.express as px\n", "import matplotlib.ticker as ticker\n", "import numpy as np\n", - "from mpl_toolkits.mplot3d import Axes3D" + "from mpl_toolkits.mplot3d import Axes3D\n", + "import os" ] }, { @@ -50,6 +61,17 @@ "from sklearn.cluster import KMeans" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#connecting to database\n", + "#db = sqlite3.connect('runtime_and_core_count.db')\n", + "#print(db)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -59,7 +81,29 @@ "# must run\n", "\n", "# creates database of info from March 2020 using sqlite 3\n", - "db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')" + "db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')\n", + "#print(db)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#slurm2sql.slurm2sql(db, ['-S 2020-09-08 -E 2020-09-15 -a --allocations -o Job,Submit,Start,End'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " #creating a database based on the start date\n", + 
"#slurm2sql.slurm2sql(db, ['-S', '2020-01-09', '-a'])\n", + "#print(db)\n", + "#print(start_date)" ] }, { @@ -71,7 +115,21 @@ "# must run\n", "\n", "# df is starting database\n", - "df = pd.read_sql('SELECT * FROM slurm', db)" + "df = pd.read_sql('SELECT * FROM slurm', db)\n", + "#df = pd.read_sql('SELECT JobID,Submit,Start,End FROM slurm', db)\n", + "print(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " #Deleting the database\n", + "#os.remove('runtime_and_core_count.db')\n", + "#os.remove('runtime_and_core_count.db-shm')\n", + "#os.remove('runtime_and_core_count.db-wal') " ] }, { @@ -84,7 +142,7 @@ "\n", "# for displaying all available column options\n", "pd.set_option('display.max_columns', None)\n", - "df.head(5)" + "df.count()" ] }, { @@ -316,16 +374,49 @@ "metadata": {}, "outputs": [], "source": [ - "#must run\n", + "#must run if dataset will not be normalized for both Elapsed/ReqMem and Elapsed/Alloc graphs\n", "\n", "#ReqMemCPU = 0 - 50 gigs\n", "#AllocCPUS = 0 - 50 cores\n", "#Elapsed = 0 - 150.02 hours\n", "\n", + "# data set without normalization fitting for both the Elapsed/ReqMem and Elapsed/Alloc graphs\n", "df_runtime_cluster = df_facet.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']]\n", "df_runtime_cluster.head(5)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run if dataset will be 0-1 normalized for both Elapsed/ReqMem and Elapsed/Alloc graphs\n", + "\n", + "# 0-1 normalized dataset\n", + "# used for 0-1 normalization fitting for both the Elapsed/ReqMem and Elapsed/Alloc graphs \n", + "column_maxes_runtime = df_runtime_cluster.max()\n", + "df_runtime_cluster_max = column_maxes_runtime.max()\n", + "normalized_runtime_df = df_runtime_cluster / df_runtime_cluster_max\n", + "\n", + "print(normalized_runtime_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [ + "# must run if dataset will be log10 normalized for both Elapsed/ReqMem and Elapsed/Alloc graphs\n", + "\n", + "# log10 normalized dataset\n", + "# used for log10 normalization fitting for both the Elapsed/ReqMem and Elapsed/Alloc graphs \n", + "\n", + "log_runtime_df = np.log10(df_runtime_cluster+1)\n", + "log_runtime_df.describe()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -360,31 +451,41 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "column_maxes_runtime = df_runtime_cluster.max()\n", - "df_runtime_cluster_max = column_maxes_runtime.max()\n", - "normalized_runtime_df = df_runtime_cluster / df_runtime_cluster_max\n", - "\n", - "print(normalized_runtime_df)" + "# Elapsed/ReqMemCPU clustering" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Elapsed/ReqMemCPU clustering" + "The next 5 cells create the clusters, find each cluster label, and create datasets of data in each cluster.\n", + "All the datasets are created for both the cluster graphs and plots of each cluster before those graphs are made." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The next 5 cells create the clusters, find each cluster label, and create datasets of data in each cluster.\n", - "All the datasets are created for both the cluster graphs and plots of each cluster before those graphs are made." 
+ "# In the cell below, set the fit based on the normalization type by uncommenting the line to run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# uncomment for no normalization\n", + "#elapsed_reqmem_fit = df_runtime_cluster\n", + "\n", + "# uncomment for 0-1 normalization\n", + "#elapsed_reqmem_fit = normalized_runtime_df\n", + "\n", + "# uncomment for log10 normalization\n", + "elapsed_reqmem_fit = log_runtime_df" ] }, { @@ -397,8 +498,30 @@ "\n", "# sets to clusters and returns the cluster points\n", "kmeans_elapsed_reqmem = KMeans(n_clusters=3, random_state=111)\n", - "kmeans_elapsed_reqmem.fit(normalized_runtime_df)\n", - "clusterpoints_elapsed_reqmem = kmeans_elapsed_reqmem.cluster_centers_ * df_runtime_cluster_max" + "kmeans_elapsed_reqmem.fit(elapsed_reqmem_fit)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# In the cell below, choose which cluster center to use - uncomment the line that goes with the normalization type" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# uncomment if no normalization\n", + "#clusterpoints_elapsed_reqmem = kmeans_elapsed_reqmem.cluster_centers_\n", + "\n", + "# uncomment if 0-1 normalization\n", + "#clusterpoints_elapsed_reqmem = kmeans_elapsed_reqmem.cluster_centers_ * df_runtime_cluster_max\n", + "\n", + "# uncomment if log10 normalization\n", + "clusterpoints_elapsed_reqmem = 10 ** (kmeans_elapsed_reqmem.cluster_centers_) - 1" ] }, { @@ -612,6 +735,29 @@ "All the datasets are created for both the cluster graphs and plots of each cluster before those graphs are made." 
] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# In the cell below, set the fit based on the normalization type by uncommenting the line to run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# uncomment for no normalization\n", + "#elapsed_alloc_fit = df_runtime_cluster\n", + "\n", + "# uncomment for 0-1 normalization\n", + "#elapsed_alloc_fit = normalized_runtime_df\n", + "\n", + "# uncomment for log10 normalization\n", + "elapsed_alloc_fit = log_runtime_df" + ] + }, { "cell_type": "code", "execution_count": null, @@ -622,8 +768,30 @@ "\n", "# sets to clusters and returns the cluster points\n", "kmeans_elapsed_alloc = KMeans(n_clusters=3, random_state=111)\n", - "kmeans_elapsed_alloc.fit(normalized_runtime_df)\n", - "clusterpoints_elapsed_alloc = kmeans_elapsed_alloc.cluster_centers_ * df_runtime_cluster_max" + "kmeans_elapsed_alloc.fit(elapsed_alloc_fit)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# In the cell below, choose which cluster center to use - uncomment the line that goes with the normalization type" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# uncomment if no normalization\n", + "#clusterpoints_elapsed_alloc = kmeans_elapsed_alloc.cluster_centers_\n", + "\n", + "# uncomment if 0-1 normalization\n", + "#clusterpoints_elapsed_alloc = kmeans_elapsed_alloc.cluster_centers_ * df_runtime_cluster_max\n", + "\n", + "# uncomment if log10 normalization\n", + "clusterpoints_elapsed_alloc = 10 ** (kmeans_elapsed_alloc.cluster_centers_) - 1" + ] + }, { "cell_type": "code", "execution_count": null, @@ -851,15 +1019,50 @@ "metadata": {}, "outputs": [], "source": [ + "# must run if dataset will not be normalized\n", "\n", "#ReqMemCPU = 0 - 50 gigs\n", "#AllocCPUS = 0 - 50 cores\n", "#Elapsed = 0 - 150.02 hours\n", "\n", + "# non normalized dataset\n", + "# used for fitting for the Alloc/ReqMem graph without 
normalization\n", "df_alloc_cluster = df_facet.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']]\n", "df_alloc_cluster.head(5)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run if dataset will be 0-1 normalized\n", + "\n", + "# 0-1 normalized dataset\n", + "# used for 0-1 normalization fitting for the Alloc/ReqMem graph\n", + "column_maxes_alloc = df_alloc_cluster.max()\n", + "df_alloc_cluster_max = column_maxes_alloc.max()\n", + "normalized_alloc_df = df_alloc_cluster / df_alloc_cluster_max\n", + "\n", + "print(normalized_alloc_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run if dataset will be log10 normalized for both Elapsed/ReqMem and Elapsed/Alloc graphs\n", + "\n", + "# log10 normalized dataset\n", + "# used for log10 normalization fitting for both the Elapsed/ReqMem and Elapsed/Alloc graphs \n", + "\n", + "log_alloc_df = np.log10(df_alloc_cluster+1)\n", + "log_alloc_df.describe()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -893,17 +1096,27 @@ "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# In the cell below, set the fit based on the normalization type by uncommenting the line to run" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "column_maxes_alloc = df_alloc_cluster.max()\n", - "df_alloc_cluster_max = column_maxes_alloc.max()\n", - "normalized_alloc_df = df_alloc_cluster / df_alloc_cluster_max\n", + "# uncomment for no normalization\n", + "#alloc_reqmem_fit = df_alloc_cluster\n", "\n", - "print(normalized_alloc_df)" + "# uncomment for 0-1 normalization\n", + "#alloc_reqmem_fit = normalized_alloc_df\n", + "\n", + "# uncomment for log10 normalization\n", + "alloc_reqmem_fit = log_alloc_df" ] }, { @@ -916,15 +1129,56 @@ "\n", "# sets to clusters and returns the cluster points\n", "kmeans_alloc_reqmem = 
KMeans(n_clusters=3, random_state=111)\n", - "kmeans_alloc_reqmem.fit(normalized_alloc_df)\n", - "clusterpoints_alloc_reqmem = kmeans_alloc_reqmem.cluster_centers_ * df_alloc_cluster_max" + "kmeans_alloc_reqmem.fit(alloc_reqmem_fit)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The next 5 cells create the clusters, find each cluster label, and create datasets of data in each cluster.\n", + "# In the cell below, choose which cluster center to use - uncomment the line that goes with the normalization type" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# uncomment if no normalization\n", + "#clusterpoints_alloc_reqmem = kmeans_alloc_reqmem.cluster_centers_\n", + "\n", + "# uncomment if 0-1 normalization\n", + "#clusterpoints_alloc_reqmem = kmeans_alloc_reqmem.cluster_centers_ * df_alloc_cluster_max\n", + "\n", + "# uncomment if log10 normalization\n", + "clusterpoints_alloc_reqmem = (10 ** (kmeans_alloc_reqmem.cluster_centers_)) - 1\n", + "print(clusterpoints_alloc_reqmem)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "clusterpoints_alloc_reqmem[:,0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "clusterpoints_alloc_reqmem[:,2]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next 5 cells find each cluster label, and create datasets of data in each cluster.\n", "All the datasets are created for both the cluster graphs and plots of each cluster before those graphs are made." 
] }, @@ -1029,14 +1283,14 @@ "\n", "alloc_reqmem_cluster_graph = figure.add_subplot(121)\n", "alloc_reqmem_cluster_graph.scatter(df_alloc_cluster['ReqMemCPU'],df_alloc_cluster['AllocCPUS'], c=kmeans_alloc_reqmem.labels_, cmap='rainbow')\n", - "alloc_reqmem_cluster_graph.scatter(clusterpoints_alloc_reqmem[:,0] ,clusterpoints_alloc_reqmem[:,1], color='black')\n", + "alloc_reqmem_cluster_graph.scatter(clusterpoints_alloc_reqmem[:,0] ,clusterpoints_alloc_reqmem[:,2], color='black')\n", "plt.xlabel('ReqMemCPU(gigs)')\n", "plt.ylabel('AllocCPUS')\n", "\n", "# 3d veiw of the scatterplot for better understanding of the data\n", "alloc_reqmem_clustergraph_3d = figure.add_subplot(122, projection='3d')\n", "alloc_reqmem_clustergraph_3d.scatter(df_alloc_cluster['ReqMemCPU'], df_alloc_cluster['AllocCPUS'], df_alloc_cluster['Elapsed'], c=kmeans_alloc_reqmem.labels_ ,cmap='rainbow')\n", - "alloc_reqmem_clustergraph_3d.scatter(clusterpoints_alloc_reqmem[:,0] ,clusterpoints_alloc_reqmem[:,1], color='black')\n", + "alloc_reqmem_clustergraph_3d.scatter(clusterpoints_alloc_reqmem[:,0] ,clusterpoints_alloc_reqmem[:,2], color='black')\n", "alloc_reqmem_clustergraph_3d.set_xlabel('ReqMemCPU(gigs')\n", "alloc_reqmem_clustergraph_3d.set_ylabel('AllocCPUS')\n", "alloc_reqmem_clustergraph_3d.set_zlabel('Elapsed(hours)')\n", -- GitLab