clean up

d16fd96e · Ryan Randles Jones · c1b8abbc · d16fd96e
Commit d16fd96e authored 4 years ago by Ryan Randles Jones
--- a/Cluster_Analysis.ipynb
+++ b/Cluster_Analysis.ipynb
@@ -81,8 +81,8 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "start_date = '2020-11-01'\n",
-    "end_date = '2020-11-23'"
+    "start_date = '2021-01-01'\n",
+    "end_date = '2021-01-08'"
   ]
  },
  {
@@ -156,10 +156,10 @@
   "outputs": [],
   "source": [
    "xaxis_min = 0\n",
-    "xaxis_max = 140\n",
+    "xaxis_max = 80\n",
    "\n",
    "yaxis_min = 0\n",
-    "yaxis_max = 100"
+    "yaxis_max = 20"
   ]
  },
  {
@@ -756,7 +756,7 @@
    "\n",
    "#####Blue\n",
    "ax = fig.add_subplot(431)\n",
-    "rqmem_elapsed_blue_hist = ax.hist2d(df_1_2d1['ReqMemCPU'],df_1_2d1['Elapsed'], \n",
+    "rqmem_elapsed_blue_hist = ax.hist2d(cluster_1['ReqMemCPU'],cluster_1['Elapsed'], \n",
    "                                bins =[x_blue_rqmem_elapsed_bins, y_blue_rqmem_elapsed_bins], \n",
    "                                      cmap = plt.cm.Blues)\n",
    "ax.set_xlabel('ReqMemCPU(gigs)')\n",
@@ -767,7 +767,7 @@
    "\n",
    "\n",
    "ax2 = fig.add_subplot(432)\n",
-    "alloc_elapsed_blue_hist = ax2.hist2d(df_1_2d2['AllocCPUS'],df_1_2d2['Elapsed'], \n",
+    "alloc_elapsed_blue_hist = ax2.hist2d(cluster_1['AllocCPUS'],cluster_1['Elapsed'], \n",
    "                                bins =[x_blue_alloc_elapsed_bins, y_blue_alloc_elapsed_bins],\n",
    "                                      cmap = plt.cm.Blues)\n",
    "ax2.set_xlabel('AllocCPUS')\n",
@@ -778,7 +778,7 @@
    "\n",
    "\n",
    "ax3 = fig.add_subplot(433)\n",
-    "reqmem_alloc_blue_hist = ax3.hist2d(df_1_2d3['ReqMemCPU'],df_1_2d3['AllocCPUS'], \n",
+    "reqmem_alloc_blue_hist = ax3.hist2d(cluster_1['ReqMemCPU'],cluster_1['AllocCPUS'], \n",
    "                                bins =[x_blue_reqmem_alloc_bins, y_blue_reqmem_alloc_bins],\n",
    "                                     cmap = plt.cm.Blues)\n",
    "ax3.set_xlabel('ReqMemCPU(gigs)')\n",
@@ -791,7 +791,7 @@
    " \n",
    "####Purple\n",
    "ax4 = fig.add_subplot(434) # This represents a (3x3) grid (row x col) and we are plotting the (1) subplot. The last number increments row-wise.\n",
-    "rqmem_elapsed_purple_hist = ax4.hist2d(df_0_2d1['ReqMemCPU'],df_0_2d1['Elapsed'], \n",
+    "rqmem_elapsed_purple_hist = ax4.hist2d(cluster_0['ReqMemCPU'],cluster_0['Elapsed'], \n",
    "                                bins =[x_purple_rqmem_elapsed_bins, y_purple_rqmem_elapsed_bins], \n",
    "                                      cmap = plt.cm.Blues)\n",
    "ax4.set_xlabel('ReqMemCPU(gigs)')\n",
@@ -803,7 +803,7 @@
    "\n",
    "\n",
    "ax5 = fig.add_subplot(435) # Second subplot\n",
-    "alloc_elapsed_purple_hist = ax5.hist2d(df_0_2d2['AllocCPUS'],df_0_2d2['Elapsed'], \n",
+    "alloc_elapsed_purple_hist = ax5.hist2d(cluster_0['AllocCPUS'],cluster_0['Elapsed'], \n",
    "                                bins =[x_purple_alloc_elapsed_bins, y_purple_alloc_elapsed_bins], \n",
    "                                       cmap = plt.cm.Blues)\n",
    "ax5.set_xlabel('AllocCPUS')\n",
@@ -814,7 +814,7 @@
    "\n",
    "\n",
    "ax6 = fig.add_subplot(436)\n",
-    "reqmem_alloc_purple_hist = ax6.hist2d(df_0_2d3['ReqMemCPU'],df_0_2d3['AllocCPUS'], \n",
+    "reqmem_alloc_purple_hist = ax6.hist2d(cluster_0['ReqMemCPU'],cluster_0['AllocCPUS'], \n",
    "                                bins =[x_purple_reqmem_alloc_bins, y_purple_reqmem_alloc_bins], \n",
    "                                      cmap = plt.cm.Blues) # use magma or\n",
    "ax6.set_xlabel('ReqMemCPU(gigs)')\n",
@@ -826,7 +826,7 @@
    "\n",
    "#####Red\n",
    "ax7 = fig.add_subplot(437)\n",
-    "rqmem_elapsed_red_hist = ax7.hist2d(df_3_2d1['ReqMemCPU'],df_3_2d1['Elapsed'], \n",
+    "rqmem_elapsed_red_hist = ax7.hist2d(cluster_3['ReqMemCPU'],cluster_3['Elapsed'], \n",
    "                                bins =[x_red_rqmem_elapsed_bins, y_red_rqmem_elapsed_bins],\n",
    "                                    cmap = plt.cm.Blues)\n",
    "ax7.set_xlabel('ReqMemCPU(gigs)')\n",
@@ -837,7 +837,7 @@
    "\n",
    "\n",
    "ax8 = fig.add_subplot(438)\n",
-    "alloc_elapsed_red_hist = ax8.hist2d(df_3_2d2['AllocCPUS'],df_3_2d2['Elapsed'], \n",
+    "alloc_elapsed_red_hist = ax8.hist2d(cluster_3['AllocCPUS'],cluster_3['Elapsed'], \n",
    "                                bins =[x_red_reqmem_alloc_bins, y_red_reqmem_alloc_bins],\n",
    "                                    cmap = plt.cm.Blues)\n",
    "ax8.set_xlabel('AllocCPUS')\n",
@@ -848,7 +848,7 @@
    "\n",
    "\n",
    "ax9 = fig.add_subplot(439)\n",
-    "reqmem_alloc_red_hist = ax9.hist2d(df_3_2d3['ReqMemCPU'],df_3_2d3['AllocCPUS'], \n",
+    "reqmem_alloc_red_hist = ax9.hist2d(cluster_3['ReqMemCPU'],cluster_3['AllocCPUS'], \n",
    "                                bins =[x_red_reqmem_alloc_bins, y_red_reqmem_alloc_bins],\n",
    "                                   cmap = plt.cm.Blues)\n",
    "ax9.set_xlabel('ReqMemCPU(gigs)')\n",
@@ -861,7 +861,7 @@
    "\n",
    "#####Yellow\n",
    "ax10 = fig.add_subplot(4,3,10)\n",
-    "rqmem_elapsed_yellow_hist = ax10.hist2d(df_2_2d1['ReqMemCPU'],df_2_2d1['Elapsed'], \n",
+    "rqmem_elapsed_yellow_hist = ax10.hist2d(cluster_2['ReqMemCPU'],cluster_2['Elapsed'], \n",
    "                                bins =[x_yellow_rqmem_elapsed_bins, y_yellow_rqmem_elapsed_bins],\n",
    "                                    cmap = plt.cm.Blues)\n",
    "ax10.set_xlabel('ReqMemCPU(gigs)')\n",
@@ -872,7 +872,7 @@
    "\n",
    "\n",
    "ax11 = fig.add_subplot(4,3,11)\n",
-    "alloc_elapsed_yellow_hist = ax11.hist2d(df_2_2d2['AllocCPUS'],df_2_2d2['Elapsed'], \n",
+    "alloc_elapsed_yellow_hist = ax11.hist2d(cluster_2['AllocCPUS'],cluster_2['Elapsed'], \n",
    "                                bins =[x_yellow_reqmem_alloc_bins, y_yellow_reqmem_alloc_bins],\n",
    "                                    cmap = plt.cm.Blues)\n",
    "ax11.set_xlabel('AllocCPUS')\n",
@@ -883,7 +883,7 @@
    "\n",
    "\n",
    "ax12 = fig.add_subplot(4,3,12)\n",
-    "reqmem_alloc_yellow_hist = ax12.hist2d(df_2_2d3['ReqMemCPU'],df_2_2d3['AllocCPUS'], \n",
+    "reqmem_alloc_yellow_hist = ax12.hist2d(cluster_2['ReqMemCPU'],cluster_2['AllocCPUS'], \n",
    "                                bins =[x_yellow_reqmem_alloc_bins, y_yellow_reqmem_alloc_bins],\n",
    "                                   cmap = plt.cm.Blues)\n",
    "ax12.set_xlabel('ReqMemCPU(gigs)')\n",
@@ -914,7 +914,39 @@
   "execution_count": null,
   "metadata": {},
   "outputs": [],
-   "source": []
+   "source": [
+    "xaxis_min = 0\n",
+    "xaxis_max = 10\n",
+    "\n",
+    "yaxis_min = 0\n",
+    "yaxis_max = 5"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# only the darkest spots\n",
+    "\n",
+    "# blue reqmem/elapsed denstiy spot: reqmem = 16-17 ,elapsed = 0-1\n",
+    "# blue alloc/elapsed denstiy spot: alloc = 4-5 ,elapsed = 0-1\n",
+    "# blue reqmem/alloc denstiy spot: reqmem = 16-17 ,alloc = 4-5\n",
+    "\n",
+    "# purple reqmem/elapsed denstiy spot: reqmem = 8-9 ,elapsed = 0-1\n",
+    "# purple alloc/elapsed denstiy spot: alloc = 1-2 ,elapsed = 0-1\n",
+    "# purple reqmem/alloc denstiy spot: reqmem = 8-9 ,alloc = 1-2\n",
+    "\n",
+    "# red reqmem/elapsed denstiy spot: reqmem = ,elapsed = \n",
+    "# red alloc/elapsed denstiy spot: alloc = ,elapsed = \n",
+    "# red reqmem/alloc denstiy spot: reqmem = ,alloc = \n",
+    "\n",
+    "# yellow reqmem/elapsed denstiy spot: reqmem = ,elapsed = \n",
+    "# yellow alloc/elapsed denstiy spot: alloc = ,elapsed = \n",
+    "# yellow reqmem/alloc density spot: reqmem = 0-10, alloc = 1-2\n",
+    "\n"
+   ]
  }
 ],
 "metadata": {

 %% Cell type:markdown id: tags:

 # Purpose

 %% Cell type:markdown id: tags:

 This notebook is for the clustering analysis of ReqMemCPU, AllocCPUS, and Elapsed.
 ReqMemCPU is the amount of RAM, in gigs, requested by the user for each job.
 AllocCPUS is the number of cores allocated to each job.
 Elapsed is the amount of time, in hours, that each job took to run.

 %% Cell type:markdown id: tags:

 # Assumptions and Restrictions

 %% Cell type:markdown id: tags:

 Based on extensive data and clustering exploration, this notebook is set to graph up to 4 clusters (n_clusters = 4 in kmeans). In order to raise the number of clusters, more code will have to be added to add more 2d histograms of those extra cluster groups. And in order to lower the number of clusters, the code would have to be modified to expect fewer than 4 clusters as an input.

 %% Cell type:markdown id: tags:

 # Data Setup Options

 %% Cell type:markdown id: tags:

 There are 6 decisions to make in the set up of the data.

 Date Range: Choose a start date and an end date of data that you want to see cluster analysis of.
            The format is yyyy-mm-dd

 Bracketing Values: Choose a minimum and maximum value for ReqMemCPU, AllocCPUS, and Elapsed.
                   These values will allow you to "zoom in" or "zoom out" on your data.

 1. Upper/LowerGB - min/max ReqMemCPU: most of the data lies between 1 and 150 gigs.
    Most of the ReqMemCPU above 150 are outliers
 2. Upper/LowerAllocCPU - min/max AllocCPUS: most of the data lies between 1 and 260 cores.
    Most of the AllocCPUS above 260 are outliers
 3. Upper/LowerElapsed - min/max Elapsed: 150.02 hours is the highest value Elapsed reaches.

 Data Normalization: There are three choices for normalization of the data - 'none', '0-1', or 'log'
 1. 'none' - no data normalization. Data is clustered and graphed as is.
 2. '0-1'- all data in the date range and bracketing ranges chosen will be scaled to have values between 0 and 1.
    This would be useful if your bracketing ranges differ greatly from each other.
 3. 'log' - all data in the date range and bracketing ranges chosen will be scaled to have log values.
    This would be useful if your bracketing ranges create data that is very large and would be easier to
    visualize with log values.


 2D Histogram X and Y Axes: This will set a min and max for the x and y axes in the 2D histograms of each of the four clusters. All the x and y axes are the same across the 2d histograms. This allows the user to "zoom" in or out of the data.

 %% Cell type:markdown id: tags:

 ## Date Range

 %% Cell type:code id: tags:

 ``` 
-start_date = '2020-11-01'
-end_date = '2020-11-23'
+start_date = '2021-01-01'
+end_date = '2021-01-08'
 ```

 %% Cell type:markdown id: tags:

 ## Bracketing Values

 %% Cell type:code id: tags:

 ``` 
 # sets min and max parameters for ReqMemCPU - user requested
 # Units are gigs; both bounds are inclusive in the df_clustering filter (>= / <=).
 LowerlimitGB = 0
 UpperlimitGB = 150
 ```

 %% Cell type:code id: tags:

 ``` 
 # sets min and max parameters for AllocCPUS - allocated by slurm
 # Both bounds are inclusive in the df_clustering filter (>= / <=).
 LowerlimitAllocCPU = 0
 UpperlimitAllocCPU = 260
 ```

 %% Cell type:code id: tags:

 ``` 
 # sets min and max parameters for Elapsed
 # Units are hours; both bounds are inclusive in the df_clustering filter (>= / <=).
 LowerlimitElapsed = 0
 UpperlimitElapsed = 150.02 # = 6.25 days
 ```

 %% Cell type:markdown id: tags:

 ## Data Normalization

 %% Cell type:code id: tags:

 ``` 
 # Enter 'none', '0-1', or 'log' as a choice for data normalization
 # Any other value is treated like 'none' by the normalization cell's else branch.
 Data_Normalization_Choice = 'none'
 ```

 %% Cell type:markdown id: tags:

 # 2D Histogram X and Y Axes

 %% Cell type:code id: tags:

 ``` 
 xaxis_min = 0
-xaxis_max = 140
+xaxis_max = 80

 yaxis_min = 0
-yaxis_max = 100
+yaxis_max = 20
 ```

 %% Cell type:markdown id: tags:

 # Imports

 %% Cell type:code id: tags:

 ``` 
 # must run

 import sqlite3
 import slurm2sql
 import pandas as pd
 import matplotlib.pyplot as plt
 %matplotlib inline
 import seaborn as sns
 import seaborn as sb
 import plotly.express as px
 import matplotlib.ticker as ticker
 import numpy as np
 from mpl_toolkits.mplot3d import Axes3D
 import os
 from RC_styles import rc_styles as style
 from sklearn.cluster import KMeans
 ```

 %% Cell type:markdown id: tags:

 # Database Creation

 %% Cell type:code id: tags:

 ``` 
 # open (or create) the sqlite file named after the selected date range
 db = sqlite3.connect(f'cluster_analysis_from_{start_date}to{end_date}.db')
 ```

 %% Cell type:code id: tags:

 ``` 
 #creating a database based on the start date
 # Hands the open connection and a list of command-line style flags to slurm2sql.
 # NOTE(review): presumably the flags are forwarded to sacct and the results land in
 # the 'slurm' table queried by the next cell - confirm against the slurm2sql docs.
 slurm2sql.slurm2sql(db, ['-S',start_date, '-E', end_date,'-X', '-a']) #-X is allocations, -a is all users
 ```

 %% Cell type:code id: tags:

 ``` 
 # load every row of the slurm table into a dataframe
 df = pd.read_sql(sql='SELECT * FROM slurm', con=db)
 ```

 %% Cell type:code id: tags:

 ``` 
 #Deleting the database
 # The data already lives in df, so the connection is closed before the files
 # are removed. The -shm/-wal side files only exist while SQLite keeps a
 # write-ahead log, so a missing file is skipped instead of raising.
 db.close()
 db_path = 'cluster_analysis_from_'+str(start_date)+'to'+str(end_date)+'.db'
 for db_suffix in ('', '-shm', '-wal'):
     try:
         os.remove(db_path + db_suffix)
     except FileNotFoundError:
         pass  # nothing to clean up for this suffix
 ```

 %% Cell type:markdown id: tags:

 # Dataset Creation

 %% Cell type:code id: tags:

 ``` 
 # df_1 keeps only the jobs whose State text contains 'COMPLETED'
 df_1 = df[df['State'].str.contains('COMPLETED')]
 df_1.head(20)
 ```

 %% Cell type:code id: tags:

 ``` 
 # dataset of needed columns for all graphs below
 # The column order chosen here (ReqMemCPU, Elapsed, AllocCPUS) is the order of the
 # columns later handed to KMeans, so cluster centers are indexed 0, 1, 2 in that order.
 df_completed = df_1.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']]
 #df_1.head(5)
 ```

 %% Cell type:code id: tags:

 ``` 
 # converts units in ReqMemCPU column from bytes to gigs and rounds up to nearest whole number
 # NOTE: the column is overwritten in place, so running this cell a second time
 # divides the already-converted values by 1024**3 again.
 df_completed['ReqMemCPU'] = df_completed['ReqMemCPU'].div(1024**3).apply(np.ceil).apply(int)
 #df_completed.head()
 ```

 %% Cell type:code id: tags:

 ``` 
 # converts Elapsed time to hours (from seconds) and rounds to the nearest 2 decimal places
 # NOTE: the column is overwritten in place, so running this cell a second time
 # divides the already-converted values by 3600 again.
 df_completed['Elapsed'] = df_completed['Elapsed'].div(3600).round(2)
 ```

 %% Cell type:code id: tags:

 ``` 
 # keeps the completed jobs whose ReqMemCPU, AllocCPUS, and Elapsed all fall inside
 # the (inclusive) min/max brackets chosen in the setup cells
 df_clustering = df_completed[
     df_completed['ReqMemCPU'].between(LowerlimitGB, UpperlimitGB)
     & df_completed['AllocCPUS'].between(LowerlimitAllocCPU, UpperlimitAllocCPU)
     & df_completed['Elapsed'].between(LowerlimitElapsed, UpperlimitElapsed)
 ]
 ```

 %% Cell type:markdown id: tags:

 # Elbow Method to Determine Number of Clusters

 %% Cell type:code id: tags:

 ``` 
 # sets up info for plotting the optimal number of clusters - uses the df_clustering dataset
 # Fits KMeans once per k in 1..9 and records each fit's inertia_ for the elbow plot.
 # random_state is pinned to the same value as the final clustering cell so the
 # curve is identical on every Restart & Run All.
 Sum_of_squared_distances = []
 K = range(1,10)
 for k in K:
    km = KMeans(n_clusters=k, random_state=111)
    km = km.fit(df_clustering)
    Sum_of_squared_distances.append(km.inertia_)
 ```

 %% Cell type:code id: tags:

 ``` 
 # elbow curve: the bend marks the suggested number of clusters for the df_clustering dataset
 plt.plot(K, Sum_of_squared_distances, color='b', marker='x', linestyle='-')
 plt.title('Elbow Method For Optimal k')
 plt.xlabel('k')
 plt.ylabel('Sum_of_squared_distances')
 plt.show()
 ```

 %% Cell type:markdown id: tags:

 # Normalizing the Data for ReqMem/Elapsed

 %% Cell type:code id: tags:

 ``` 
 # Builds `fit`, the data handed to KMeans, according to Data_Normalization_Choice.
 # The "Reverting Cluster Points" cell applies the inverse of whichever branch runs here.
 if Data_Normalization_Choice == '0-1':
    # column_max holds one maximum per column; df_clustering_max is the single largest
    # of those, so every column is divided by the same number.
    # NOTE(review): only the column owning that maximum reaches 1.0 - confirm that a
    # shared divisor (rather than per-column scaling) is intended.
    column_max = df_clustering.max()
    df_clustering_max = column_max.max()
    fit = df_clustering / df_clustering_max
    print("0-1")

 elif Data_Normalization_Choice == 'log':
    # +1 keeps zero values finite under log10
    fit = np.log10(df_clustering+1)
    print("log")

 else:
    # any other choice clusters the data unchanged
    fit = df_clustering
    print("none")
 ```

 %% Cell type:markdown id: tags:

 # kmeans Clustering

 %% Cell type:code id: tags:

 ``` 
 # fits k-means with 4 clusters on the prepared data and prints the cluster centers
 kmeans_cluster = KMeans(n_clusters=4, random_state=111).fit(fit)
 print(kmeans_cluster.cluster_centers_)
 ```

 %% Cell type:markdown id: tags:

 # Reverting Cluster Points Back to align with UnNormalized data

 %% Cell type:code id: tags:

 ``` 
 # Maps the fitted cluster centers back to the original units by undoing the
 # transformation applied in the normalization cell.
 if Data_Normalization_Choice == '0-1':
    # inverse of dividing by df_clustering_max
    clusterpoints = kmeans_cluster.cluster_centers_ * df_clustering_max
    print("0-1")

 elif Data_Normalization_Choice == 'log':
    # inverse of log10(x + 1)
    clusterpoints = 10 ** (kmeans_cluster.cluster_centers_) - 1
    print("log")

 else:
    clusterpoints = kmeans_cluster.cluster_centers_
    print("none")
    # NOTE(review): this print of the first two center columns only runs in the
    # 'none' branch - confirm whether it was meant to run for every choice.
    print(clusterpoints[:,0],clusterpoints[:,1])
 ```

 %% Cell type:markdown id: tags:

 # Separating the Clusters for 2d Histograms

 %% Cell type:code id: tags:

 ``` 
 # Splits df_clustering into one dataframe per k-means label (0-3).
 # The color names match the labels used in the summary and histogram cells.

 #Purple
 cluster_0 = df_clustering[kmeans_cluster.labels_ == 0]

 #Blue
 cluster_1 = df_clustering[kmeans_cluster.labels_ == 1]

 #Yellow
 cluster_2 = df_clustering[kmeans_cluster.labels_ == 2]

 #Red
 cluster_3 = df_clustering[kmeans_cluster.labels_ == 3]
 ```

 %% Cell type:code id: tags:

 ``` 
 # Prints the observed min and max of ReqMemCPU, Elapsed, and AllocCPUS for each cluster,
 # one titled section per cluster, separated by a blank line.
 # These are the parameters for the scatter plots of each cluster
 for _position, (_color, _cluster) in enumerate([("Purple", cluster_0), ("Blue", cluster_1),
                                                 ("Yellow", cluster_2), ("Red", cluster_3)]):
     _header = _color + " Cluster"
     print(_header if _position == 0 else "\n" + _header)
     for _column in ("ReqMemCPU", "Elapsed", "AllocCPUS"):
         print(_column + ":", "min =", _cluster[_column].min(), " ", "max =", _cluster[_column].max())
 ```

 %% Cell type:code id: tags:

 ``` 
 # Builds, for each cluster, three frames grouped by a pair of columns.
 # groupby(...).sum().reset_index() leaves ONE row per distinct pair and sums the
 # remaining column, so repeated jobs are collapsed and per-pair job counts are lost.
 # NOTE(review): the 2d histogram cell now passes the cluster_N columns to hist2d,
 # so no cell visible in this notebook reads these frames - confirm they can be removed.

 # for purple cluster
 df_0_2d1 = cluster_0.groupby(['ReqMemCPU','Elapsed']).sum().reset_index()
 df_0_2d2 = cluster_0.groupby(['AllocCPUS','Elapsed']).sum().reset_index()
 df_0_2d3 = cluster_0.groupby(['ReqMemCPU','AllocCPUS']).sum().reset_index()

 # for blue cluster
 df_1_2d1 = cluster_1.groupby(['ReqMemCPU','Elapsed']).sum().reset_index()
 df_1_2d2 = cluster_1.groupby(['AllocCPUS','Elapsed']).sum().reset_index()
 df_1_2d3 = cluster_1.groupby(['ReqMemCPU','AllocCPUS']).sum().reset_index()

 # for yellow cluster
 df_2_2d1 = cluster_2.groupby(['ReqMemCPU','Elapsed']).sum().reset_index()
 df_2_2d2 = cluster_2.groupby(['AllocCPUS','Elapsed']).sum().reset_index()
 df_2_2d3 = cluster_2.groupby(['ReqMemCPU','AllocCPUS']).sum().reset_index()

 # for red cluster
 df_3_2d1 = cluster_3.groupby(['ReqMemCPU','Elapsed']).sum().reset_index()
 df_3_2d2 = cluster_3.groupby(['AllocCPUS','Elapsed']).sum().reset_index()
 df_3_2d3 = cluster_3.groupby(['ReqMemCPU','AllocCPUS']).sum().reset_index()
 ```

 %% Cell type:code id: tags:

 ``` 
 # Creating bins
 #
 # Every histogram axis gets unit-width bins (one per gig, per cpu, or per hour).
 # hist2d drops points that lie outside the outermost bin edges, so the edges must
 # reach past the largest observed value.

 def _unit_bin_edges(max_value):
     """Return the bin edges 0, 1, ..., int(max_value) + 1.

     The last edge lies above ``max_value`` so the largest observation falls in
     its own bin, and at least two edges (one bin) are returned even when
     ``max_value`` is below 1. Values are assumed to be non-negative.
     """
     return list(range(int(max_value) + 2))


 ####Purple
 purple_rqmem_min = cluster_0.ReqMemCPU.min()
 purple_rqmem_max = cluster_0.ReqMemCPU.max()

 purple_elapsed_min = cluster_0.Elapsed.min()
 purple_elapsed_max = cluster_0.Elapsed.max()

 purple_alloc_min = cluster_0.AllocCPUS.min()
 purple_alloc_max = cluster_0.AllocCPUS.max()


 x_purple_rqmem_elapsed_bins = _unit_bin_edges(purple_rqmem_max)
 y_purple_rqmem_elapsed_bins = _unit_bin_edges(purple_elapsed_max)

 x_purple_alloc_elapsed_bins = _unit_bin_edges(purple_alloc_max)
 y_purple_alloc_elapsed_bins = _unit_bin_edges(purple_elapsed_max)

 x_purple_reqmem_alloc_bins = _unit_bin_edges(purple_rqmem_max)
 y_purple_reqmem_alloc_bins = _unit_bin_edges(purple_alloc_max)


 ####Blue
 blue_rqmem_min = cluster_1.ReqMemCPU.min()
 blue_rqmem_max = cluster_1.ReqMemCPU.max()

 blue_elapsed_min = cluster_1.Elapsed.min()
 blue_elapsed_max = cluster_1.Elapsed.max()

 blue_alloc_min = cluster_1.AllocCPUS.min()
 blue_alloc_max = cluster_1.AllocCPUS.max()


 x_blue_rqmem_elapsed_bins = _unit_bin_edges(blue_rqmem_max)
 y_blue_rqmem_elapsed_bins = _unit_bin_edges(blue_elapsed_max)

 x_blue_alloc_elapsed_bins = _unit_bin_edges(blue_alloc_max)
 y_blue_alloc_elapsed_bins = _unit_bin_edges(blue_elapsed_max)

 x_blue_reqmem_alloc_bins = _unit_bin_edges(blue_rqmem_max)
 y_blue_reqmem_alloc_bins = _unit_bin_edges(blue_alloc_max)

 ####Yellow
 yellow_rqmem_min = cluster_2.ReqMemCPU.min()
 yellow_rqmem_max = cluster_2.ReqMemCPU.max()

 yellow_elapsed_min = cluster_2.Elapsed.min()
 yellow_elapsed_max = cluster_2.Elapsed.max()

 yellow_alloc_min = cluster_2.AllocCPUS.min()
 yellow_alloc_max = cluster_2.AllocCPUS.max()


 x_yellow_rqmem_elapsed_bins = _unit_bin_edges(yellow_rqmem_max)
 y_yellow_rqmem_elapsed_bins = _unit_bin_edges(yellow_elapsed_max)

 x_yellow_alloc_elapsed_bins = _unit_bin_edges(yellow_alloc_max)
 y_yellow_alloc_elapsed_bins = _unit_bin_edges(yellow_elapsed_max)

 x_yellow_reqmem_alloc_bins = _unit_bin_edges(yellow_rqmem_max) # one bin per gig
 y_yellow_reqmem_alloc_bins = _unit_bin_edges(yellow_alloc_max) # one bin per cpu


 ####Red
 red_rqmem_min = cluster_3.ReqMemCPU.min()
 red_rqmem_max = cluster_3.ReqMemCPU.max()

 red_elapsed_min = cluster_3.Elapsed.min()
 red_elapsed_max = cluster_3.Elapsed.max()

 red_alloc_min = cluster_3.AllocCPUS.min()
 red_alloc_max = cluster_3.AllocCPUS.max()


 x_red_rqmem_elapsed_bins = _unit_bin_edges(red_rqmem_max)
 y_red_rqmem_elapsed_bins = _unit_bin_edges(red_elapsed_max)

 x_red_alloc_elapsed_bins = _unit_bin_edges(red_alloc_max)
 y_red_alloc_elapsed_bins = _unit_bin_edges(red_elapsed_max)

 x_red_reqmem_alloc_bins = _unit_bin_edges(red_rqmem_max) # one bin per gig
 y_red_reqmem_alloc_bins = _unit_bin_edges(red_alloc_max) # one bin per cpu
 ```

 %% Cell type:markdown id: tags:

 # Summary Stats

 %% Cell type:code id: tags:

 ``` 
 # Job count and distinct-user count for every cluster.
 # Each users_N frame is the cluster's three clustering columns plus a 'user' column
 # filled from df_1['User']; the assignment lines rows up by index. Dropping duplicate
 # users then leaves one row per user.

 # purple cluster
 users_0 = cluster_0.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']]
 users_0['user'] = df_1['User']
 cluster_0_users = users_0.drop_duplicates(subset=['user'])
 cluster_0_jobs = len(cluster_0)

 # blue cluster
 users_1 = cluster_1.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']]
 users_1['user'] = df_1['User']
 cluster_1_users = users_1.drop_duplicates(subset=['user'])
 cluster_1_jobs = len(cluster_1)

 # yellow cluster
 users_2 = cluster_2.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']]
 users_2['user'] = df_1['User']
 cluster_2_users = users_2.drop_duplicates(subset=['user'])
 cluster_2_jobs = len(cluster_2)

 # red cluster
 users_3 = cluster_3.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']]
 users_3['user'] = df_1['User']
 cluster_3_users = users_3.drop_duplicates(subset=['user'])
 cluster_3_jobs = len(cluster_3)


 summary_stats = pd.DataFrame(
     {
         'Job Count': [cluster_0_jobs, cluster_1_jobs, cluster_2_jobs, cluster_3_jobs],
         'User Count': [len(cluster_0_users), len(cluster_1_users),
                        len(cluster_2_users), len(cluster_3_users)],
     },
     index=['Purple Cluster','Blue Cluster', 'Yellow Cluster', 'Red Cluster'],
 )
 summary_stats.head()
 ```

 %% Cell type:markdown id: tags:

 # Plotting of the Clusters in 2d and 3d Graphs

 %% Cell type:code id: tags:

 ``` 
 print(summary_stats)

 # Draws six views of the clustered jobs: three 2d scatter plots (top row) and three
 # 3d scatter plots (bottom row). Points are colored by k-means label and the cluster
 # centers are overlaid in black.
 # clusterpoints columns follow the column order of the clustered data:
 # 0 = ReqMemCPU, 1 = Elapsed, 2 = AllocCPUS.

 def _thin_black_grid(ax3d):
     """Give all three axes of a 3d subplot thin (0.5) black gridlines."""
     for axis in (ax3d.xaxis, ax3d.yaxis, ax3d.zaxis):
         # _axinfo is a private matplotlib attribute; kept because the original styling relies on it
         axis._axinfo["grid"].update({"linewidth":.5, "color" : "black"})


 figure = plt.figure()

 figure.set_size_inches(20,15)

 # ReqMem/Elapsed 2d Graph
 rqmem_elapsed_clustergraph = figure.add_subplot(2,3,1)

 rqmem_elapsed_clustergraph.scatter(df_clustering['ReqMemCPU'],df_clustering['Elapsed'],
                                    c=kmeans_cluster.labels_, cmap='rainbow')
 rqmem_elapsed_clustergraph.scatter(clusterpoints[:,0] ,clusterpoints[:,1], color='black')
 rqmem_elapsed_clustergraph.set_xlabel('ReqMemCPU(gigs)')
 rqmem_elapsed_clustergraph.set_ylabel('Elapsed(hours)')
 rqmem_elapsed_clustergraph.set_title('Runtime/Requested Gigs RAM')


 # Alloc/Elapsed 2d Graph
 alloc_elapsed_clustergraph = figure.add_subplot(2,3,2)
 alloc_elapsed_clustergraph.scatter(df_clustering['AllocCPUS'],df_clustering['Elapsed'],
                                    c=kmeans_cluster.labels_, cmap='rainbow')
 alloc_elapsed_clustergraph.scatter(clusterpoints[:,2] ,clusterpoints[:,1], color='black')
 alloc_elapsed_clustergraph.set_xlabel('AllocCPUS')
 alloc_elapsed_clustergraph.set_ylabel('Elapsed(hours)')
 alloc_elapsed_clustergraph.set_title('Runtime/Core')

 # ReqMem/Alloc 2d Graph
 rqmem_alloc_clustergraph = figure.add_subplot(2,3,3)
 rqmem_alloc_clustergraph.scatter(df_clustering['ReqMemCPU'],df_clustering['AllocCPUS'],
                                  c=kmeans_cluster.labels_, cmap='rainbow')
 rqmem_alloc_clustergraph.scatter(clusterpoints[:,0] ,clusterpoints[:,2], color='black')
 rqmem_alloc_clustergraph.set_xlabel('ReqMemCPU(gigs)')
 rqmem_alloc_clustergraph.set_ylabel('AllocCPUS')
 rqmem_alloc_clustergraph.set_title('Cores/Requested Gigs RAM')

 ########### 3d Graphs
 # In each 3d panel the centers are given all three coordinates, in the same
 # x/y/z order as the data; with only two coordinates they would sit at z = 0.

 # ReqMem/Alloc 3d Graph (x = ReqMemCPU, y = AllocCPUS, z = Elapsed)
 rqmem_alloc_clustergraph_3d = figure.add_subplot(2,3,4, projection='3d')
 rqmem_alloc_clustergraph_3d.scatter(df_clustering['ReqMemCPU'], df_clustering['AllocCPUS'], df_clustering['Elapsed'],
                                     c=kmeans_cluster.labels_ ,cmap='rainbow')
 rqmem_alloc_clustergraph_3d.scatter(clusterpoints[:,0], clusterpoints[:,2], clusterpoints[:,1], color='black')
 rqmem_alloc_clustergraph_3d.set_xlabel('ReqMemCPU(gigs)')
 rqmem_alloc_clustergraph_3d.set_ylabel('AllocCPUS')
 rqmem_alloc_clustergraph_3d.set_zlabel('Elapsed(hours)')

 # sets size and color for gridlines by axis
 _thin_black_grid(rqmem_alloc_clustergraph_3d)


 # Alloc/Elapsed 3d Graph (x = AllocCPUS, y = ReqMemCPU, z = Elapsed)
 alloc_elapsed_clustergraph_3d = figure.add_subplot(2,3,5, projection='3d')
 alloc_elapsed_clustergraph_3d.scatter(df_clustering['AllocCPUS'], df_clustering['ReqMemCPU'], df_clustering['Elapsed'],
                                       c=kmeans_cluster.labels_ ,cmap='rainbow')
 alloc_elapsed_clustergraph_3d.scatter(clusterpoints[:,2], clusterpoints[:,0], clusterpoints[:,1], color='black')
 alloc_elapsed_clustergraph_3d.set_xlabel('AllocCPUS')
 alloc_elapsed_clustergraph_3d.set_ylabel('ReqMemCPU(gigs)')
 alloc_elapsed_clustergraph_3d.set_zlabel('Elapsed(hours)')

 _thin_black_grid(alloc_elapsed_clustergraph_3d)



 # ReqMem/Elapsed 3d Graph (x = ReqMemCPU, y = Elapsed, z = AllocCPUS)
 rqmem_elapsed_clustergraph_3d = figure.add_subplot(2,3,6, projection='3d')
 rqmem_elapsed_clustergraph_3d.scatter(df_clustering['ReqMemCPU'], df_clustering['Elapsed'], df_clustering['AllocCPUS'],
                                       c=kmeans_cluster.labels_ ,cmap='rainbow')
 rqmem_elapsed_clustergraph_3d.scatter(clusterpoints[:,0], clusterpoints[:,1], clusterpoints[:,2], color='black')

 rqmem_elapsed_clustergraph_3d.set_xlabel('ReqMemCPU(gigs)')
 rqmem_elapsed_clustergraph_3d.set_ylabel('Elapsed(hours)')
 rqmem_elapsed_clustergraph_3d.set_zlabel('AllocCPUS')

 _thin_black_grid(rqmem_elapsed_clustergraph_3d)


 plt.show()
 ```

 %% Cell type:markdown id: tags:

 # Plotting of Each Cluster Group in 2d Histograms

 %% Cell type:code id: tags:

 ``` 
 fig = plt.figure()
 fig.set_size_inches(20,10)
 #fig.tight_layout()


 #####Blue
 ax = fig.add_subplot(431)
-rqmem_elapsed_blue_hist = ax.hist2d(df_1_2d1['ReqMemCPU'],df_1_2d1['Elapsed'],
+rqmem_elapsed_blue_hist = ax.hist2d(cluster_1['ReqMemCPU'],cluster_1['Elapsed'],
                                bins =[x_blue_rqmem_elapsed_bins, y_blue_rqmem_elapsed_bins],
                                      cmap = plt.cm.Blues)
 ax.set_xlabel('ReqMemCPU(gigs)')
 ax.set_ylabel('Elapsed(hours)')
 ax.set_title('Blue Cluster')
 ax.set_xlim(xaxis_min,xaxis_max)
 ax.set_ylim(yaxis_min,yaxis_max)


 ax2 = fig.add_subplot(432)
-alloc_elapsed_blue_hist = ax2.hist2d(df_1_2d2['AllocCPUS'],df_1_2d2['Elapsed'],
+alloc_elapsed_blue_hist = ax2.hist2d(cluster_1['AllocCPUS'],cluster_1['Elapsed'],
                                bins =[x_blue_alloc_elapsed_bins, y_blue_alloc_elapsed_bins],
                                      cmap = plt.cm.Blues)
 ax2.set_xlabel('AllocCPUS')
 ax2.set_ylabel('Elapsed(hours)')
 ax2.set_title('Blue Cluster')
 ax2.set_xlim(xaxis_min,xaxis_max)
 ax2.set_ylim(yaxis_min,yaxis_max)


 ax3 = fig.add_subplot(433)
-reqmem_alloc_blue_hist = ax3.hist2d(df_1_2d3['ReqMemCPU'],df_1_2d3['AllocCPUS'],
+reqmem_alloc_blue_hist = ax3.hist2d(cluster_1['ReqMemCPU'],cluster_1['AllocCPUS'],
                                bins =[x_blue_reqmem_alloc_bins, y_blue_reqmem_alloc_bins],
                                     cmap = plt.cm.Blues)
 ax3.set_xlabel('ReqMemCPU(gigs)')
 ax3.set_ylabel('AllocCPUS')
 ax3.set_title('Blue Cluster')
 ax3.set_xlim(xaxis_min,xaxis_max)
 ax3.set_ylim(yaxis_min,yaxis_max)



 ####Purple
 ax4 = fig.add_subplot(434) # 434 = 4x3 grid (rows x cols), subplot number 4. The last number increments row-wise, so this is the first panel of the second row.
-rqmem_elapsed_purple_hist = ax4.hist2d(df_0_2d1['ReqMemCPU'],df_0_2d1['Elapsed'],
+rqmem_elapsed_purple_hist = ax4.hist2d(cluster_0['ReqMemCPU'],cluster_0['Elapsed'],
                                bins =[x_purple_rqmem_elapsed_bins, y_purple_rqmem_elapsed_bins],
                                      cmap = plt.cm.Blues)
 ax4.set_xlabel('ReqMemCPU(gigs)')
 ax4.set_ylabel('Elapsed(hours)')
 ax4.set_title('Purple Cluster')
 ax4.set_xlim(xaxis_min,xaxis_max)
 ax4.set_ylim(yaxis_min,yaxis_max)



 ax5 = fig.add_subplot(435) # Second subplot
-alloc_elapsed_purple_hist = ax5.hist2d(df_0_2d2['AllocCPUS'],df_0_2d2['Elapsed'],
+alloc_elapsed_purple_hist = ax5.hist2d(cluster_0['AllocCPUS'],cluster_0['Elapsed'],
                                bins =[x_purple_alloc_elapsed_bins, y_purple_alloc_elapsed_bins],
                                       cmap = plt.cm.Blues)
 ax5.set_xlabel('AllocCPUS')
 ax5.set_ylabel('Elapsed(hours)')
 ax5.set_title('Purple Cluster')
 ax5.set_xlim(xaxis_min,xaxis_max)
 ax5.set_ylim(yaxis_min,yaxis_max)


 ax6 = fig.add_subplot(436)
-reqmem_alloc_purple_hist = ax6.hist2d(df_0_2d3['ReqMemCPU'],df_0_2d3['AllocCPUS'],
+reqmem_alloc_purple_hist = ax6.hist2d(cluster_0['ReqMemCPU'],cluster_0['AllocCPUS'],
                                bins =[x_purple_reqmem_alloc_bins, y_purple_reqmem_alloc_bins],
                                      cmap = plt.cm.Blues) # use magma or
 ax6.set_xlabel('ReqMemCPU(gigs)')
 ax6.set_ylabel('AllocCPUS')
 ax6.set_title('Purple Cluster')
 ax6.set_xlim(xaxis_min,xaxis_max)
 ax6.set_ylim(yaxis_min,yaxis_max)


 #####Red
 ax7 = fig.add_subplot(437)
-rqmem_elapsed_red_hist = ax7.hist2d(df_3_2d1['ReqMemCPU'],df_3_2d1['Elapsed'],
+rqmem_elapsed_red_hist = ax7.hist2d(cluster_3['ReqMemCPU'],cluster_3['Elapsed'],
                                bins =[x_red_rqmem_elapsed_bins, y_red_rqmem_elapsed_bins],
                                    cmap = plt.cm.Blues)
 ax7.set_xlabel('ReqMemCPU(gigs)')
 ax7.set_ylabel('Elapsed(hours)')
 ax7.set_title('Red Cluster')
 ax7.set_xlim(xaxis_min,xaxis_max)
 ax7.set_ylim(yaxis_min,yaxis_max)


 ax8 = fig.add_subplot(438)
-alloc_elapsed_red_hist = ax8.hist2d(df_3_2d2['AllocCPUS'],df_3_2d2['Elapsed'],
+alloc_elapsed_red_hist = ax8.hist2d(cluster_3['AllocCPUS'],cluster_3['Elapsed'],
                                bins =[x_red_reqmem_alloc_bins, y_red_reqmem_alloc_bins],
                                    cmap = plt.cm.Blues)
 ax8.set_xlabel('AllocCPUS')
 ax8.set_ylabel('Elapsed(hours)')
 ax8.set_title('Red Cluster')
 ax8.set_xlim(xaxis_min,xaxis_max)
 ax8.set_ylim(yaxis_min,yaxis_max)


 ax9 = fig.add_subplot(439)
-reqmem_alloc_red_hist = ax9.hist2d(df_3_2d3['ReqMemCPU'],df_3_2d3['AllocCPUS'],
+reqmem_alloc_red_hist = ax9.hist2d(cluster_3['ReqMemCPU'],cluster_3['AllocCPUS'],
                                bins =[x_red_reqmem_alloc_bins, y_red_reqmem_alloc_bins],
                                   cmap = plt.cm.Blues)
 ax9.set_xlabel('ReqMemCPU(gigs)')
 ax9.set_ylabel('AllocCPUS')
 ax9.set_title('Red Cluster')
 ax9.set_xlim(xaxis_min,xaxis_max)
 ax9.set_ylim(yaxis_min,yaxis_max)



 #####Yellow
 ax10 = fig.add_subplot(4,3,10)
-rqmem_elapsed_yellow_hist = ax10.hist2d(df_2_2d1['ReqMemCPU'],df_2_2d1['Elapsed'],
+rqmem_elapsed_yellow_hist = ax10.hist2d(cluster_2['ReqMemCPU'],cluster_2['Elapsed'],
                                bins =[x_yellow_rqmem_elapsed_bins, y_yellow_rqmem_elapsed_bins],
                                    cmap = plt.cm.Blues)
 ax10.set_xlabel('ReqMemCPU(gigs)')
 ax10.set_ylabel('Elapsed(hours)')
 ax10.set_title('Yellow Cluster')
 ax10.set_xlim(xaxis_min,xaxis_max)
 ax10.set_ylim(yaxis_min,yaxis_max)


 ax11 = fig.add_subplot(4,3,11)
-alloc_elapsed_yellow_hist = ax11.hist2d(df_2_2d2['AllocCPUS'],df_2_2d2['Elapsed'],
+alloc_elapsed_yellow_hist = ax11.hist2d(cluster_2['AllocCPUS'],cluster_2['Elapsed'],
                                bins =[x_yellow_reqmem_alloc_bins, y_yellow_reqmem_alloc_bins],
                                    cmap = plt.cm.Blues)
 ax11.set_xlabel('AllocCPUS')
 ax11.set_ylabel('Elapsed(hours)')
 ax11.set_title('Yellow Cluster')
 ax11.set_xlim(xaxis_min,xaxis_max)
 ax11.set_ylim(yaxis_min,yaxis_max)


 ax12 = fig.add_subplot(4,3,12)
-reqmem_alloc_yellow_hist = ax12.hist2d(df_2_2d3['ReqMemCPU'],df_2_2d3['AllocCPUS'],
+reqmem_alloc_yellow_hist = ax12.hist2d(cluster_2['ReqMemCPU'],cluster_2['AllocCPUS'],
                                bins =[x_yellow_reqmem_alloc_bins, y_yellow_reqmem_alloc_bins],
                                   cmap = plt.cm.Blues)
 ax12.set_xlabel('ReqMemCPU(gigs)')
 ax12.set_ylabel('AllocCPUS')
 ax12.set_title('Yellow Cluster')
 ax12.set_xlim(xaxis_min,xaxis_max)
 ax12.set_ylim(yaxis_min,yaxis_max)



 # sets the spacing between plots (all values are fractions of the figure size)
 # left = the left edge of the subplots of the figure
 # top = the top edge of the subplots of the figure; 3 stretches the grid to about three figure heights
 # wspace = padding on both sides of graphs
 # hspace = padding on top and bottom of graphs

 fig.subplots_adjust(left=0.0, wspace=0.2, top=3, hspace=.2)
 # The title belongs to this figure (fig), not to the scatter figure from the previous cell.
 # y sits just above top=3 so the title is drawn over the grid instead of across it.
 fig.suptitle('Clusters', fontsize=20, y=3.1)


 plt.show()
 ```

 %% Cell type:code id: tags:

 ``` 
+xaxis_min = 0
+xaxis_max = 10
+
+yaxis_min = 0
+yaxis_max = 5
+```
+
+%% Cell type:code id: tags:
+
+``` 
+# only the darkest spots
+
+# blue reqmem/elapsed density spot: reqmem = 16-17 ,elapsed = 0-1
+# blue alloc/elapsed density spot: alloc = 4-5 ,elapsed = 0-1
+# blue reqmem/alloc density spot: reqmem = 16-17 ,alloc = 4-5
+
+# purple reqmem/elapsed density spot: reqmem = 8-9 ,elapsed = 0-1
+# purple alloc/elapsed density spot: alloc = 1-2 ,elapsed = 0-1
+# purple reqmem/alloc density spot: reqmem = 8-9 ,alloc = 1-2
+
+# red reqmem/elapsed density spot: reqmem = ,elapsed =
+# red alloc/elapsed density spot: alloc = ,elapsed =
+# red reqmem/alloc density spot: reqmem = ,alloc =
+
+# yellow reqmem/elapsed density spot: reqmem = ,elapsed =
+# yellow alloc/elapsed density spot: alloc = ,elapsed =
+# yellow reqmem/alloc density spot: reqmem = 0-10, alloc = 1-2
+
 ```