diff --git a/Runtime-and-CoreCount.ipynb b/Runtime-and-CoreCount.ipynb index 3587c830b3f3103e28db38dd73fb6a119c93f725..0def6d6c9363dcaa2f603ee799bb5da4e3821e7f 100644 --- a/Runtime-and-CoreCount.ipynb +++ b/Runtime-and-CoreCount.ipynb @@ -128,7 +128,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ReqMemCPU,Corecount,Runtime Clustering" + "# ReqMemCPU,Corecount,Runtime FacetGrid" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next 4 cells set up the df_1 dataset, which will be the base dataset used for the facet grid." ] }, { @@ -174,20 +181,18 @@ "metadata": {}, "outputs": [], "source": [ + "# must run\n", + "\n", + "# makes ReqMemCPU column whole numbers rather than floats for easy readability in graphs\n", "df_1.ReqMemCPU = df_1.ReqMemCPU.apply(int)\n", "df_1.head(5)" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "print(\"ReqMemCPU\",df_1.ReqMemCPU.max())\n", - "print(\"AllocCPUS\",df_1.AllocCPUS.max())\n", - "\n", - "print(\"Elapsed\",df_1.Elapsed.max())" + "The next 3 cells set the min and max parameters for ReqMemCPU, AllocCPUS, and Elapsed. These parameters are used in creating the facet grid and are the parameters for all the cluster graphs." ] }, { @@ -229,6 +234,13 @@ "UpperlimitElapsed = 150.02" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "df_facet is a dataset created from df_1 using the parameters above. It will be the dataset that all the cluster graphs will be made from." 
+ ] + }, { "cell_type": "code", "execution_count": null, @@ -256,7 +268,7 @@ "source": [ "# must run\n", "\n", - "# creates a facet grid from df_runtime dataset\n", + "# creates a facet grid from df_1 dataset\n", "# Elapsed time in hours and ReqMemCPU in gigs\n", "style.default_axes_and_ticks()\n", "style.figsize()\n", @@ -264,7 +276,6 @@ "full_facet = sns.pairplot(df_facet, diag_kind = 'kde') # makes density plots - kernel density estimate\n", "# y axis is count in the diagonal graphs\n", "\n", - "#full_facet = sb.PairGrid(df_facet)\n", "full_facet.map(plt.scatter);\n", "plt.show()" ] @@ -282,9 +293,9 @@ "metadata": {}, "outputs": [], "source": [ - "# must run\n", + "# voluntary\n", "\n", - "# facet grid of the two graphs being clustered using df_runtime_cluster dataset\n", + "# pair grid of the two graphs being clustered using df_facet dataset\n", "style.default_axes_and_ticks()\n", "style.figsize()\n", "\n", @@ -292,12 +303,20 @@ "elapsed_reqmem_alloc.map(sns.regplot, color=\"blue\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "df_runtime_cluster is a dataset made from df_facet. It is used to make the elbow graph and calculate the clustering for Elapsed/ReqMemCPU and Elapsed/AllocCPUS" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ + "#must run\n", "\n", "#ReqMemCPU = 0 - 50 gigs\n", "#AllocCPUS = 0 - 50 cores\n", @@ -347,6 +366,14 @@ "# Elapsed/ReqMemCPU clustering" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next 5 cells create the clusters, find each cluster label, and create datasets of data in each cluster.\n", + "All the datasets are created for both the cluster graphs and plots of each cluster before those graphs are made." 
+ ] + }, { "cell_type": "code", "execution_count": null, @@ -367,6 +394,13 @@ "metadata": {}, "outputs": [], "source": [ + "# must run\n", + "\n", + "# returns array of labels for each cluster - used to find min and max x and y points for each cluster\n", + "\n", + "# 0 = purple cluster\n", + "# 1 = green cluster\n", + "# 2 = red cluster\n", "np.unique(kmeans_elapsed_reqmem.labels_)" ] }, @@ -378,7 +412,7 @@ "source": [ "# must run\n", "\n", - "# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the min and max parameters created above\n", + "# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the parameters in the labels shown above\n", "\n", "#Purple\n", "df_elapsed_reqmem_0 = df_runtime_cluster[kmeans_elapsed_reqmem.labels_ == 0]\n", @@ -399,6 +433,10 @@ "metadata": {}, "outputs": [], "source": [ + "# voluntary\n", + "\n", + "# returns the min and max ReqMemCPU, Elapsed, and AllocCPUS for each cluster using the datasets created above. \n", + "# These are the parameters for the scatter plots of each cluster\n", "print(\"Purple Cluster\")\n", "print(\"ReqMemCPU:\", \"min =\",df_elapsed_reqmem_0.ReqMemCPU.min(),\" \",\"max =\",df_elapsed_reqmem_0.ReqMemCPU.max())\n", "print(\"Elapsed:\", \"min =\",df_elapsed_reqmem_0.Elapsed.min(),\" \",\"max =\",df_elapsed_reqmem_0.Elapsed.max())\n", @@ -421,10 +459,19 @@ "metadata": {}, "outputs": [], "source": [ + "# must run\n", + "\n", + "# Creates datasets used to make the swarmplots that correspond to each cluster scatter plot. 
\n", + "# The groupby does not change the data, but it does make a small enough dataset to keep from having a \n", + "#runtime error, as will happen if a swarmplot is made using the scatter plot datasets.\n", + "\n", + "# for purple cluster \n", "df_elapsed_reqmem_swarmplot0 = df_elapsed_reqmem_0.groupby(['ReqMemCPU','Elapsed']).sum().reset_index()\n", "\n", + "# for green cluster\n", "df_elapsed_reqmem_swarmplot1 = df_elapsed_reqmem_1.groupby(['ReqMemCPU','Elapsed']).sum().reset_index()\n", "\n", + "# for red cluster\n", "df_elapsed_reqmem_swarmplot2 = df_elapsed_reqmem_2.groupby(['ReqMemCPU','Elapsed']).sum().reset_index()" ] }, @@ -434,6 +481,9 @@ "metadata": {}, "outputs": [], "source": [ + "# must run\n", + "\n", + "# scatterplot of Runtime per Requested gigs of RAM using df_runtime_cluster dataset with clustering\n", "figure = plt.figure(figsize=(14, 8))\n", "figure.suptitle('Runtime per Requested gigs of RAM %i gigs or less'%UpperlimitGB)\n", "\n", @@ -443,6 +493,7 @@ "plt.xlabel('ReqMemCPU(gigs)')\n", "plt.ylabel('Elapsed(hours)')\n", "\n", + "# 3d view of the scatterplot for better understanding of the data\n", "elapsed_rqmem_clustergraph_3d = figure.add_subplot(122, projection='3d')\n", "elapsed_rqmem_clustergraph_3d.scatter(df_runtime_cluster['ReqMemCPU'], df_runtime_cluster['Elapsed'], df_runtime_cluster['AllocCPUS'], c=kmeans_elapsed_reqmem.labels_ ,cmap='rainbow')\n", "elapsed_rqmem_clustergraph_3d.scatter(kmeans_elapsed_reqmem.cluster_centers_[:,0] ,kmeans_elapsed_reqmem.cluster_centers_[:,1], color='black')\n", @@ -453,12 +504,22 @@ "plt.show()\n" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This graph is a facet grid that shows scatterplots by cluster color on the left, and its corresponding swarmplot on the right. The swarmplots give a better understanding of the distribution of jobs matching a specific datapoint." 
+ ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ + "# must run\n", + "\n", + "# sets the figure and size that each subplot is added to - each graph is a subplot\n", "figure = plt.figure( figsize=(16, 16))\n", "\n", "#purple cluster and swarmplot\n", @@ -502,9 +563,15 @@ "plt.xlabel('ReqMemCPU(gigs)')\n", "plt.ylabel('Elapsed(hours)')\n", "\n", + "\n", + "# sets the spacing\n", + "# top = space between title and graphs - increase number to bring title down and decrease to bring title up\n", + "# left = space to the left\n", + "# wspace = padding on both sides of graphs\n", + "# hspace = padding on top and bottom of graphs\n", "figure.subplots_adjust(left=0.2, wspace=0.2, top=1.2, hspace=0.3)\n", "\n", - "figure.suptitle('This is a somewhat long figure title', fontsize=20)\n", + "figure.suptitle('Clusters from Runtime per Requested gigs of RAM %i gigs or less'%UpperlimitGB, fontsize=20)\n", "\n", "plt.show()\n" ] @@ -516,6 +583,14 @@ "# Elapsed/AllocCPUS clustering" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next 5 cells create the clusters, find each cluster label, and create datasets of data in each cluster.\n", + "All the datasets are created for both the cluster graphs and plots of each cluster before those graphs are made." 
+ ] + }, { "cell_type": "code", "execution_count": null, @@ -536,6 +611,13 @@ "metadata": {}, "outputs": [], "source": [ + "# must run\n", + "\n", + "# returns array of labels for each cluster - used to find min and max x and y points for each cluster\n", + "\n", + "# 0 = purple cluster\n", + "# 1 = green cluster\n", + "# 2 = red cluster\n", "np.unique(kmeans_elapsed_alloc.labels_)" ] }, @@ -547,7 +629,7 @@ "source": [ "# must run\n", "\n", - "# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the min and max parameters created above\n", + "# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the parameters in the labels shown above\n", "\n", "#Purple\n", "df_elapsed_alloc_0 = df_runtime_cluster[kmeans_elapsed_alloc.labels_ == 0]\n", @@ -568,6 +650,10 @@ "metadata": {}, "outputs": [], "source": [ + "# voluntary\n", + "\n", + "# returns the min and max ReqMemCPU, Elapsed, and AllocCPUS for each cluster using the datasets created above. \n", + "# These are the parameters for the scatter plots of each cluster\n", "print(\"Purple Cluster\")\n", "print(\"ReqMemCPU:\", \"min =\",df_elapsed_alloc_0.ReqMemCPU.min(),\" \",\"max =\",df_elapsed_alloc_0.ReqMemCPU.max())\n", "print(\"Elapsed:\", \"min =\",df_elapsed_alloc_0.Elapsed.min(),\" \",\"max =\",df_elapsed_alloc_0.Elapsed.max())\n", @@ -590,8 +676,33 @@ "metadata": {}, "outputs": [], "source": [ + "# must run\n", + "\n", + "# Creates datasets used to make the swarmplots that correspond to each cluster scatter plot. 
\n", + "# The groupby does not change the data, but it does make a small enough dataset to keep from having a \n", + "#runtime error, as will happen if a swarmplot is made using the scatter plot datasets.\n", + "\n", + "# for purple cluster \n", + "df_elapsed_alloc_swarmplot0 = df_elapsed_alloc_0.groupby(['AllocCPUS','Elapsed']).sum().reset_index()\n", + "\n", + "# for green cluster \n", + "df_elapsed_alloc_swarmplot1 = df_elapsed_alloc_1.groupby(['AllocCPUS','Elapsed']).sum().reset_index()\n", + "\n", + "# for red cluster \n", + "df_elapsed_alloc_swarmplot2 = df_elapsed_alloc_2.groupby(['AllocCPUS','Elapsed']).sum().reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# scatterplot of Runtime per Core using df_runtime_cluster dataset with clustering\n", "figure = plt.figure(figsize=(14, 8))\n", - "figure.suptitle('Runtime per Requested gigs of RAM %i gigs or less'%UpperlimitGB)\n", + "figure.suptitle('Runtime per Core %i cores or less'%UpperlimitAllocCPU)\n", "\n", "elapsed_alloc_clustergraph = figure.add_subplot(121)\n", "elapsed_alloc_clustergraph.scatter(df_runtime_cluster['AllocCPUS'],df_runtime_cluster['Elapsed'], c=kmeans_elapsed_alloc.labels_, cmap='rainbow')\n", @@ -599,6 +710,7 @@ "plt.xlabel('AllocCPUS')\n", "plt.ylabel('Elapsed(hours)')\n", "\n", + "# 3d view of the scatterplot for better understanding of the data\n", "elapsed_alloc_clustergraph_3d = figure.add_subplot(122, projection='3d')\n", "elapsed_alloc_clustergraph_3d.scatter(df_runtime_cluster['AllocCPUS'], df_runtime_cluster['Elapsed'], df_runtime_cluster['ReqMemCPU'], c=kmeans_elapsed_alloc.labels_ ,cmap='rainbow')\n", "elapsed_alloc_clustergraph_3d.scatter(kmeans_elapsed_alloc.cluster_centers_[:,0] ,kmeans_elapsed_alloc.cluster_centers_[:,1], color='black')\n", @@ -615,17 +727,9 @@ "metadata": {}, "outputs": [], "source": [ - "df_elapsed_alloc_swarmplot0 = 
df_elapsed_alloc_0.groupby(['AllocCPUS','Elapsed']).sum().reset_index()\n", - "df_elapsed_alloc_swarmplot1 = df_elapsed_alloc_1.groupby(['AllocCPUS','Elapsed']).sum().reset_index()\n", - "df_elapsed_alloc_swarmplot2 = df_elapsed_alloc_2.groupby(['AllocCPUS','Elapsed']).sum().reset_index()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + "# must run\n", + "\n", + "# sets the figure and size that each subplot is added to - each graph is a subplot\n", "figure = plt.figure( figsize=(21, 16))\n", "\n", "#purple cluster and swarmplot\n", @@ -669,8 +773,13 @@ "plt.xlabel('AllocCPUS')\n", "plt.ylabel('Elapsed(hours)')\n", "\n", + "# sets the spacing\n", + "# top = space between title and graphs - increase number to bring title down and decrease to bring title up\n", + "# left = space to the left\n", + "# wspace = padding on both sides of graphs\n", + "# hspace = padding on top and bottom of graphs\n", "figure.subplots_adjust(left=0.2, wspace=0.2, top=.94, hspace=0.3)\n", - "figure.suptitle('This is a somewhat long figure title', fontsize=20)\n", + "figure.suptitle('Clusters from Runtime per Core %i cores or less'%UpperlimitAllocCPU, fontsize=20)\n", "\n", "\n", "plt.show()" @@ -691,7 +800,7 @@ "source": [ "# must run\n", "\n", - "# facet grid of the two graphs being clustered using df_runtime_cluster dataset\n", + "# scatterplot of AllocCPUS/ReqMemCPU using df_facet dataset\n", "style.default_axes_and_ticks()\n", "style.figsize()\n", "\n", @@ -699,10 +808,17 @@ "\n", "plt.xlabel('ReqMemCPU(gigs)')\n", "plt.ylabel('AllocCPUS')\n", - "#plt.title('Number of Cores used by Requested RAM %i gigs or less'%UpperlimitGB_core)\n", + "plt.title('Number of Cores used by Requested RAM %i gigs or less'%UpperlimitGB)\n", "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "df_alloc_cluster is a dataset made from df_facet. 
It is used to make the elbow graph and calculate the clustering for AllocCPUS/ReqMemCPU." + ] + }, { "cell_type": "code", "execution_count": null, @@ -726,7 +842,7 @@ "source": [ "# must run\n", "\n", - "# sets up info for plotting the optimal number of clusters - uses df_runtime_cluster datasaet\n", + "# sets up info for plotting the optimal number of clusters - uses df_alloc_cluster dataset\n", "Sum_of_squared_distances = []\n", "K = range(1,10)\n", "for k in K:\n", @@ -743,7 +859,7 @@ "source": [ "# must run\n", "\n", - "# the bend in the graph is the optimal number of clusters for graphs using the df_runtime_cluster dataset\n", + "# the bend in the graph is the optimal number of clusters for graphs using the df_alloc_cluster dataset\n", "plt.plot(K, Sum_of_squared_distances, 'bx-')\n", "plt.xlabel('k')\n", "plt.ylabel('Sum_of_squared_distances')\n", @@ -751,6 +867,14 @@ "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next 5 cells create the clusters, find each cluster label, and create datasets of data in each cluster.\n", + "All the datasets are created for both the cluster graphs and plots of each cluster before those graphs are made." 
+ ] + }, { "cell_type": "code", "execution_count": null, @@ -771,6 +895,13 @@ "metadata": {}, "outputs": [], "source": [ + "# must run\n", + "\n", + "# returns array of labels for each cluster - used to find min and max x and y points for each cluster\n", + "\n", + "# 0 = purple cluster\n", + "# 1 = green cluster\n", + "# 2 = red cluster\n", "np.unique(kmeans_elapsed_alloc.labels_)" ] }, @@ -782,7 +913,7 @@ "source": [ "# must run\n", "\n", - "# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the min and max parameters created above\n", + "# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the parameters in the labels shown above\n", "\n", "#Purple\n", "df_alloc_reqmem_0 = df_alloc_cluster[kmeans_alloc_reqmem.labels_ == 0]\n", @@ -803,6 +934,10 @@ "metadata": {}, "outputs": [], "source": [ + "# voluntary\n", + "\n", + "# returns the min and max ReqMemCPU, Elapsed, and AllocCPUS for each cluster using the datasets created above. \n", + "# These are the parameters for the scatter plots of each cluster\n", "print(\"Purple Cluster\")\n", "print(\"ReqMemCPU:\", \"min =\",df_alloc_reqmem_0.ReqMemCPU.min(),\" \",\"max =\",df_alloc_reqmem_0.ReqMemCPU.max())\n", "print(\"Elapsed:\", \"min =\",df_alloc_reqmem_0.Elapsed.min(),\" \",\"max =\",df_alloc_reqmem_0.Elapsed.max())\n", @@ -825,8 +960,19 @@ "metadata": {}, "outputs": [], "source": [ + "# must run\n", + "\n", + "# Creates datasets used to make the swarmplots that correspond to each cluster scatter plot. 
\n", + "# The groupby does not change the data, but it does make a small enough dataset to keep from having a \n", + "#runtime error, as will happen if a swarmplot is made using the scatter plot datasets.\n", + "\n", + "# for purple cluster \n", "df_alloc_reqmem_swarmplot0 = df_alloc_reqmem_0.groupby(['AllocCPUS','ReqMemCPU']).sum().reset_index()\n", + "\n", + "# for green cluster \n", "df_alloc_reqmem_swarmplot1 = df_alloc_reqmem_1.groupby(['AllocCPUS','ReqMemCPU']).sum().reset_index()\n", + "\n", + "# for red cluster \n", "df_alloc_reqmem_swarmplot2 = df_alloc_reqmem_2.groupby(['AllocCPUS','ReqMemCPU']).sum().reset_index()" ] }, @@ -836,8 +982,11 @@ "metadata": {}, "outputs": [], "source": [ + "# must run\n", + "\n", + "# scatterplot of Core per Requested RAM using df_alloc_cluster dataset with clustering\n", "figure = plt.figure(figsize=(14, 8))\n", - "figure.suptitle('Runtime per Requested gigs of RAM %i gigs or less'%UpperlimitGB)\n", + "figure.suptitle('Number of Cores used by Requested RAM %i gigs or less'%UpperlimitGB)\n", "\n", "alloc_reqmem_cluster_graph = figure.add_subplot(121)\n", "alloc_reqmem_cluster_graph.scatter(df_alloc_cluster['ReqMemCPU'],df_alloc_cluster['AllocCPUS'], c=kmeans_alloc_reqmem.labels_, cmap='rainbow')\n", @@ -845,7 +994,7 @@ "plt.xlabel('ReqMemCPU(gigs)')\n", "plt.ylabel('AllocCPUS')\n", "\n", - "\n", + "# 3d view of the scatterplot for better understanding of the data\n", "alloc_reqmem_clustergraph_3d = figure.add_subplot(122, projection='3d')\n", "alloc_reqmem_clustergraph_3d.scatter(df_alloc_cluster['ReqMemCPU'], df_alloc_cluster['AllocCPUS'], df_alloc_cluster['Elapsed'], c=kmeans_alloc_reqmem.labels_ ,cmap='rainbow')\n", "alloc_reqmem_clustergraph_3d.scatter(kmeans_alloc_reqmem.cluster_centers_[:,0] ,kmeans_alloc_reqmem.cluster_centers_[:,1], color='black')\n", @@ -862,7 +1011,10 @@ "metadata": {}, "outputs": [], "source": [ - "figure = plt.figure( figsize=(21, 16))\n", + "# must run\n", + "\n", + "# sets the figure and size 
that each subplot is added to - each graph is a subplot\n", + "figure = plt.figure(figsize=(21, 16))\n", "\n", "\n", "#purple cluster and swarmplot\n", @@ -906,8 +1058,13 @@ "plt.xlabel('ReqMemCPU(gigs)')\n", "plt.ylabel('AllocCPUS')\n", "\n", + "# sets the spacing\n", + "# top = space between title and graphs - increase number to bring title down and decrease to bring title up\n", + "# left = space to the left\n", + "# wspace = padding on both sides of graphs\n", + "# hspace = padding on top and bottom of graphs\n", "figure.subplots_adjust(left=0.2, wspace=0.2, top=.94, hspace=0.3)\n", - "figure.suptitle('This is a somewhat long figure title', fontsize=20)\n", + "figure.suptitle('Clusters from Number of Cores used by Requested RAM %i gigs or less'%UpperlimitGB, fontsize=20)\n", "\n", "\n", "\n",