diff --git a/Runtime-and-CoreCount.ipynb b/Runtime-and-CoreCount.ipynb index 7c375952737f8f81ce861df369abe641ba50d57a..5c421e1f776fbf5ccf1ccfde6e34669c8dc1e48b 100644 --- a/Runtime-and-CoreCount.ipynb +++ b/Runtime-and-CoreCount.ipynb @@ -7,6 +7,16 @@ "# Notebook Setup" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# year-date-month\n", + "#start_date = '2020-10-09'" + ] + }, { "cell_type": "code", "execution_count": null, @@ -24,7 +34,9 @@ "import seaborn as sb\n", "import plotly.express as px\n", "import matplotlib.ticker as ticker\n", - "import numpy as np" + "import numpy as np\n", + "from mpl_toolkits.mplot3d import Axes3D\n", + "import os" ] }, { @@ -49,6 +61,17 @@ "from sklearn.cluster import KMeans" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#connecting to database\n", + "#db = sqlite3.connect('runtime_and_core_count.db')\n", + "#print(db)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -58,7 +81,29 @@ "# must run\n", "\n", "# creates database of info from March 2020 using sqlite 3\n", - "db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')" + "db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')\n", + "#print(db)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#slurm2sql.slurm2sql(db, ['-S 2020-09-08 -E 2020-09-15 -a --allocations -o Job,Submit,Start,End'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " #creating a database based on the start date\n", + "#slurm2sql.slurm2sql(db, ['-S', '2020-01-09', '-a'])\n", + "#print(db)\n", + "#print(start_date)" ] }, { @@ -70,7 +115,21 @@ "# must run\n", "\n", "# df is starting database\n", - "df = pd.read_sql('SELECT * FROM slurm', db)" + "df = pd.read_sql('SELECT * FROM slurm', db)\n", + "#df = 
pd.read_sql('SELECT JobID,Submit,Start,End FROM slurm', db)\n", + "print(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " #Deleting the database\n", + "#os.remove('runtime_and_core_count.db')\n", + "#os.remove('runtime_and_core_count.db-shm')\n", + "#os.remove('runtime_and_core_count.db-wal') " ] }, { @@ -83,7 +142,7 @@ "\n", "# for displaying all available column options\n", "pd.set_option('display.max_columns', None)\n", - "df.head(5)" + "df.count()" ] }, { @@ -127,7 +186,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ReqMemCPU,Corecount,Runtime Clustering" + "# ReqMemCPU,Corecount,Runtime FacetGrid" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next 4 cells set up the df_1 dataset, which will be the base dataset used for the facet grid." ] }, { @@ -175,9 +241,16 @@ "source": [ "# must run\n", "\n", - "# sorts dataset by AllocCPUS for easy visualization\n", - "df_1_sorted = df_1.sort_values(by='AllocCPUS', ascending=True)\n", - "df_1_sorted.head(5)" + "# makes ReqMemCPU column whole numbers rather than floats for easy readability in graphs\n", + "df_1.ReqMemCPU = df_1.ReqMemCPU.apply(int)\n", + "df_1.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next 3 cells set the min and max parameters for ReqMemCPU, AllocCPUS, and Elapsed. These parameters are used in creating the facet grid and are the parameters for all the cluster graphs." 
] }, { @@ -189,8 +262,8 @@ "# must run\n", "\n", "# sets min and max parameters for ReqMemCPU\n", - "UpperlimitGB = 50\n", - "LowerlimitGB = 0" + "LowerlimitGB = 0\n", + "UpperlimitGB = 50" ] }, { @@ -202,8 +275,28 @@ "# must run\n", "\n", "# sets min and max parameters for AllocCPUS\n", - "UpperlimitAllocCPU = 20\n", - "LowerlimitAllocCPU = 0" + "LowerlimitAllocCPU = 0\n", + "UpperlimitAllocCPU = 50" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# sets min and max parameters for Elapsed\n", + "LowerlimitElapsed = 0\n", + "UpperlimitElapsed = 150.02" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "df_facet is a dataset created from df_1 using the parameters above. It will be the dataset that all the cluster graphs will be made from." ] }, { @@ -215,7 +308,13 @@ "# must run\n", "\n", "# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the min and max parameters created above\n", - "df_facet = df_1_sorted[(df_1_sorted['ReqMemCPU'] <= UpperlimitGB) & (df_1_sorted['ReqMemCPU'] >= LowerlimitGB) & (df_1_sorted['AllocCPUS'] <= UpperlimitAllocCPU) & (df_1_sorted['AllocCPUS'] >= LowerlimitAllocCPU)]\n", + "df_facet = df_1[(df_1['ReqMemCPU'] <= UpperlimitGB) & \n", + " (df_1['ReqMemCPU'] >= LowerlimitGB) & \n", + " (df_1['AllocCPUS'] <= UpperlimitAllocCPU) & \n", + " (df_1['AllocCPUS'] >= LowerlimitAllocCPU)\n", + " & \n", + " (df_1['Elapsed'] <= UpperlimitElapsed) & \n", + " (df_1['Elapsed'] >= LowerlimitElapsed)]\n", "df_facet.head(5)" ] }, @@ -227,12 +326,14 @@ "source": [ "# must run\n", "\n", - "# creates a facet grid from df_runtime dataset\n", + "# creates a facet grid from df_1 dataset\n", "# Elapsed time in hours and ReqMemCPU in gigs\n", "style.default_axes_and_ticks()\n", "style.figsize()\n", "\n", - "full_facet = sb.PairGrid(df_facet)\n", + "full_facet = sns.pairplot(df_facet, diag_kind = 'kde') # makes density plots - kernel density 
estimate\n", + "# y axis is count in the diagonal graphs\n", + "\n", "full_facet.map(plt.scatter);\n", "plt.show()" ] @@ -250,11 +351,21 @@ "metadata": {}, "outputs": [], "source": [ - "# must run\n", + "# voluntary\n", "\n", - "# sets min and max parameters for ReqMemCPU for clustered Elapsed Time Graphs\n", - "UpperlimitGB_elapsed = 50\n", - "LowerlimitGB_elapsed = 0" + "# pair grid of the two graphs being clustered using df_facet dataset\n", + "style.default_axes_and_ticks()\n", + "style.figsize()\n", + "\n", + "elapsed_reqmem_alloc = sns.PairGrid(df_facet, y_vars=[\"Elapsed\"], x_vars=[\"ReqMemCPU\", \"AllocCPUS\"], height=4)\n", + "elapsed_reqmem_alloc.map(sns.regplot, color=\"blue\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "df_runtime_cluster is a dataset made from df_facet. It is used to make the elbow graph and calculate the clustering for Elapsed/ReqMemCPU and Elapsed/AllocCPUS" ] }, { @@ -263,11 +374,15 @@ "metadata": {}, "outputs": [], "source": [ - "# must run\n", + "#must run if dataset will not be normalized for both Elapsed/ReqMem and Elapsed/Alloc graphs\n", + "\n", + "#ReqMemCPU = 0 - 50 gigs\n", + "#AllocCPUS = 0 - 50 cores\n", + "#Elapsed = 0 - 150.02 hours\n", "\n", - "# sets min and max parameters for AllocCPUS for clustered Elapsed Time Graphs\n", - "UpperlimitAllocCPU_elapsed = 20\n", - "LowerlimitAllocCPU_elapsed = 0" + "# data set without normalization fitting for both the Elapsed/ReqMem and Elapsed/Alloc graphs\n", + "df_runtime_cluster = df_facet.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']]\n", + "df_runtime_cluster.head(5)" ] }, { @@ -276,11 +391,30 @@ "metadata": {}, "outputs": [], "source": [ - "# must run\n", + "# must run if dataset will be 0-1 normalized for both Elapsed/ReqMem and Elapsed/Alloc graphs\n", "\n", - "# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the min and max parameters created above\n", - "df_runtime_cluster = df_1_sorted[(df_1_sorted['ReqMemCPU'] <= 
UpperlimitGB_elapsed) & (df_1_sorted['ReqMemCPU'] >= LowerlimitGB_elapsed) & (df_1_sorted['AllocCPUS'] <= UpperlimitAllocCPU_elapsed) & (df_1_sorted['AllocCPUS'] >= LowerlimitAllocCPU_elapsed)]\n", - "df_runtime_cluster.head(5)" + "# 0-1 normalized dataset\n", + "# used for 0-1 normalization fitting for both the Elapsed/ReqMem and Elapsed/Alloc graphs \n", + "column_maxes_runtime = df_runtime_cluster.max()\n", + "df_runtime_cluster_max = column_maxes_runtime.max()\n", + "normalized_runtime_df = df_runtime_cluster / df_runtime_cluster_max\n", + "\n", + "print(normalized_runtime_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run if dataset will be log10 normalized for both Elapsed/ReqMem and Elapsed/Alloc graphs\n", + "\n", + "# log10 normalized dataset\n", + "# used for log10 normalization fitting for both the Elapsed/ReqMem and Elapsed/Alloc graphs \n", + "\n", + "log_runtime_df = np.log10(df_runtime_cluster+1)\n", + "log_runtime_df.describe()" ] }, { @@ -316,6 +450,44 @@ "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Elapsed/ReqMemCPU clustering" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next 5 cells create the clusters, find each cluster label, and create datasets of data in each cluster.\n", + "All the datasets are created for both the cluster graphs and plots of each cluster before those graphs are made." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# In the cell below, set the fit based on the normalization type by uncommenting the line to run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# uncomment for no normalization\n", + "#elapsed_reqmem_fit = df_runtime_cluster\n", + "\n", + "# uncomment for 0-1 normalization\n", + "#elapsed_reqmem_fit = normalized_runtime_df\n", + "\n", + "# uncomment for log10 normalization\n", + "elapsed_reqmem_fit = log_runtime_df" + ] + }, { "cell_type": "code", "execution_count": null, @@ -325,9 +497,15 @@ "# must run\n", "\n", "# sets to clusters and returns the cluster points\n", - "kmeans = KMeans(n_clusters=3, random_state=111)\n", - "kmeans.fit(df_runtime_cluster)\n", - "print(kmeans.cluster_centers_)" + "kmeans_elapsed_reqmem = KMeans(n_clusters=3, random_state=111)\n", + "kmeans_elapsed_reqmem.fit(elapsed_reqmem_fit)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# In the cell below, choose which cluster center to use - uncomment the line that goes with the normalization type" ] }, { @@ -336,14 +514,14 @@ "metadata": {}, "outputs": [], "source": [ - "# must run\n", + "# uncomment if no normalization\n", + "#clusterpoints_elapsed_reqmem = kmeans_elapsed_reqmem.cluster_centers_\n", "\n", - "# facet grid of the two graphs being clustered using df_runtime_cluster dataset\n", - "style.default_axes_and_ticks()\n", - "style.figsize()\n", + "# uncomment if 0-1 normalization\n", + "#clusterpoints_elapsed_reqmem = kmeans_elapsed_reqmem.cluster_centers_ * df_runtime_cluster_max\n", "\n", - "elapsed_reqmem_alloc = sns.PairGrid(df_runtime_cluster, y_vars=[\"Elapsed\"], x_vars=[\"ReqMemCPU\", \"AllocCPUS\"], height=4)\n", - "elapsed_reqmem_alloc.map(sns.regplot, color=\"blue\")" + "# uncomment if log10 normalization\n", + "clusterpoints_elapsed_reqmem = 10 ** (kmeans_elapsed_reqmem.cluster_centers_) - 1" ] }, { @@ 
-354,17 +532,12 @@ "source": [ "# must run\n", "\n", - "# clustered graph\n", - "style.default_axes_and_ticks()\n", - "style.figsize()\n", - "\n", - "elapsed_runtime_cluster_graph = plt.scatter(df_runtime_cluster['ReqMemCPU'],df_runtime_cluster['Elapsed'], c=kmeans.labels_, cmap='rainbow')\n", - "plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='black')\n", + "# returns array of labels for each cluster - used to find min and max x and y points for each cluster\n", "\n", - "plt.xlabel('ReqMemCPU(gigs)')\n", - "plt.ylabel('Elapsed(hours)')\n", - "plt.title('Runtime per Requested gigs of RAM %i gigs or less'%UpperlimitGB_elapsed)\n", - "plt.show()" + "# 0 = purple cluster\n", + "# 1 = green cluster\n", + "# 2 = red cluster\n", + "np.unique(kmeans_elapsed_reqmem.labels_)" ] }, { @@ -375,24 +548,45 @@ "source": [ "# must run\n", "\n", - "# clustered graph\n", - "style.default_axes_and_ticks()\n", - "style.figsize()\n", + "# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the parameters in the labels shown above\n", "\n", - "elapsed_alloc_cluster_graph = plt.scatter(df_runtime_cluster['AllocCPUS'],df_runtime_cluster['Elapsed'], c=kmeans.labels_, cmap='rainbow')\n", - "plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='black')\n", + "#Purple\n", + "df_elapsed_reqmem_0 = df_runtime_cluster[kmeans_elapsed_reqmem.labels_ == 0]\n", "\n", - "plt.xlabel('AllocCPUS')\n", - "plt.ylabel('Elapsed(hours)')\n", - "plt.title('Runtime per Core %i cores or less'%UpperlimitAllocCPU_elapsed)\n", - "plt.show()" + "#Green\n", + "df_elapsed_reqmem_1 = df_runtime_cluster[kmeans_elapsed_reqmem.labels_ == 1]\n", + "\n", + "#Red\n", + "df_elapsed_reqmem_2 = df_runtime_cluster[kmeans_elapsed_reqmem.labels_ == 2]\n", + "\n", + "#df_elapsed_reqmem_0.head(5)\n", + "#df_elapsed_reqmem_0.ReqMemCPU.count()" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - 
"# Detailed Look at Elapsed Time - In terms of Requested RAM and Cores" + "# voluntary\n", + "\n", + "# returns the min and max ReqMemCPU, Elapsed, and AllocCPUS for each cluster using the datasets created above. \n", + "# These are the parameters for the scatter plots of each cluster\n", + "print(\"Purple Cluster\")\n", + "print(\"ReqMemCPU:\", \"min =\",df_elapsed_reqmem_0.ReqMemCPU.min(),\" \",\"max =\",df_elapsed_reqmem_0.ReqMemCPU.max())\n", + "print(\"Elapsed:\", \"min =\",df_elapsed_reqmem_0.Elapsed.min(),\" \",\"max =\",df_elapsed_reqmem_0.Elapsed.max())\n", + "print(\"AllocCPUS:\", \"min =\",df_elapsed_reqmem_0.AllocCPUS.min(),\" \",\"max =\",df_elapsed_reqmem_0.AllocCPUS.max())\n", + "\n", + "print(\"\\nGreen Cluster\")\n", + "print(\"ReqMemCPU:\", \"min =\",df_elapsed_reqmem_1.ReqMemCPU.min(),\" \",\"max =\",df_elapsed_reqmem_1.ReqMemCPU.max())\n", + "print(\"Elapsed:\", \"min =\",df_elapsed_reqmem_1.Elapsed.min(),\" \",\"max =\",df_elapsed_reqmem_1.Elapsed.max())\n", + "print(\"AllocCPUS:\", \"min =\",df_elapsed_reqmem_1.AllocCPUS.min(),\" \",\"max =\",df_elapsed_reqmem_1.AllocCPUS.max())\n", + "\n", + "print(\"\\nRed Cluster\")\n", + "print(\"ReqMemCPU:\", \"min =\",df_elapsed_reqmem_2.ReqMemCPU.min(),\" \",\"max =\",df_elapsed_reqmem_2.ReqMemCPU.max())\n", + "print(\"Elapsed:\", \"min =\",df_elapsed_reqmem_2.Elapsed.min(),\" \",\"max =\",df_elapsed_reqmem_2.Elapsed.max())\n", + "print(\"AllocCPUS:\", \"min =\",df_elapsed_reqmem_2.AllocCPUS.min(),\" \",\"max =\",df_elapsed_reqmem_2.AllocCPUS.max())" ] }, { @@ -403,9 +597,18 @@ "source": [ "# must run\n", "\n", - "# second set of min and max parameters for ReqMemCPU to use for AllocCPU/ReqMemCPU cluster graph \n", - "UpperlimitGB_alloc = 50\n", - "LowerlimitGB_alloc = 0" + "# Creates datasets used to make the swarmplots that correspong to each cluster scatter plot. 
\n", + "# The groupby collapses rows that share the same (ReqMemCPU, Elapsed) pair and sums AllocCPUS within each pair, which makes a small enough dataset to keep from having a \n", + "#runtime error, as will happen if a swarmplot is made using the scatter plot datasets.\n", + "\n", + "# for purple cluster \n", + "df_elapsed_reqmem_swarmplot0 = df_elapsed_reqmem_0.groupby(['ReqMemCPU','Elapsed']).sum().reset_index()\n", + "\n", + "# for green cluster\n", + "df_elapsed_reqmem_swarmplot1 = df_elapsed_reqmem_1.groupby(['ReqMemCPU','Elapsed']).sum().reset_index()\n", + "\n", + "# for red cluster\n", + "df_elapsed_reqmem_swarmplot2 = df_elapsed_reqmem_2.groupby(['ReqMemCPU','Elapsed']).sum().reset_index()" ] }, { @@ -416,9 +619,40 @@ "source": [ "# must run\n", "\n", - "# sets min and max parameters for AllocCPUS\n", - "UpperlimitAllocCPU_alloc = 60\n", - "LowerlimitAllocCPU_alloc = 0" + "# scatterplot of Runtime per Requested gigs of RAM using df_runtime_cluster dataset with clustering\n", + "figure = plt.figure(figsize=(14, 8))\n", + "figure.suptitle('Runtime per Requested gigs of RAM %i gigs or less'%UpperlimitGB)\n", + "\n", + "elapsed_rqmem_clustergraph = figure.add_subplot(121)\n", + "elapsed_rqmem_clustergraph.scatter(df_runtime_cluster['ReqMemCPU'],df_runtime_cluster['Elapsed'], c=kmeans_elapsed_reqmem.labels_, cmap='rainbow')\n", + "elapsed_rqmem_clustergraph.scatter(clusterpoints_elapsed_reqmem[:,0] ,clusterpoints_elapsed_reqmem[:,1], color='black')\n", + "plt.xlabel('ReqMemCPU(gigs)')\n", + "plt.ylabel('Elapsed(hours)')\n", + "\n", + "# 3d view of the scatterplot for better understanding of the data (cluster centers are plotted with all three coordinates)\n", + "elapsed_rqmem_clustergraph_3d = figure.add_subplot(122, projection='3d')\n", + "elapsed_rqmem_clustergraph_3d.scatter(df_runtime_cluster['ReqMemCPU'], df_runtime_cluster['Elapsed'], df_runtime_cluster['AllocCPUS'], \n", + " c=kmeans_elapsed_reqmem.labels_ ,cmap='rainbow')\n", + "elapsed_rqmem_clustergraph_3d.scatter(clusterpoints_elapsed_reqmem[:,0] ,clusterpoints_elapsed_reqmem[:,1], clusterpoints_elapsed_reqmem[:,2], color='black') # column 2 = AllocCPUS; without it the centers are drawn at z=0\n", + "\n", + "\n", + 
"elapsed_rqmem_clustergraph_3d.set_xlabel('ReqMemCPU(gigs)')\n", + "elapsed_rqmem_clustergraph_3d.set_ylabel('Elapsed(hours)')\n", + "elapsed_rqmem_clustergraph_3d.set_zlabel('AllocCPUS')\n", + "\n", + "# sets size and color for gridlines by axis\n", + "elapsed_rqmem_clustergraph_3d.xaxis._axinfo[\"grid\"].update({\"linewidth\":.5, \"color\" : \"black\"})\n", + "elapsed_rqmem_clustergraph_3d.yaxis._axinfo[\"grid\"].update({\"linewidth\":.5, \"color\" : \"black\"})\n", + "elapsed_rqmem_clustergraph_3d.zaxis._axinfo[\"grid\"].update({\"linewidth\":.5, \"color\" : \"black\"})\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This graph is a facet grid that shows the scatterplot of each cluster, drawn in its cluster color, on the left and its corresponding swarmplot on the right. The swarmplots give a better understanding of the distribution of jobs matching a specific datapoint." ] }, { @@ -429,9 +663,83 @@ "source": [ "# must run\n", "\n", - "# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the min and max parameters created above\n", - "df_allocCPUS_cluster = df_1_sorted[(df_1_sorted['ReqMemCPU'] <= UpperlimitGB_alloc) & (df_1_sorted['ReqMemCPU'] >= LowerlimitGB_alloc) & (df_1_sorted['AllocCPUS'] <= UpperlimitAllocCPU_alloc) & (df_1_sorted['AllocCPUS'] >= LowerlimitAllocCPU_alloc)]\n", - "df_allocCPUS.head(5)" + "# sets the figure and size that each subplot is added to - each graph is a subplot\n", + "figure = plt.figure( figsize=(16, 16))\n", + "\n", + "#purple cluster and swarmplot\n", + "elapsed_reqmem_clustergraph_0 = figure.add_subplot(423)\n", + "elapsed_reqmem_clustergraph_0.scatter(df_elapsed_reqmem_0['ReqMemCPU'],df_elapsed_reqmem_0['Elapsed'], color = \"blueviolet\")\n", + "plt.xlabel('ReqMemCPU(gigs)')\n", + "plt.ylabel('Elapsed(hours)')\n", + "\n", + "figure.add_subplot(424)\n", + "elapsed_reqmem_swarmgraph_0 = sns.swarmplot(data=df_elapsed_reqmem_swarmplot0, x='ReqMemCPU', y='Elapsed')\n", + 
"plt.yticks(np.arange(df_elapsed_reqmem_0.Elapsed.min(), df_elapsed_reqmem_0.Elapsed.max(), 5))\n", + "plt.margins(0.02)\n", + "plt.xlabel('ReqMemCPU(gigs)')\n", + "plt.ylabel('Elapsed(hours)')\n", + "\n", + "\n", + "#green cluster and swarmplot\n", + "elapsed_reqmem_clustergraph_1 = figure.add_subplot(425)\n", + "elapsed_reqmem_clustergraph_1.scatter(df_elapsed_reqmem_1['ReqMemCPU'],df_elapsed_reqmem_1['Elapsed'], color = \"aquamarine\")\n", + "plt.xlabel('ReqMemCPU(gigs)')\n", + "plt.ylabel('Elapsed(hours)')\n", + "\n", + "figure.add_subplot(426)\n", + "elapsed_reqmem_swarmgraph_1 = sns.swarmplot(data=df_elapsed_reqmem_swarmplot1, x='ReqMemCPU', y='Elapsed')\n", + "plt.yticks(np.arange(df_elapsed_reqmem_1.Elapsed.min(), df_elapsed_reqmem_1.Elapsed.max(), 5))\n", + "plt.margins(0.02)\n", + "plt.xlabel('ReqMemCPU(gigs)')\n", + "plt.ylabel('Elapsed(hours)')\n", + "\n", + "\n", + "#red cluster and swarmplot\n", + "elapsed_reqmem_clustergraph_2 = figure.add_subplot(427)\n", + "elapsed_reqmem_clustergraph_2.scatter(df_elapsed_reqmem_2['ReqMemCPU'],df_elapsed_reqmem_2['Elapsed'], color = \"red\")\n", + "plt.xlabel('ReqMemCPU(gigs)')\n", + "plt.ylabel('Elapsed(hours)')\n", + "\n", + "figure.add_subplot(428)\n", + "elapsed_reqmem_swarmgraph_2 = sns.swarmplot(data=df_elapsed_reqmem_swarmplot2, x='ReqMemCPU', y='Elapsed')\n", + "plt.yticks(np.arange(df_elapsed_reqmem_2.Elapsed.min(), df_elapsed_reqmem_2.Elapsed.max(), 10))\n", + "plt.margins(0.02)\n", + "plt.xlabel('ReqMemCPU(gigs)')\n", + "plt.ylabel('Elapsed(hours)')\n", + "\n", + "\n", + "# sets the spacing\n", + "# top = space between title and graphs - increase number to bring title down and decrease to bring title up\n", + "# left = space to the left\n", + "# wspace = padding on both sides of graphs\n", + "# hspace = padding on top and bottom of graphs\n", + "figure.subplots_adjust(left=0.2, wspace=0.2, top=1.2, hspace=0.3)\n", + "\n", + "figure.suptitle('Clusters from Runtime per Requested gigs of RAM %i gigs or 
less'%UpperlimitGB, fontsize=20)\n", + "\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Elapsed/AllocCPUS clustering" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next 5 cells create the clusters, find each cluster label, and create datasets of data in each cluster.\n", + "All the datasets are created for both the cluster graphs and plots of each cluster before those graphs are made." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# In the cell below, set the fit based on the normalization type by uncommenting the line to run" ] }, { @@ -440,15 +748,14 @@ "metadata": {}, "outputs": [], "source": [ - "# must run\n", + "# uncomment for no normalization\n", + "#elapsed_alloc_fit = df_runtime_cluster\n", "\n", - "# sets up info for plotting the optimal number of clusters - uses df_runtime_cluster datasaet\n", - "Sum_of_squared_distances = []\n", - "K = range(1,10)\n", - "for k in K:\n", - " km = KMeans(n_clusters=k)\n", - " km = km.fit(df_allocCPUS_cluster)\n", - " Sum_of_squared_distances.append(km.inertia_)" + "# uncomment for 0-1 normalization\n", + "#elapsed_alloc_fit = normalized_runtime_df\n", + "\n", + "# uncomment for log10 normalization\n", + "elapsed_alloc_fit = log_runtime_df" ] }, { @@ -459,12 +766,16 @@ "source": [ "# must run\n", "\n", - "# the bend in the graph is the optimal number of clusters for graphs using the df_runtime_cluster dataset\n", - "plt.plot(K, Sum_of_squared_distances, 'bx-')\n", - "plt.xlabel('k')\n", - "plt.ylabel('Sum_of_squared_distances')\n", - "plt.title('Elbow Method For Optimal k')\n", - "plt.show()" + "# sets to clusters and returns the cluster points\n", + "kmeans_elapsed_alloc = KMeans(n_clusters=3, random_state=111)\n", + "kmeans_elapsed_alloc.fit(elapsed_alloc_fit)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# In the cell below, choose which cluster center to use - uncomment the 
line that goes with the normalization type" ] }, { @@ -473,12 +784,14 @@ "metadata": {}, "outputs": [], "source": [ - "# must run\n", + "# uncomment if no normalization\n", + "#clusterpoints_elapsed_alloc = kmeans_elapsed_alloc.cluster_centers_\n", "\n", - "# sets to clusters and returns the cluster points\n", - "kmeans = KMeans(n_clusters=3, random_state=111)\n", - "kmeans.fit(df_allocCPUS_cluster)\n", - "print(kmeans.cluster_centers_)" + "# uncomment if 0-1 normalization\n", + "#clusterpoints_elapsed_alloc = kmeans_elapsed_alloc.cluster_centers_ * df_runtime_cluster_max\n", + "\n", + "# uncomment if log10 normalization (inverts the log10(x+1) transform on the Elapsed/AllocCPUS model's own centers)\n", + "clusterpoints_elapsed_alloc = 10 ** (kmeans_elapsed_alloc.cluster_centers_) - 1" ] }, { @@ -487,18 +800,14 @@ "metadata": {}, "outputs": [], "source": [ - "style.default_axes_and_ticks()\n", - "style.figsize()\n", - "\n", - "alloc_reqmem_graph = sns.scatterplot(x=\"ReqMemCPU\", y=\"AllocCPUS\",data=df_allocCPUS_cluster)\n", - "\n", - "plt.title('Number of Cores used by Requested RAM %i gigs or less'%UpperlimitGB_alloc)\n", + "# must run\n", "\n", - "plt.xlabel('ReqMemCPU(gigs)')\n", - "plt.ylabel('AllocCPUS')\n", - "#plt.yscale(\"log\")\n", + "# returns array of labels for each cluster - used to find min and max x and y points for each cluster\n", "\n", - "plt.show()" + "# 0 = purple cluster\n", + "# 1 = green cluster\n", + "# 2 = red cluster\n", + "np.unique(kmeans_elapsed_alloc.labels_)" ] }, { @@ -509,16 +818,554 @@ "source": [ "# must run\n", "\n", - "# clustered graph\n", - "style.default_axes_and_ticks()\n", - "style.figsize()\n", + "# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the parameters in the labels shown above\n", "\n", - "alloc_reqmem_cluster_graph = plt.scatter(df_allocCPUS_cluster['ReqMemCPU'],df_allocCPUS_cluster['AllocCPUS'], c=kmeans.labels_, cmap='rainbow')\n", - "plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='black')\n", "\n", - "plt.xlabel('ReqMemCPU(gigs)')\n", + "#Purple\n", + "df_elapsed_alloc_0 = 
df_runtime_cluster[kmeans_elapsed_alloc.labels_ == 0]\n", "\n", - "plt.xlabel('ReqMemCPU(gigs)')\n", + "#Green\n", + "df_elapsed_alloc_1 = df_runtime_cluster[kmeans_elapsed_alloc.labels_ == 1]\n", + "\n", + "#Red\n", + "df_elapsed_alloc_2 = df_runtime_cluster[kmeans_elapsed_alloc.labels_ == 2]\n", + "\n", + "#df_elapsed_alloc_0.head(5)\n", + "#df_elapsed_alloc_0.AllocCPUS.count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# voluntary\n", + "\n", + "# returns the min and max ReqMemCPU, Elapsed, and AllocCPUS for each cluster using the datasets created above. \n", + "# These are the parameters for the scatter plots of each cluster\n", + "print(\"Purple Cluster\")\n", + "print(\"ReqMemCPU:\", \"min =\",df_elapsed_alloc_0.ReqMemCPU.min(),\" \",\"max =\",df_elapsed_alloc_0.ReqMemCPU.max())\n", + "print(\"Elapsed:\", \"min =\",df_elapsed_alloc_0.Elapsed.min(),\" \",\"max =\",df_elapsed_alloc_0.Elapsed.max())\n", + "print(\"AllocCPUS:\", \"min =\",df_elapsed_alloc_0.AllocCPUS.min(),\" \",\"max =\",df_elapsed_alloc_0.AllocCPUS.max())\n", + "\n", + "print(\"\\nGreen Cluster\")\n", + "print(\"ReqMemCPU:\", \"min =\",df_elapsed_alloc_1.ReqMemCPU.min(),\" \",\"max =\",df_elapsed_alloc_1.ReqMemCPU.max())\n", + "print(\"Elapsed:\", \"min =\",df_elapsed_alloc_1.Elapsed.min(),\" \",\"max =\",df_elapsed_alloc_1.Elapsed.max())\n", + "print(\"AllocCPUS:\", \"min =\",df_elapsed_alloc_1.AllocCPUS.min(),\" \",\"max =\",df_elapsed_alloc_1.AllocCPUS.max())\n", + "\n", + "print(\"\\nRed Cluster\")\n", + "print(\"ReqMemCPU:\", \"min =\",df_elapsed_alloc_2.ReqMemCPU.min(),\" \",\"max =\",df_elapsed_alloc_2.ReqMemCPU.max())\n", + "print(\"Elapsed:\", \"min =\",df_elapsed_alloc_2.Elapsed.min(),\" \",\"max =\",df_elapsed_alloc_2.Elapsed.max())\n", + "print(\"AllocCPUS:\", \"min =\",df_elapsed_alloc_2.AllocCPUS.min(),\" \",\"max =\",df_elapsed_alloc_2.AllocCPUS.max())" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# Creates datasets used to make the swarmplots that correspond to each cluster scatter plot. \n", + "# The groupby collapses rows that share the same (AllocCPUS, Elapsed) pair and sums ReqMemCPU within each pair, which makes a small enough dataset to keep from having a \n", + "#runtime error, as will happen if a swarmplot is made using the scatter plot datasets.\n", + "\n", + "# for purple cluster \n", + "df_elapsed_alloc_swarmplot0 = df_elapsed_alloc_0.groupby(['AllocCPUS','Elapsed']).sum().reset_index()\n", + "\n", + "# for green cluster \n", + "df_elapsed_alloc_swarmplot1 = df_elapsed_alloc_1.groupby(['AllocCPUS','Elapsed']).sum().reset_index()\n", + "\n", + "# for red cluster \n", + "df_elapsed_alloc_swarmplot2 = df_elapsed_alloc_2.groupby(['AllocCPUS','Elapsed']).sum().reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# scatterplot of Runtime per Core using df_runtime_cluster dataset with clustering\n", + "figure = plt.figure(figsize=(14, 8))\n", + "figure.suptitle('Runtime per Core %i cores or less'%UpperlimitAllocCPU)\n", + "\n", + "elapsed_alloc_clustergraph = figure.add_subplot(121)\n", + "elapsed_alloc_clustergraph.scatter(df_runtime_cluster['AllocCPUS'],df_runtime_cluster['Elapsed'], c=kmeans_elapsed_alloc.labels_, cmap='rainbow')\n", + "elapsed_alloc_clustergraph.scatter(clusterpoints_elapsed_alloc[:,2] ,clusterpoints_elapsed_alloc[:,1], color='black') # center columns follow the fit order: 0=ReqMemCPU, 1=Elapsed, 2=AllocCPUS\n", + "plt.xlabel('AllocCPUS')\n", + "plt.ylabel('Elapsed(hours)')\n", + "\n", + "# 3d view of the scatterplot for better understanding of the data\n", + "elapsed_alloc_clustergraph_3d = figure.add_subplot(122, projection='3d')\n", + "elapsed_alloc_clustergraph_3d.scatter(df_runtime_cluster['AllocCPUS'], df_runtime_cluster['Elapsed'], df_runtime_cluster['ReqMemCPU'], c=kmeans_elapsed_alloc.labels_ ,cmap='rainbow')\n", + 
"elapsed_alloc_clustergraph_3d.scatter(clusterpoints_elapsed_alloc[:,2] ,clusterpoints_elapsed_alloc[:,1], clusterpoints_elapsed_alloc[:,0], color='black') # center columns follow the fit order (0=ReqMemCPU, 1=Elapsed, 2=AllocCPUS); axes here are x=AllocCPUS, y=Elapsed, z=ReqMemCPU\n", + "elapsed_alloc_clustergraph_3d.set_xlabel('AllocCPUS')\n", + "elapsed_alloc_clustergraph_3d.set_ylabel('Elapsed(hours)')\n", + "elapsed_alloc_clustergraph_3d.set_zlabel('ReqMemCPU(gigs)')\n", + "\n", + "# sets size and color for gridlines by axis\n", + "elapsed_alloc_clustergraph_3d.xaxis._axinfo[\"grid\"].update({\"linewidth\":.5, \"color\" : \"black\"})\n", + "elapsed_alloc_clustergraph_3d.yaxis._axinfo[\"grid\"].update({\"linewidth\":.5, \"color\" : \"black\"})\n", + "elapsed_alloc_clustergraph_3d.zaxis._axinfo[\"grid\"].update({\"linewidth\":.5, \"color\" : \"black\"})\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# sets the figure and size that each subplot is added to - each graph is a subplot\n", + "figure = plt.figure( figsize=(21, 16))\n", + "\n", + "#purple cluster and swarmplot\n", + "elapsed_alloc_clustergraph_0 = figure.add_subplot(321)\n", + "elapsed_alloc_clustergraph_0.scatter(df_elapsed_alloc_0['AllocCPUS'],df_elapsed_alloc_0['Elapsed'], color = \"blueviolet\")\n", + "plt.xlabel('AllocCPUS')\n", + "plt.ylabel('Elapsed(hours)')\n", + "\n", + "figure.add_subplot(322)\n", + "elapsed_alloc_swarmgraph_0 = sns.swarmplot(data=df_elapsed_alloc_swarmplot0, x='AllocCPUS', y='Elapsed')\n", + "plt.yticks(np.arange(df_elapsed_alloc_0.Elapsed.min(), df_elapsed_alloc_0.Elapsed.max(), 5))\n", + "plt.margins(0.02)\n", + "plt.xlabel('AllocCPUS')\n", + "plt.ylabel('Elapsed(hours)')\n", + "\n", + "\n", + "#green cluster and swarmplot\n", + "elapsed_alloc_clustergraph_1 = figure.add_subplot(323)\n", + "elapsed_alloc_clustergraph_1.scatter(df_elapsed_alloc_1['AllocCPUS'],df_elapsed_alloc_1['Elapsed'], color = \"aquamarine\")\n", + "plt.xlabel('AllocCPUS')\n", + "plt.ylabel('Elapsed(hours)')\n", + "\n", + 
"figure.add_subplot(324)\n", + "elapsed_alloc_swarmgraph_1 = sns.swarmplot(data=df_elapsed_alloc_swarmplot1, x='AllocCPUS', y='Elapsed')\n", + "plt.yticks(np.arange(df_elapsed_alloc_1.Elapsed.min(), df_elapsed_alloc_1.Elapsed.max(), 5))\n", + "plt.margins(0.02)\n", + "plt.xlabel('AllocCPUS')\n", + "plt.ylabel('Elapsed(hours)')\n", + "\n", + "\n", + "#red cluster and swarmplot\n", + "elapsed_alloc_clustergraph_2 = figure.add_subplot(325)\n", + "elapsed_alloc_clustergraph_2.scatter(df_elapsed_alloc_2['AllocCPUS'],df_elapsed_alloc_2['Elapsed'], color = \"red\")\n", + "plt.xlabel('AllocCPUS')\n", + "plt.ylabel('Elapsed(hours)')\n", + "\n", + "figure.add_subplot(326)\n", + "elapsed_alloc_swarmgraph_2 = sns.swarmplot(data=df_elapsed_alloc_swarmplot2, x='AllocCPUS', y='Elapsed')\n", + "plt.yticks(np.arange(df_elapsed_alloc_2.Elapsed.min(), df_elapsed_alloc_2.Elapsed.max(), 10))\n", + "plt.margins(0.02)\n", + "plt.xlabel('AllocCPUS')\n", + "plt.ylabel('Elapsed(hours)')\n", + "\n", + "# sets the spacing\n", + "# top = space between title and graphs - increase number to bring title down and decrease to bring title up\n", + "# left = space to the left\n", + "# wspace = padding on both sides of graphs\n", + "# hspace = padding on top and bottom of graphs\n", + "figure.subplots_adjust(left=0.2, wspace=0.2, top=.94, hspace=0.3)\n", + "figure.suptitle('Clusters from Runtime per Core %i cores or less'%UpperlimitAllocCPU, fontsize=20)\n", + "\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Detailed Look at Cores - In terms of Requested RAM" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# scatterplot of AllocCPUS/ReqMemCPU using df_facet dataset\n", + "style.default_axes_and_ticks()\n", + "style.figsize()\n", + "\n", + "elapsed_alloc_reqmem = plt.scatter(df_facet[\"ReqMemCPU\"], df_facet[\"AllocCPUS\"], color = \"blue\")\n", + "\n", + 
"plt.xlabel('ReqMemCPU(gigs)')\n", "plt.ylabel('AllocCPUS')\n", - "plt.title('Number of Cores used by Requested RAM %i gigs or less'%UpperlimitGB_alloc)\n", + "plt.title('Number of Cores used by Requested RAM %i gigs or less'%UpperlimitGB)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "df_alloc_cluster is a dataset made from df_facet. It is used to make the elbow graph and calculate the clustering for AllocCPUS/ReqMemCPU" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run if dataset will not be normalized\n", + "\n", + "#ReqMemCPU = 0 - 50 gigs\n", + "#AllocCPUS = 0 - 50 cores\n", + "#Elapsed = 0 - 150.02 hours\n", + "\n", + "# non normalized dataset\n", + "# used for fitting for the Alloc/ReqMem graph without normalization\n", + "df_alloc_cluster = df_facet.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']]\n", + "df_alloc_cluster.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run if dataset will be 0-1 normalized\n", + "\n", + "# 0-1 normalized dataset - every column is divided by the single largest value found in any column\n", + "# used for 0-1 normalization fitting for the Alloc/ReqMem graph\n", + "column_maxes_alloc = df_alloc_cluster.max()\n", + "df_alloc_cluster_max = column_maxes_alloc.max()\n", + "normalized_alloc_df = df_alloc_cluster / df_alloc_cluster_max\n", + "\n", + "print(normalized_alloc_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run if dataset will be log10 normalized\n", + "\n", + "# log10 normalized dataset\n", + "# used for log10 normalization fitting for the Alloc/ReqMem graph\n", + "\n", + "log_alloc_df = np.log10(df_alloc_cluster+1)\n", + "log_alloc_df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], 
+ "source": [ + "# must run\n", + "\n", + "# sets up info for plotting the optimal number of clusters - uses df_alloc_cluster dataset; random_state is fixed (same seed as the final fit) so the elbow curve is reproducible\n", + "Sum_of_squared_distances = []\n", + "K = range(1,10)\n", + "for k in K:\n", + "    km = KMeans(n_clusters=k, random_state=111)\n", + "    km = km.fit(df_alloc_cluster)\n", + "    Sum_of_squared_distances.append(km.inertia_)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# the bend in the graph is the optimal number of clusters for graphs using the df_alloc_cluster dataset\n", + "plt.plot(K, Sum_of_squared_distances, 'bx-')\n", + "plt.xlabel('k')\n", + "plt.ylabel('Sum_of_squared_distances')\n", + "plt.title('Elbow Method For Optimal k')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# In the cell below, set the fit based on the normalization type by uncommenting the line to run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# uncomment for no normalization\n", + "#alloc_reqmem_fit = df_alloc_cluster\n", + "\n", + "# uncomment for 0-1 normalization\n", + "#alloc_reqmem_fit = normalized_alloc_df\n", + "\n", + "# uncomment for log10 normalization\n", + "alloc_reqmem_fit = log_alloc_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# sets to clusters and returns the cluster points\n", + "kmeans_alloc_reqmem = KMeans(n_clusters=3, random_state=111)\n", + "kmeans_alloc_reqmem.fit(alloc_reqmem_fit)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# In the cell below, choose which cluster center to use - uncomment the line that goes with the normalization type" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# uncomment if no normalization\n", + 
"#clusterpoints_alloc_reqmem = kmeans_alloc_reqmem.cluster_centers_\n", + "\n", + "# uncomment if 0-1 normalization\n", + "#clusterpoints_alloc_reqmem = kmeans_alloc_reqmem.cluster_centers_ * df_alloc_cluster_max\n", + "\n", + "# uncomment if log10 normalization\n", + "clusterpoints_alloc_reqmem = (10 ** (kmeans_alloc_reqmem.cluster_centers_)) - 1\n", + "print(clusterpoints_alloc_reqmem)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "clusterpoints_alloc_reqmem[:,0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "clusterpoints_alloc_reqmem[:,2]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next 5 cells find each cluster label, and create datasets of data in each cluster.\n", + "All the datasets are created for both the cluster graphs and plots of each cluster before those graphs are made." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# returns array of labels for each cluster - used to find min and max x and y points for each cluster\n", + "\n", + "# 0 = purple cluster\n", + "# 1 = green cluster\n", + "# 2 = red cluster\n", + "np.unique(kmeans_alloc_reqmem.labels_)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the parameters in the labels shown above\n", + "\n", + "#Purple\n", + "df_alloc_reqmem_0 = df_alloc_cluster[kmeans_alloc_reqmem.labels_ == 0]\n", + "\n", + "#Green\n", + "df_alloc_reqmem_1 = df_alloc_cluster[kmeans_alloc_reqmem.labels_ == 1]\n", + "\n", + "#Red\n", + "df_alloc_reqmem_2 = df_alloc_cluster[kmeans_alloc_reqmem.labels_ == 2]\n", + "\n", + "#df_elapsed_alloc_0.head(5)\n", + "#df_elapsed_alloc_0.AllocCPUS.count()" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# voluntary\n", + "\n", + "# returns the min and max ReqMemCPU, Elapsed, and AllocCPUS for each cluster using the datasets created above. \n", + "# These are the parameters for the scatter plots of each cluster\n", + "print(\"Purple Cluster\")\n", + "print(\"ReqMemCPU:\", \"min =\",df_alloc_reqmem_0.ReqMemCPU.min(),\" \",\"max =\",df_alloc_reqmem_0.ReqMemCPU.max())\n", + "print(\"Elapsed:\", \"min =\",df_alloc_reqmem_0.Elapsed.min(),\" \",\"max =\",df_alloc_reqmem_0.Elapsed.max())\n", + "print(\"AllocCPUS:\", \"min =\",df_alloc_reqmem_0.AllocCPUS.min(),\" \",\"max =\",df_alloc_reqmem_0.AllocCPUS.max())\n", + "\n", + "print(\"\\nGreen Cluster\")\n", + "print(\"ReqMemCPU:\", \"min =\",df_alloc_reqmem_1.ReqMemCPU.min(),\" \",\"max =\",df_alloc_reqmem_1.ReqMemCPU.max())\n", + "print(\"Elapsed:\", \"min =\",df_alloc_reqmem_1.Elapsed.min(),\" \",\"max =\",df_alloc_reqmem_1.Elapsed.max())\n", + "print(\"AllocCPUS:\", \"min =\",df_alloc_reqmem_1.AllocCPUS.min(),\" \",\"max =\",df_alloc_reqmem_1.AllocCPUS.max())\n", + "\n", + "print(\"\\nRed Cluster\")\n", + "print(\"ReqMemCPU:\", \"min =\",df_alloc_reqmem_2.ReqMemCPU.min(),\" \",\"max =\",df_alloc_reqmem_2.ReqMemCPU.max())\n", + "print(\"Elapsed:\", \"min =\",df_alloc_reqmem_2.Elapsed.min(),\" \",\"max =\",df_alloc_reqmem_2.Elapsed.max())\n", + "print(\"AllocCPUS:\", \"min =\",df_alloc_reqmem_2.AllocCPUS.min(),\" \",\"max =\",df_alloc_reqmem_2.AllocCPUS.max())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# Creates datasets used to make the swarmplots that correspond to each cluster scatter plot. 
\n", + "# The groupby keeps one row per (AllocCPUS, ReqMemCPU) pair and sums Elapsed, which makes a small enough dataset to keep from having a \n", + "#runtime error, as will happen if a swarmplot is made using the scatter plot datasets.\n", + "\n", + "# for purple cluster \n", + "df_alloc_reqmem_swarmplot0 = df_alloc_reqmem_0.groupby(['AllocCPUS','ReqMemCPU']).sum().reset_index()\n", + "\n", + "# for green cluster \n", + "df_alloc_reqmem_swarmplot1 = df_alloc_reqmem_1.groupby(['AllocCPUS','ReqMemCPU']).sum().reset_index()\n", + "\n", + "# for red cluster \n", + "df_alloc_reqmem_swarmplot2 = df_alloc_reqmem_2.groupby(['AllocCPUS','ReqMemCPU']).sum().reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# scatterplot of Core per Requested RAM using df_alloc_cluster dataset with clustering\n", + "figure = plt.figure(figsize=(14, 8))\n", + "figure.suptitle('Number of Cores used by Requested RAM %i gigs or less'%UpperlimitGB)\n", + "\n", + "alloc_reqmem_cluster_graph = figure.add_subplot(121)\n", + "alloc_reqmem_cluster_graph.scatter(df_alloc_cluster['ReqMemCPU'],df_alloc_cluster['AllocCPUS'], c=kmeans_alloc_reqmem.labels_, cmap='rainbow')\n", + "alloc_reqmem_cluster_graph.scatter(clusterpoints_alloc_reqmem[:,0] ,clusterpoints_alloc_reqmem[:,2], color='black')\n", + "plt.xlabel('ReqMemCPU(gigs)')\n", + "plt.ylabel('AllocCPUS')\n", + "\n", + "# 3d view of the scatterplot for better understanding of the data - cluster center columns follow df_alloc_cluster order (0=ReqMemCPU, 1=Elapsed, 2=AllocCPUS)\n", + "alloc_reqmem_clustergraph_3d = figure.add_subplot(122, projection='3d')\n", + "alloc_reqmem_clustergraph_3d.scatter(df_alloc_cluster['ReqMemCPU'], df_alloc_cluster['AllocCPUS'], df_alloc_cluster['Elapsed'], c=kmeans_alloc_reqmem.labels_ ,cmap='rainbow')\n", + "alloc_reqmem_clustergraph_3d.scatter(clusterpoints_alloc_reqmem[:,0] ,clusterpoints_alloc_reqmem[:,2], clusterpoints_alloc_reqmem[:,1], color='black')\n", + "alloc_reqmem_clustergraph_3d.set_xlabel('ReqMemCPU(gigs)')\n", + 
"alloc_reqmem_clustergraph_3d.set_ylabel('AllocCPUS')\n", + "alloc_reqmem_clustergraph_3d.set_zlabel('Elapsed(hours)')\n", + "\n", + "# sets size and color for gridlines by axis\n", + "alloc_reqmem_clustergraph_3d.xaxis._axinfo[\"grid\"].update({\"linewidth\":.5, \"color\" : \"black\"})\n", + "alloc_reqmem_clustergraph_3d.yaxis._axinfo[\"grid\"].update({\"linewidth\":.5, \"color\" : \"black\"})\n", + "alloc_reqmem_clustergraph_3d.zaxis._axinfo[\"grid\"].update({\"linewidth\":.5, \"color\" : \"black\"})\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# sets the figure and size that each subplot is added to - each graph is a subplot\n", + "figure = plt.figure(figsize=(21, 16))\n", + "\n", + "\n", + "#purple cluster and swarmplot\n", + "alloc_reqmem_clustergraph_0 = figure.add_subplot(321)\n", + "alloc_reqmem_clustergraph_0.scatter(df_alloc_reqmem_0['ReqMemCPU'],df_alloc_reqmem_0['AllocCPUS'], color = \"blueviolet\")\n", + "plt.xlabel('ReqMemCPU(gigs)')\n", + "plt.ylabel('AllocCPUS')\n", + "\n", + "figure.add_subplot(322)\n", + "alloc_reqmem_swarmgraph_0 = sns.swarmplot(data=df_alloc_reqmem_swarmplot0, x='ReqMemCPU', y='AllocCPUS')\n", + "plt.yticks(np.arange(0, df_alloc_reqmem_0.AllocCPUS.max(), 3))\n", + "plt.margins(0.02)\n", + "plt.xlabel('ReqMemCPU(gigs)')\n", + "plt.ylabel('AllocCPUS')\n", + "\n", + "\n", + "#green cluster and swarmplot\n", + "alloc_reqmem_clustergraph_1 = figure.add_subplot(323)\n", + "alloc_reqmem_clustergraph_1.scatter(df_alloc_reqmem_1['ReqMemCPU'],df_alloc_reqmem_1['AllocCPUS'], color = \"aquamarine\")\n", + "plt.xlabel('ReqMemCPU(gigs)')\n", + "plt.ylabel('AllocCPUS')\n", + "\n", + "figure.add_subplot(324)\n", + "alloc_reqmem_swarmgraph_1 = sns.swarmplot(data=df_alloc_reqmem_swarmplot1, x='ReqMemCPU', y='AllocCPUS')\n", + "plt.yticks(np.arange(df_alloc_reqmem_1.AllocCPUS.min(), df_alloc_reqmem_1.AllocCPUS.max(), 5))\n", + 
"plt.margins(0.02)\n", + "plt.xlabel('ReqMemCPU(gigs)')\n", + "plt.ylabel('AllocCPUS')\n", + "\n", + "\n", + "#red cluster and swarmplot - drawn through the subplot's own Axes, like the purple and green clusters\n", + "alloc_reqmem_clustergraph_2 = figure.add_subplot(325)\n", + "alloc_reqmem_clustergraph_2.scatter(df_alloc_reqmem_2['ReqMemCPU'],df_alloc_reqmem_2['AllocCPUS'], color = \"red\")\n", + "plt.xlabel('ReqMemCPU(gigs)')\n", + "plt.ylabel('AllocCPUS')\n", + "\n", + "figure.add_subplot(326)\n", + "alloc_reqmem_swarmgraph_2 = sns.swarmplot(data=df_alloc_reqmem_swarmplot2, x='ReqMemCPU', y='AllocCPUS')\n", + "plt.yticks(np.arange(df_alloc_reqmem_2.AllocCPUS.min(), df_alloc_reqmem_2.AllocCPUS.max(), 5))\n", + "plt.margins(0.02)\n", + "plt.xlabel('ReqMemCPU(gigs)')\n", + "plt.ylabel('AllocCPUS')\n", + "\n", + "# sets the spacing\n", + "# top = space between title and graphs - increase number to bring title down and decrease to bring title up\n", + "# left = space to the left\n", + "# wspace = padding on both sides of graphs\n", + "# hspace = padding on top and bottom of graphs\n", + "figure.subplots_adjust(left=0.2, wspace=0.2, top=.94, hspace=0.3)\n", + "figure.suptitle('Clusters from Number of Cores used by Requested RAM %i gigs or less'%UpperlimitGB, fontsize=20)\n", + "\n", + "\n", + "\n", "plt.show()" ] }, diff --git a/gitattributes.txt b/gitattributes.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c2ef3020c839cc1bcf08b88e3041a704aa98dfa --- /dev/null +++ b/gitattributes.txt @@ -0,0 +1 @@ +*.ipynb filter=nbstrip_full diff --git a/gitconfig.txt b/gitconfig.txt new file mode 100644 index 0000000000000000000000000000000000000000..2fb335e9cbd93454934ebfa7cd59c9cc20c2a86f --- /dev/null +++ b/gitconfig.txt @@ -0,0 +1,11 @@ +[core] +attributesfile = ~/.gitattributes +[filter "nbstrip_full"] +clean = "jq --indent 1 \ + '(.cells[] | select(has(\"outputs\")) | .outputs) = [] \ + | (.cells[] | select(has(\"execution_count\")) | .execution_count) = null \ + | .metadata = {\"language_info\": {\"name\": 
\"python\", \"pygments_lexer\": \"ipython3\"}} \ + | .cells[].metadata = {} \ + '" +smudge = cat +required = true diff --git a/requirements.txt b/requirements.txt index 36faad6e414084dc05ef4e6639de433f1f8e3580..452a651e794717af9370c1c6124b6dc8a06efe99 100644 --- a/requirements.txt +++ b/requirements.txt @@ -48,7 +48,7 @@ parso==0.6.2 pexpect==4.8.0 phik==0.9.9 pickleshare==0.7.5 -plotly==4.5.2 +plotly==4.8.2 pluggy==0.13.1 prometheus-client==0.7.1 prompt-toolkit==3.0.3