diff --git a/Runtime-and-CoreCount.ipynb b/Runtime-and-CoreCount.ipynb deleted file mode 100644 index 5c421e1f776fbf5ccf1ccfde6e34669c8dc1e48b..0000000000000000000000000000000000000000 --- a/Runtime-and-CoreCount.ipynb +++ /dev/null @@ -1,1388 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Notebook Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# year-date-month\n", - "#start_date = '2020-10-09'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "import sqlite3\n", - "import slurm2sql\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "import seaborn as sns\n", - "import seaborn as sb\n", - "import plotly.express as px\n", - "import matplotlib.ticker as ticker\n", - "import numpy as np\n", - "from mpl_toolkits.mplot3d import Axes3D\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "from RC_styles import rc_styles as style" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "from sklearn.cluster import KMeans" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#connecting to database\n", - "#db = sqlite3.connect('runtime_and_core_count.db')\n", - "#print(db)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# creates database of info from March 2020 using sqlite 3\n", - "db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')\n", - "#print(db)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - 
"#slurm2sql.slurm2sql(db, ['-S 2020-09-08 -E 2020-09-15 -a --allocations -o Job,Submit,Start,End'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - " #creating a database based on the start date\n", - "#slurm2sql.slurm2sql(db, ['-S', '2020-01-09', '-a'])\n", - "#print(db)\n", - "#print(start_date)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# df is starting database\n", - "df = pd.read_sql('SELECT * FROM slurm', db)\n", - "#df = pd.read_sql('SELECT JobID,Submit,Start,End FROM slurm', db)\n", - "print(df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - " #Deleting the database\n", - "#os.remove('runtime_and_core_count.db')\n", - "#os.remove('runtime_and_core_count.db-shm')\n", - "#os.remove('runtime_and_core_count.db-wal') " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# voluntary\n", - "\n", - "# for displaying all available column options\n", - "pd.set_option('display.max_columns', None)\n", - "df.count()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# converts units in ReqMemCPU column from bytes to gigs\n", - "df['ReqMemCPU'] = df['ReqMemCPU'].div(1024**3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# converts Elapsed time to hours (from seconds)\n", - "df['Elapsed'] = df['Elapsed'].div(3600)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# df_completed is dataframe of all completed jobs\n", - "df_completed = df[df.State.str.contains('COMPLETED')]\n", - "#df_completed.head(5)" - ] - 
}, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# ReqMemCPU,Corecount,Runtime FacetGrid" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The next 4 cells set up the df_1 dataset, which will be the base dataset used for the facet grid." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# dataset of needed columns for all graphs below\n", - "df_1 = df_completed.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']]\n", - "df_1.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# rounds ReqMemCPU up to nearest whole number\n", - "df_1['ReqMemCPU'] = df_1['ReqMemCPU'].apply(np.ceil)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# rounds Elapsed up to nearest 2 decimal places\n", - "df_1['Elapsed'] = df_1['Elapsed'].round(2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# makes ReqMemCPU column whole numbers rather than floats for easy readability in graphs\n", - "df_1.ReqMemCPU = df_1.ReqMemCPU.apply(int)\n", - "df_1.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The next 3 cells set the min and max parameters for ReqMemCPU, AllocCPUS, and Elapsed. These parameters are used in creating the facet grid and are the parameters for all the cluster graphs." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# sets min and max parameters for ReqMemCPU\n", - "LowerlimitGB = 0\n", - "UpperlimitGB = 50" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# sets min and max parameters for AllocCPUS\n", - "LowerlimitAllocCPU = 0\n", - "UpperlimitAllocCPU = 50" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# sets min and max parameters for Elapsed\n", - "LowerlimitElapsed = 0\n", - "UpperlimitElapsed = 150.02" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "df_facet is a dataset created from df_1 using the parameters above. It will be the dataset that all the cluster graphs will be made from." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the min and max parameters created above\n", - "df_facet = df_1[(df_1['ReqMemCPU'] <= UpperlimitGB) & \n", - " (df_1['ReqMemCPU'] >= LowerlimitGB) & \n", - " (df_1['AllocCPUS'] <= UpperlimitAllocCPU) & \n", - " (df_1['AllocCPUS'] >= LowerlimitAllocCPU)\n", - " & \n", - " (df_1['Elapsed'] <= UpperlimitElapsed) & \n", - " (df_1['Elapsed'] >= LowerlimitElapsed)]\n", - "df_facet.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# creates a facet grid from df_1 dataset\n", - "# Elapsed time in hours and ReqMemCPU in gigs\n", - "style.default_axes_and_ticks()\n", - "style.figsize()\n", - "\n", - "full_facet = sns.pairplot(df_facet, diag_kind = 'kde') # makes density plots - kernel density estimate\n", - "# y axis is count in the diagonal graphs\n", - 
"\n", - "full_facet.map(plt.scatter);\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Detailed Look at Elapsed Time - In terms of Requested RAM and Cores" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# voluntary\n", - "\n", - "# pair grid of the two graphs being clustered using df_facet dataset\n", - "style.default_axes_and_ticks()\n", - "style.figsize()\n", - "\n", - "elapsed_reqmem_alloc = sns.PairGrid(df_facet, y_vars=[\"Elapsed\"], x_vars=[\"ReqMemCPU\", \"AllocCPUS\"], height=4)\n", - "elapsed_reqmem_alloc.map(sns.regplot, color=\"blue\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "df_runtime_cluster is a dataset made from df_facet. It is used to make the elbow graph and calculate the clustering for Elapsed/ReqMemCPU and Elapsed/AllocCPUS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#must run if dataset will not be normalized for both Elapsed/ReqMem and Elapsed/Alloc graphs\n", - "\n", - "#ReqMemCPU = 0 - 50 gigs\n", - "#AllocCPUS = 0 - 50 cores\n", - "#Elapsed = 0 - 150.02 hours\n", - "\n", - "# data set without normalization fitting for both the Elapsed/ReqMem and Elapsed/Alloc graphs\n", - "df_runtime_cluster = df_facet.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']]\n", - "df_runtime_cluster.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run if dataset will be 0-1 normalized for both Elapsed/ReqMem and Elapsed/Alloc graphs\n", - "\n", - "# 0-1 normalized dataset\n", - "# used for 0-1 normalization fitting for both the Elapsed/ReqMem and Elapsed/Alloc graphs \n", - "column_maxes_runtime = df_runtime_cluster.max()\n", - "df_runtime_cluster_max = column_maxes_runtime.max()\n", - "normalized_runtime_df = df_runtime_cluster / df_runtime_cluster_max\n", - "\n", - 
"print(normalized_runtime_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run if dataset will be log10 normalized for both Elapsed/ReqMem and Elapsed/Alloc graphs\n", - "\n", - "# log10 normalized dataset\n", - "# used for log10 normalization fitting for both the Elapsed/ReqMem and Elapsed/Alloc graphs \n", - "\n", - "log_runtime_df = np.log10(df_runtime_cluster+1)\n", - "log_runtime_df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# sets up info for plotting the optimal number of clusters - uses df_runtime_cluster datasaet\n", - "Sum_of_squared_distances = []\n", - "K = range(1,10)\n", - "for k in K:\n", - " km = KMeans(n_clusters=k)\n", - " km = km.fit(df_runtime_cluster)\n", - " Sum_of_squared_distances.append(km.inertia_)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# the bend in the graph is the optimal number of clusters for graphs using the df_runtime_cluster dataset\n", - "plt.plot(K, Sum_of_squared_distances, 'bx-')\n", - "plt.xlabel('k')\n", - "plt.ylabel('Sum_of_squared_distances')\n", - "plt.title('Elbow Method For Optimal k')\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Elapsed/ReqMemCPU clustering" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The next 5 cells create the clusters, find each cluster label, and create datasets of data in each cluster.\n", - "All the datasets are created for both the cluster graphs and plots of each cluster before those graphs are made." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# In the cell below, set the fit based on the normalization type by uncommenting the line to run" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# uncomment for no normalization\n", - "#elapsed_reqmem_fit = df_runtime_cluster\n", - "\n", - "# uncomment for 0-1 normalization\n", - "#elapsed_reqmem_fit = normalized_runtime_df\n", - "\n", - "# uncomment for log10 normalization\n", - "elapsed_reqmem_fit = log_runtime_df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# sets to clusters and returns the cluster points\n", - "kmeans_elapsed_reqmem = KMeans(n_clusters=3, random_state=111)\n", - "kmeans_elapsed_reqmem.fit(elapsed_reqmem_fit)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# In the cell below, choose which cluster center to use - uncomment the line that goes with the normalization type" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# uncomment if no normalization\n", - "#clusterpoints_elapsed_reqmem = kmeans_elapsed_reqmem.cluster_centers_\n", - "\n", - "# uncomment if 0-1 normalization\n", - "#clusterpoints_elapsed_reqmem = kmeans_elapsed_reqmem.cluster_centers_ * df_runtime_cluster_max\n", - "\n", - "# uncomment if log10 normalization\n", - "clusterpoints_elapsed_reqmem = 10 ** (kmeans_elapsed_reqmem.cluster_centers_) - 1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# returns array of labels for each cluster - used to find min and max x and y points for each cluster\n", - "\n", - "# 0 = purple cluster\n", - "# 1 = green cluster\n", - "# 2 = red cluster\n", - "np.unique(kmeans_elapsed_reqmem.labels_)" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the parameters in the labels shown above\n", - "\n", - "#Purple\n", - "df_elapsed_reqmem_0 = df_runtime_cluster[kmeans_elapsed_reqmem.labels_ == 0]\n", - "\n", - "#Green\n", - "df_elapsed_reqmem_1 = df_runtime_cluster[kmeans_elapsed_reqmem.labels_ == 1]\n", - "\n", - "#Red\n", - "df_elapsed_reqmem_2 = df_runtime_cluster[kmeans_elapsed_reqmem.labels_ == 2]\n", - "\n", - "#df_elapsed_reqmem_0.head(5)\n", - "#df_elapsed_reqmem_0.ReqMemCPU.count()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# voluntary\n", - "\n", - "# returns the min and max ReqMemCPU, Elapsed, and AllocCPUS for each cluster using the datasets created above. \n", - "# These are the parameters for the scatter plots of each cluster\n", - "print(\"Purple Cluster\")\n", - "print(\"ReqMemCPU:\", \"min =\",df_elapsed_reqmem_0.ReqMemCPU.min(),\" \",\"max =\",df_elapsed_reqmem_0.ReqMemCPU.max())\n", - "print(\"Elapsed:\", \"min =\",df_elapsed_reqmem_0.Elapsed.min(),\" \",\"max =\",df_elapsed_reqmem_0.Elapsed.max())\n", - "print(\"AllocCPUS:\", \"min =\",df_elapsed_reqmem_0.AllocCPUS.min(),\" \",\"max =\",df_elapsed_reqmem_0.AllocCPUS.max())\n", - "\n", - "print(\"\\nGreen Cluster\")\n", - "print(\"ReqMemCPU:\", \"min =\",df_elapsed_reqmem_1.ReqMemCPU.min(),\" \",\"max =\",df_elapsed_reqmem_1.ReqMemCPU.max())\n", - "print(\"Elapsed:\", \"min =\",df_elapsed_reqmem_1.Elapsed.min(),\" \",\"max =\",df_elapsed_reqmem_1.Elapsed.max())\n", - "print(\"AllocCPUS:\", \"min =\",df_elapsed_reqmem_1.AllocCPUS.min(),\" \",\"max =\",df_elapsed_reqmem_1.AllocCPUS.max())\n", - "\n", - "print(\"\\nRed Cluster\")\n", - "print(\"ReqMemCPU:\", \"min =\",df_elapsed_reqmem_2.ReqMemCPU.min(),\" \",\"max =\",df_elapsed_reqmem_2.ReqMemCPU.max())\n", - "print(\"Elapsed:\", \"min 
=\",df_elapsed_reqmem_2.Elapsed.min(),\" \",\"max =\",df_elapsed_reqmem_2.Elapsed.max())\n", - "print(\"AllocCPUS:\", \"min =\",df_elapsed_reqmem_2.AllocCPUS.min(),\" \",\"max =\",df_elapsed_reqmem_2.AllocCPUS.max())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# Creates datasets used to make the swarmplots that correspond to each cluster scatter plot. \n", - "# The groupby does not change the data, but it does make a small enough dataset to keep from having a \n", - "#runtime error, as will happen if a swarmplot is made using the scatter plot datasets.\n", - "\n", - "# for purple cluster \n", - "df_elapsed_reqmem_swarmplot0 = df_elapsed_reqmem_0.groupby(['ReqMemCPU','Elapsed']).sum().reset_index()\n", - "\n", - "# for green cluster\n", - "df_elapsed_reqmem_swarmplot1 = df_elapsed_reqmem_1.groupby(['ReqMemCPU','Elapsed']).sum().reset_index()\n", - "\n", - "# for red cluster\n", - "df_elapsed_reqmem_swarmplot2 = df_elapsed_reqmem_2.groupby(['ReqMemCPU','Elapsed']).sum().reset_index()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# scatterplot of Runtime per Requested gigs of RAM using df_runtime_cluster dataset with clustering\n", - "figure = plt.figure(figsize=(14, 8))\n", - "figure.suptitle('Runtime per Requested gigs of RAM %i gigs or less'%UpperlimitGB)\n", - "\n", - "elapsed_rqmem_clustergraph = figure.add_subplot(121)\n", - "elapsed_rqmem_clustergraph.scatter(df_runtime_cluster['ReqMemCPU'],df_runtime_cluster['Elapsed'], c=kmeans_elapsed_reqmem.labels_, cmap='rainbow')\n", - "elapsed_rqmem_clustergraph.scatter(clusterpoints_elapsed_reqmem[:,0] ,clusterpoints_elapsed_reqmem[:,1], color='black')\n", - "plt.xlabel('ReqMemCPU(gigs)')\n", - "plt.ylabel('Elapsed(hours)')\n", - "\n", - "# 3d view of the scatterplot; cluster centers are columns 0=ReqMemCPU, 1=Elapsed, 2=AllocCPUS\n", - 
"elapsed_rqmem_clustergraph_3d = figure.add_subplot(122, projection='3d')\n", - "elapsed_rqmem_clustergraph_3d.scatter(df_runtime_cluster['ReqMemCPU'], df_runtime_cluster['Elapsed'], df_runtime_cluster['AllocCPUS'], \n", - " c=kmeans_elapsed_reqmem.labels_ ,cmap='rainbow')\n", - "elapsed_rqmem_clustergraph_3d.scatter(clusterpoints_elapsed_reqmem[:,0] ,clusterpoints_elapsed_reqmem[:,1], clusterpoints_elapsed_reqmem[:,2], color='black')\n", - "\n", - "\n", - "elapsed_rqmem_clustergraph_3d.set_xlabel('ReqMemCPU(gigs)')\n", - "elapsed_rqmem_clustergraph_3d.set_ylabel('Elapsed(hours)')\n", - "elapsed_rqmem_clustergraph_3d.set_zlabel('AllocCPUS')\n", - "\n", - "# sets size and color for gridlines by axis\n", - "elapsed_rqmem_clustergraph_3d.xaxis._axinfo[\"grid\"].update({\"linewidth\":.5, \"color\" : \"black\"})\n", - "elapsed_rqmem_clustergraph_3d.yaxis._axinfo[\"grid\"].update({\"linewidth\":.5, \"color\" : \"black\"})\n", - "elapsed_rqmem_clustergraph_3d.zaxis._axinfo[\"grid\"].update({\"linewidth\":.5, \"color\" : \"black\"})\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This graph is a facet grid that shows scatterplots by cluster color on the left and its corresponding swarmplot on the right. The swarmplots give a better understanding of the distribution of jobs matching a specific datapoint." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# sets the figure and size that each subplot is added to - each graph is a subplot\n", - "figure = plt.figure( figsize=(16, 16))\n", - "\n", - "#purple cluster and swarmplot\n", - "elapsed_reqmem_clustergraph_0 = figure.add_subplot(423)\n", - "elapsed_reqmem_clustergraph_0.scatter(df_elapsed_reqmem_0['ReqMemCPU'],df_elapsed_reqmem_0['Elapsed'], color = \"blueviolet\")\n", - "plt.xlabel('ReqMemCPU(gigs)')\n", - "plt.ylabel('Elapsed(hours)')\n", - "\n", - "figure.add_subplot(424)\n", - "elapsed_reqmem_swarmgraph_0 = sns.swarmplot(data=df_elapsed_reqmem_swarmplot0, x='ReqMemCPU', y='Elapsed')\n", - "plt.yticks(np.arange(df_elapsed_reqmem_0.Elapsed.min(), df_elapsed_reqmem_0.Elapsed.max(), 5))\n", - "plt.margins(0.02)\n", - "plt.xlabel('ReqMemCPU(gigs)')\n", - "plt.ylabel('Elapsed(hours)')\n", - "\n", - "\n", - "#green cluster and swarmplot\n", - "elapsed_reqmem_clustergraph_1 = figure.add_subplot(425)\n", - "elapsed_reqmem_clustergraph_1.scatter(df_elapsed_reqmem_1['ReqMemCPU'],df_elapsed_reqmem_1['Elapsed'], color = \"aquamarine\")\n", - "plt.xlabel('ReqMemCPU(gigs)')\n", - "plt.ylabel('Elapsed(hours)')\n", - "\n", - "figure.add_subplot(426)\n", - "elapsed_reqmem_swarmgraph_1 = sns.swarmplot(data=df_elapsed_reqmem_swarmplot1, x='ReqMemCPU', y='Elapsed')\n", - "plt.yticks(np.arange(df_elapsed_reqmem_1.Elapsed.min(), df_elapsed_reqmem_1.Elapsed.max(), 5))\n", - "plt.margins(0.02)\n", - "plt.xlabel('ReqMemCPU(gigs)')\n", - "plt.ylabel('Elapsed(hours)')\n", - "\n", - "\n", - "#red cluster and swarmplot\n", - "elapsed_reqmem_clustergraph_2 = figure.add_subplot(427)\n", - "elapsed_reqmem_clustergraph_2.scatter(df_elapsed_reqmem_2['ReqMemCPU'],df_elapsed_reqmem_2['Elapsed'], color = \"red\")\n", - "plt.xlabel('ReqMemCPU(gigs)')\n", - "plt.ylabel('Elapsed(hours)')\n", - "\n", - "figure.add_subplot(428)\n", - "elapsed_reqmem_swarmgraph_2 
= sns.swarmplot(data=df_elapsed_reqmem_swarmplot2, x='ReqMemCPU', y='Elapsed')\n", - "plt.yticks(np.arange(df_elapsed_reqmem_2.Elapsed.min(), df_elapsed_reqmem_2.Elapsed.max(), 10))\n", - "plt.margins(0.02)\n", - "plt.xlabel('ReqMemCPU(gigs)')\n", - "plt.ylabel('Elapsed(hours)')\n", - "\n", - "\n", - "# sets the spacing\n", - "# top = space between title and graphs - increase number to bring title down and decrease to bring title up\n", - "# left = space to the left\n", - "# wspace = padding on both sides of graphs\n", - "# hspace = padding on top and bottom of graphs\n", - "figure.subplots_adjust(left=0.2, wspace=0.2, top=1.2, hspace=0.3)\n", - "\n", - "figure.suptitle('Clusters from Runtime per Requested gigs of RAM %i gigs or less'%UpperlimitGB, fontsize=20)\n", - "\n", - "plt.show()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Elapsed/AllocCPUS clustering" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The next 5 cells create the clusters, find each cluster label, and create datasets of data in each cluster.\n", - "All the datasets are created for both the cluster graphs and plots of each cluster before those graphs are made." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# In the cell below, set the fit based on the normalization type by uncommenting the line to run" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# uncomment for no normalization\n", - "#elapsed_alloc_fit = df_runtime_cluster\n", - "\n", - "# uncomment for 0-1 normalization\n", - "#elapsed_alloc_fit = normalized_runtime_df\n", - "\n", - "# uncomment for log10 normalization\n", - "elapsed_alloc_fit = log_runtime_df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# sets to clusters and returns the cluster points\n", - "kmeans_elapsed_alloc = KMeans(n_clusters=3, random_state=111)\n", - "kmeans_elapsed_alloc.fit(elapsed_alloc_fit)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# In the cell below, choose which cluster center to use - uncomment the line that goes with the normalization type" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# uncomment if no normalization\n", - "#clusterpoints_elapsed_alloc = kmeans_elapsed_alloc.cluster_centers_\n", - "\n", - "# uncomment if 0-1 normalization\n", - "#clusterpoints_elapsed_alloc = kmeans_elapsed_alloc.cluster_centers_ * df_runtime_cluster_max\n", - "\n", - "# uncomment if log10 normalization (inverse of log10(x+1), taken from the alloc model fitted above)\n", - "clusterpoints_elapsed_alloc = 10 ** (kmeans_elapsed_alloc.cluster_centers_) - 1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# returns array of labels for each cluster - used to find min and max x and y points for each cluster\n", - "\n", - "# 0 = purple cluster\n", - "# 1 = green cluster\n", - "# 2 = red cluster\n", - 
"np.unique(kmeans_elapsed_alloc.labels_)" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the parameters in the labels shown above\n", - "\n", - "#Purple\n", - "df_elapsed_alloc_0 = df_runtime_cluster[kmeans_elapsed_alloc.labels_ == 0]\n", - "\n", - "#Green\n", - "df_elapsed_alloc_1 = df_runtime_cluster[kmeans_elapsed_alloc.labels_ == 1]\n", - "\n", - "#Red\n", - "df_elapsed_alloc_2 = df_runtime_cluster[kmeans_elapsed_alloc.labels_ == 2]\n", - "\n", - "#df_elapsed_alloc_0.head(5)\n", - "#df_elapsed_alloc_0.AllocCPUS.count()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# voluntary\n", - "\n", - "# returns the min and max ReqMemCPU, Elapsed, and AllocCPUS for each cluster using the datasets created above. \n", - "# These are the parameters for the scatter plots of each cluster\n", - "print(\"Purple Cluster\")\n", - "print(\"ReqMemCPU:\", \"min =\",df_elapsed_alloc_0.ReqMemCPU.min(),\" \",\"max =\",df_elapsed_alloc_0.ReqMemCPU.max())\n", - "print(\"Elapsed:\", \"min =\",df_elapsed_alloc_0.Elapsed.min(),\" \",\"max =\",df_elapsed_alloc_0.Elapsed.max())\n", - "print(\"AllocCPUS:\", \"min =\",df_elapsed_alloc_0.AllocCPUS.min(),\" \",\"max =\",df_elapsed_alloc_0.AllocCPUS.max())\n", - "\n", - "print(\"\\nGreen Cluster\")\n", - "print(\"ReqMemCPU:\", \"min =\",df_elapsed_alloc_1.ReqMemCPU.min(),\" \",\"max =\",df_elapsed_alloc_1.ReqMemCPU.max())\n", - "print(\"Elapsed:\", \"min =\",df_elapsed_alloc_1.Elapsed.min(),\" \",\"max =\",df_elapsed_alloc_1.Elapsed.max())\n", - "print(\"AllocCPUS:\", \"min =\",df_elapsed_alloc_1.AllocCPUS.min(),\" \",\"max =\",df_elapsed_alloc_1.AllocCPUS.max())\n", - "\n", - "print(\"\\nRed Cluster\")\n", - "print(\"ReqMemCPU:\", \"min =\",df_elapsed_alloc_2.ReqMemCPU.min(),\" \",\"max =\",df_elapsed_alloc_2.ReqMemCPU.max())\n", - "print(\"Elapsed:\", \"min =\",df_elapsed_alloc_2.Elapsed.min(),\" \",\"max 
=\",df_elapsed_alloc_2.Elapsed.max())\n", - "print(\"AllocCPUS:\", \"min =\",df_elapsed_alloc_2.AllocCPUS.min(),\" \",\"max =\",df_elapsed_alloc_2.AllocCPUS.max())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# Creates datasets used to make the swarmplots that correspond to each cluster scatter plot. \n", - "# The groupby does not change the data, but it does make a small enough dataset to keep from having a \n", - "#runtime error, as will happen if a swarmplot is made using the scatter plot datasets.\n", - "\n", - "# for purple cluster \n", - "df_elapsed_alloc_swarmplot0 = df_elapsed_alloc_0.groupby(['AllocCPUS','Elapsed']).sum().reset_index()\n", - "\n", - "# for green cluster \n", - "df_elapsed_alloc_swarmplot1 = df_elapsed_alloc_1.groupby(['AllocCPUS','Elapsed']).sum().reset_index()\n", - "\n", - "# for red cluster \n", - "df_elapsed_alloc_swarmplot2 = df_elapsed_alloc_2.groupby(['AllocCPUS','Elapsed']).sum().reset_index()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# scatterplot of Runtime per Core using df_runtime_cluster dataset with clustering\n", - "figure = plt.figure(figsize=(14, 8))\n", - "figure.suptitle('Runtime per Core %i cores or less'%UpperlimitAllocCPU)\n", - "\n", - "elapsed_alloc_clustergraph = figure.add_subplot(121)\n", - "elapsed_alloc_clustergraph.scatter(df_runtime_cluster['AllocCPUS'],df_runtime_cluster['Elapsed'], c=kmeans_elapsed_alloc.labels_, cmap='rainbow')\n", - "elapsed_alloc_clustergraph.scatter(clusterpoints_elapsed_alloc[:,2] ,clusterpoints_elapsed_alloc[:,1], color='black')\n", - "plt.xlabel('AllocCPUS')\n", - "plt.ylabel('Elapsed(hours)')\n", - "\n", - "# 3d view of the scatterplot; cluster centers are columns 0=ReqMemCPU, 1=Elapsed, 2=AllocCPUS\n", - "elapsed_alloc_clustergraph_3d = figure.add_subplot(122, projection='3d')\n", - 
"elapsed_alloc_clustergraph_3d.scatter(df_runtime_cluster['AllocCPUS'], df_runtime_cluster['Elapsed'], df_runtime_cluster['ReqMemCPU'], c=kmeans_elapsed_alloc.labels_ ,cmap='rainbow')\n", - "elapsed_alloc_clustergraph_3d.scatter(clusterpoints_elapsed_alloc[:,2] ,clusterpoints_elapsed_alloc[:,1], clusterpoints_elapsed_alloc[:,0], color='black')\n", - "elapsed_alloc_clustergraph_3d.set_xlabel('AllocCPUS')\n", - "elapsed_alloc_clustergraph_3d.set_ylabel('Elapsed(hours)')\n", - "elapsed_alloc_clustergraph_3d.set_zlabel('ReqMemCPU(gigs)')\n", - "\n", - "# sets size and color for gridlines by axis\n", - "elapsed_alloc_clustergraph_3d.xaxis._axinfo[\"grid\"].update({\"linewidth\":.5, \"color\" : \"black\"})\n", - "elapsed_alloc_clustergraph_3d.yaxis._axinfo[\"grid\"].update({\"linewidth\":.5, \"color\" : \"black\"})\n", - "elapsed_alloc_clustergraph_3d.zaxis._axinfo[\"grid\"].update({\"linewidth\":.5, \"color\" : \"black\"})\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# sets the figure and size that each subplot is added to - each graph is a subplot\n", - "figure = plt.figure( figsize=(21, 16))\n", - "\n", - "#purple cluster and swarmplot\n", - "elapsed_alloc_clustergraph_0 = figure.add_subplot(321)\n", - "elapsed_alloc_clustergraph_0.scatter(df_elapsed_alloc_0['AllocCPUS'],df_elapsed_alloc_0['Elapsed'], color = \"blueviolet\")\n", - "plt.xlabel('AllocCPUS')\n", - "plt.ylabel('Elapsed(hours)')\n", - "\n", - "figure.add_subplot(322)\n", - "elapsed_alloc_swarmgraph_0 = sns.swarmplot(data=df_elapsed_alloc_swarmplot0, x='AllocCPUS', y='Elapsed')\n", - "plt.yticks(np.arange(df_elapsed_alloc_0.Elapsed.min(), df_elapsed_alloc_0.Elapsed.max(), 5))\n", - "plt.margins(0.02)\n", - "plt.xlabel('AllocCPUS')\n", - "plt.ylabel('Elapsed(hours)')\n", - "\n", - "\n", - "#green cluster and swarmplot\n", - "elapsed_alloc_clustergraph_1 = figure.add_subplot(323)\n", - 
"elapsed_alloc_clustergraph_1.scatter(df_elapsed_alloc_1['AllocCPUS'],df_elapsed_alloc_1['Elapsed'], color = \"aquamarine\")\n", - "plt.xlabel('AllocCPUS')\n", - "plt.ylabel('Elapsed(hours)')\n", - "\n", - "figure.add_subplot(324)\n", - "elapsed_alloc_swarmgraph_1 = sns.swarmplot(data=df_elapsed_alloc_swarmplot1, x='AllocCPUS', y='Elapsed')\n", - "plt.yticks(np.arange(df_elapsed_alloc_1.Elapsed.min(), df_elapsed_alloc_1.Elapsed.max(), 5))\n", - "plt.margins(0.02)\n", - "plt.xlabel('AllocCPUS')\n", - "plt.ylabel('Elapsed(hours)')\n", - "\n", - "\n", - "#red cluster and swarmplot\n", - "elapsed_alloc_clustergraph_2 = figure.add_subplot(325)\n", - "elapsed_alloc_clustergraph_2.scatter(df_elapsed_alloc_2['AllocCPUS'],df_elapsed_alloc_2['Elapsed'], color = \"red\")\n", - "plt.xlabel('AllocCPUS')\n", - "plt.ylabel('Elapsed(hours)')\n", - "\n", - "figure.add_subplot(326)\n", - "elapsed_alloc_swarmgraph_2 = sns.swarmplot(data=df_elapsed_alloc_swarmplot2, x='AllocCPUS', y='Elapsed')\n", - "plt.yticks(np.arange(df_elapsed_alloc_2.Elapsed.min(), df_elapsed_alloc_2.Elapsed.max(), 10))\n", - "plt.margins(0.02)\n", - "plt.xlabel('AllocCPUS')\n", - "plt.ylabel('Elapsed(hours)')\n", - "\n", - "# sets the spacing\n", - "# top = space between title and graphs - increase number to bring title down and decrease to bring title up\n", - "# left = space to the left\n", - "# wspace = padding on both sides of graphs\n", - "# hspace = padding on top and bottom of graphs\n", - "figure.subplots_adjust(left=0.2, wspace=0.2, top=.94, hspace=0.3)\n", - "figure.suptitle('Clusters from Runtime per Core %i cores or less'%UpperlimitAllocCPU, fontsize=20)\n", - "\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Detailed Look at Cores - In terms of Requested RAM" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# scatterplot of AllocCPUS/ReqMemCPU using 
df_facet dataset\n", - "style.default_axes_and_ticks()\n", - "style.figsize()\n", - "\n", - "elapsed_alloc_reqmem = plt.scatter(df_facet[\"ReqMemCPU\"], df_facet[\"AllocCPUS\"], color = \"blue\")\n", - "\n", - "plt.xlabel('ReqMemCPU(gigs)')\n", - "plt.ylabel('AllocCPUS')\n", - "plt.title('Number of Cores used by Requested RAM %i gigs or less'%UpperlimitGB)\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "df_alloc_cluster is a dataset made from df_facet. It is used to make the elbow graph and calculate the clustering for AllocCPUS/ReqMemCPU." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run if dataset will not be normalized\n", - "\n", - "#ReqMemCPU = 0 - 50 gigs\n", - "#AllocCPUS = 0 - 50 cores\n", - "#Elapsed = 0 - 150.02 hours\n", - "\n", - "# non normalized dataset\n", - "# used for fitting for the Alloc/ReqMem graph without normalization\n", - "df_alloc_cluster = df_facet.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']]\n", - "df_alloc_cluster.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run if dataset will be 0-1 normalized\n", - "\n", - "# 0-1 normalized dataset\n", - "# used for 0-1 normalization fitting for the Alloc/ReqMem graph\n", - "column_maxes_alloc = df_alloc_cluster.max()\n", - "df_alloc_cluster_max = column_maxes_alloc.max()\n", - "normalized_alloc_df = df_alloc_cluster / df_alloc_cluster_max\n", - "\n", - "print(normalized_alloc_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run if dataset will be log10 normalized for the Alloc/ReqMem graph\n", - "\n", - "# log10 normalized dataset\n", - "# used for log10 normalization fitting for the Alloc/ReqMem graph \n", - "\n", - "log_alloc_df = np.log10(df_alloc_cluster+1)\n", - 
"log_alloc_df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# sets up info for plotting the optimal number of clusters - uses df_alloc_cluster datasaet\n", - "Sum_of_squared_distances = []\n", - "K = range(1,10)\n", - "for k in K:\n", - " km = KMeans(n_clusters=k)\n", - " km = km.fit(df_alloc_cluster)\n", - " Sum_of_squared_distances.append(km.inertia_)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# the bend in the graph is the optimal number of clusters for graphs using the df_alloc_cluster dataset\n", - "plt.plot(K, Sum_of_squared_distances, 'bx-')\n", - "plt.xlabel('k')\n", - "plt.ylabel('Sum_of_squared_distances')\n", - "plt.title('Elbow Method For Optimal k')\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# In the cell below, set the fit based on the normalization type by uncommenting the line to run" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# uncomment for no normalization\n", - "#alloc_reqmem_fit = df_alloc_cluster\n", - "\n", - "# uncomment for 0-1 normalization\n", - "#alloc_reqmem_fit = normalized_alloc_df\n", - "\n", - "# uncomment for log10 normalization\n", - "alloc_reqmem_fit = log_alloc_df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# sets to clusters and returns the cluster points\n", - "kmeans_alloc_reqmem = KMeans(n_clusters=3, random_state=111)\n", - "kmeans_alloc_reqmem.fit(alloc_reqmem_fit)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# In the cell below, choose which cluster center to use - uncomment the line that goes with the normalization type" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# uncomment if no normalization\n", - "#clusterpoints_alloc_reqmem = kmeans_alloc_reqmem.cluster_centers_\n", - "\n", - "# uncomment if 0-1 normalization\n", - "#clusterpoints_alloc_reqmem = kmeans_alloc_reqmem.cluster_centers_ * df_alloc_cluster_max\n", - "\n", - "# uncomment if log10 normalization\n", - "clusterpoints_alloc_reqmem = (10 ** (kmeans_alloc_reqmem.cluster_centers_)) - 1\n", - "print(clusterpoints_alloc_reqmem)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "clusterpoints_alloc_reqmem[:,0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "clusterpoints_alloc_reqmem[:,2]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The next 5 cells find each cluster label, and create datasets of data in each cluster.\n", - "All the datasets are created for both the cluster graphs and plots of each cluster before those graphs are made." 
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# must run\n",
- "\n",
- "# returns array of labels for each cluster - used to find min and max x and y points for each cluster\n",
- "\n",
- "# 0 = purple cluster\n",
- "# 1 = green cluster\n",
- "# 2 = red cluster\n",
- "np.unique(kmeans_alloc_reqmem.labels_)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# must run\n",
- "\n",
- "# splits df_alloc_cluster into one dataset per KMeans label shown above\n",
- "# the rows come from df_alloc_cluster, so ReqMemCPU, Elapsed, and AllocCPUS keep their non normalized values\n",
- "\n",
- "#Purple\n",
- "df_alloc_reqmem_0 = df_alloc_cluster[kmeans_alloc_reqmem.labels_ == 0]\n",
- "\n",
- "#Green\n",
- "df_alloc_reqmem_1 = df_alloc_cluster[kmeans_alloc_reqmem.labels_ == 1]\n",
- "\n",
- "#Red\n",
- "df_alloc_reqmem_2 = df_alloc_cluster[kmeans_alloc_reqmem.labels_ == 2]\n",
- "\n",
- "# optional quick checks of a cluster dataset - uncomment to run\n",
- "#df_alloc_reqmem_0.head(5)\n",
- "#df_alloc_reqmem_0.AllocCPUS.count()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# voluntary\n",
- "\n",
- "# returns the min and max ReqMemCPU, Elapsed, and AllocCPUS for each cluster using the datasets created above. 
\n",
- "# These are the parameters for the scatter plots of each cluster\n",
- "\n",
- "# heading text paired with the dataset of that cluster, in print order\n",
- "cluster_frames = [\n",
- "    (\"Purple Cluster\", df_alloc_reqmem_0),\n",
- "    (\"\\nGreen Cluster\", df_alloc_reqmem_1),\n",
- "    (\"\\nRed Cluster\", df_alloc_reqmem_2),\n",
- "]\n",
- "\n",
- "# one heading per cluster, followed by the min and max of each clustered column\n",
- "for cluster_heading, cluster_df in cluster_frames:\n",
- "    print(cluster_heading)\n",
- "    for column_name in (\"ReqMemCPU\", \"Elapsed\", \"AllocCPUS\"):\n",
- "        column_values = cluster_df[column_name]\n",
- "        print(column_name + \":\", \"min =\", column_values.min(), \" \", \"max =\", column_values.max())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# must run\n",
- "\n",
- "# Creates datasets used to make the swarmplots that correspond to each cluster scatter plot. 
\n",
- "# The groupby collapses rows that share the same (AllocCPUS, ReqMemCPU) pair - Elapsed is summed and is not\n",
- "# used by the swarmplots. This makes a small enough dataset to keep from having a\n",
- "# runtime error, as will happen if a swarmplot is made using the scatter plot datasets.\n",
- "\n",
- "# for purple cluster \n",
- "df_alloc_reqmem_swarmplot0 = df_alloc_reqmem_0.groupby(['AllocCPUS','ReqMemCPU']).sum().reset_index()\n",
- "\n",
- "# for green cluster \n",
- "df_alloc_reqmem_swarmplot1 = df_alloc_reqmem_1.groupby(['AllocCPUS','ReqMemCPU']).sum().reset_index()\n",
- "\n",
- "# for red cluster \n",
- "df_alloc_reqmem_swarmplot2 = df_alloc_reqmem_2.groupby(['AllocCPUS','ReqMemCPU']).sum().reset_index()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# must run\n",
- "\n",
- "# scatterplot of Core per Requested RAM using df_alloc_cluster dataset with clustering\n",
- "figure = plt.figure(figsize=(14, 8))\n",
- "figure.suptitle('Number of Cores used by Requested RAM %i gigs or less'%UpperlimitGB)\n",
- "\n",
- "# cluster center columns: 0 = ReqMemCPU, 1 = Elapsed, 2 = AllocCPUS\n",
- "alloc_reqmem_cluster_graph = figure.add_subplot(121)\n",
- "alloc_reqmem_cluster_graph.scatter(df_alloc_cluster['ReqMemCPU'],df_alloc_cluster['AllocCPUS'], c=kmeans_alloc_reqmem.labels_, cmap='rainbow')\n",
- "alloc_reqmem_cluster_graph.scatter(clusterpoints_alloc_reqmem[:,0] ,clusterpoints_alloc_reqmem[:,2], color='black')\n",
- "plt.xlabel('ReqMemCPU(gigs)')\n",
- "plt.ylabel('AllocCPUS')\n",
- "\n",
- "# 3d view of the scatterplot for better understanding of the data\n",
- "alloc_reqmem_clustergraph_3d = figure.add_subplot(122, projection='3d')\n",
- "alloc_reqmem_clustergraph_3d.scatter(df_alloc_cluster['ReqMemCPU'], df_alloc_cluster['AllocCPUS'], df_alloc_cluster['Elapsed'], c=kmeans_alloc_reqmem.labels_ ,cmap='rainbow')\n",
- "# the centers need their Elapsed value as the z coordinate, otherwise they are all drawn at z = 0\n",
- "alloc_reqmem_clustergraph_3d.scatter(clusterpoints_alloc_reqmem[:,0] ,clusterpoints_alloc_reqmem[:,2], clusterpoints_alloc_reqmem[:,1], color='black')\n",
- "alloc_reqmem_clustergraph_3d.set_xlabel('ReqMemCPU(gigs)')\n",
- "alloc_reqmem_clustergraph_3d.set_ylabel('AllocCPUS')\n",
- "alloc_reqmem_clustergraph_3d.set_zlabel('Elapsed(hours)')\n",
- "\n",
- "# sets size and color for gridlines by axis\n",
- "alloc_reqmem_clustergraph_3d.xaxis._axinfo[\"grid\"].update({\"linewidth\":.5, \"color\" : \"black\"})\n",
- "alloc_reqmem_clustergraph_3d.yaxis._axinfo[\"grid\"].update({\"linewidth\":.5, \"color\" : \"black\"})\n",
- "alloc_reqmem_clustergraph_3d.zaxis._axinfo[\"grid\"].update({\"linewidth\":.5, \"color\" : \"black\"})\n",
- "\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# must run\n",
- "\n",
- "# sets the figure and size that each subplot is added to - each graph is a subplot\n",
- "# left column = scatter plot of one cluster, right column = swarmplot of the same cluster\n",
- "figure = plt.figure(figsize=(21, 16))\n",
- "\n",
- "\n",
- "#purple cluster and swarmplot\n",
- "alloc_reqmem_clustergraph_0 = figure.add_subplot(321)\n",
- "alloc_reqmem_clustergraph_0.scatter(df_alloc_reqmem_0['ReqMemCPU'],df_alloc_reqmem_0['AllocCPUS'], color = \"blueviolet\")\n",
- "plt.xlabel('ReqMemCPU(gigs)')\n",
- "plt.ylabel('AllocCPUS')\n",
- "\n",
- "figure.add_subplot(322)\n",
- "alloc_reqmem_swarmgraph_0 = sns.swarmplot(data=df_alloc_reqmem_swarmplot0, x='ReqMemCPU', y='AllocCPUS')\n",
- "plt.yticks(np.arange(0, df_alloc_reqmem_0.AllocCPUS.max(), 3))\n",
- "plt.margins(0.02)\n",
- "plt.xlabel('ReqMemCPU(gigs)')\n",
- "plt.ylabel('AllocCPUS')\n",
- "\n",
- "\n",
- "#green cluster and swarmplot\n",
- "alloc_reqmem_clustergraph_1 = figure.add_subplot(323)\n",
- "alloc_reqmem_clustergraph_1.scatter(df_alloc_reqmem_1['ReqMemCPU'],df_alloc_reqmem_1['AllocCPUS'], color = \"aquamarine\")\n",
- "plt.xlabel('ReqMemCPU(gigs)')\n",
- "plt.ylabel('AllocCPUS')\n",
- "\n",
- "figure.add_subplot(324)\n",
- "alloc_reqmem_swarmgraph_1 = sns.swarmplot(data=df_alloc_reqmem_swarmplot1, x='ReqMemCPU', y='AllocCPUS')\n",
- "plt.yticks(np.arange(df_alloc_reqmem_1.AllocCPUS.min(), df_alloc_reqmem_1.AllocCPUS.max(), 5))\n",
- "plt.margins(0.02)\n",
- "plt.xlabel('ReqMemCPU(gigs)')\n",
- "plt.ylabel('AllocCPUS')\n",
- "\n",
- "\n",
- "#red cluster and swarmplot\n",
- "# drawn through the axes handle like the purple and green panels, so the handle is not overwritten\n",
- "alloc_reqmem_clustergraph_2 = figure.add_subplot(325)\n",
- "alloc_reqmem_clustergraph_2.scatter(df_alloc_reqmem_2['ReqMemCPU'],df_alloc_reqmem_2['AllocCPUS'], color = \"red\")\n",
- "plt.xlabel('ReqMemCPU(gigs)')\n",
- "plt.ylabel('AllocCPUS')\n",
- "\n",
- "figure.add_subplot(326)\n",
- "alloc_reqmem_swarmgraph_2 = sns.swarmplot(data=df_alloc_reqmem_swarmplot2, x='ReqMemCPU', y='AllocCPUS')\n",
- "plt.yticks(np.arange(df_alloc_reqmem_2.AllocCPUS.min(), df_alloc_reqmem_2.AllocCPUS.max(), 5))\n",
- "plt.margins(0.02)\n",
- "plt.xlabel('ReqMemCPU(gigs)')\n",
- "plt.ylabel('AllocCPUS')\n",
- "\n",
- "# sets the spacing\n",
- "# top = space between title and graphs - increase number to bring title down and decrease to bring title up\n",
- "# left = space to the left\n",
- "# wspace = padding on both sides of graphs\n",
- "# hspace = padding on top and bottom of graphs\n",
- "figure.subplots_adjust(left=0.2, wspace=0.2, top=.94, hspace=0.3)\n",
- "figure.suptitle('Clusters from Number of Cores used by Requested RAM %i gigs or less'%UpperlimitGB, fontsize=20)\n",
- "\n",
- "\n",
- "\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "language_info": {
- "name": "python",
- "pygments_lexer": "ipython3"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}