# %% [markdown]
# # Cluster Analysis
#
# Loads the `slurm` table from a sqlite3 database, keeps the completed jobs, and runs
# k-means on three columns: `ReqMemCPU`, `Elapsed` and `AllocCPUS`. The clusters and
# their centroids are then plotted in 2D and 3D.
#
# Cells are delimited by `# %%` markers and are meant to be run top to bottom.

# %% [markdown]
# # Data Setup Options

# %%
# year-date-month
#start_date = '2020-10-09'

# %%
# must run

# sets min and max parameters for ReqMemCPU (gigs, after conversion below)
LowerlimitGB = 0
UpperlimitGB = 50

# sets min and max parameters for AllocCPUS
LowerlimitAllocCPU = 0
UpperlimitAllocCPU = 50

# sets min and max parameters for Elapsed (hours, after conversion below)
LowerlimitElapsed = 0
UpperlimitElapsed = 150.02

# %%
# Enter 'none', '0-1', or 'log' as a choice for data normalization.
# Any other value is treated as 'none'.
Data_Normalization_Choice = 'none'

# %%
# must run

# sqlite3 database file that holds the `slurm` table
DB_PATH = '/data/rc/rc-team/slurm-since-March.sqlite3'

# columns that are clustered; the order here is the column order of the k-means centers
CLUSTER_COLUMNS = ['ReqMemCPU', 'Elapsed', 'AllocCPUS']

# k-means settings
N_CLUSTERS = 3
RANDOM_STATE = 111

# %% [markdown]
# # Imports

# %%
# must run

import os
import sqlite3

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import seaborn as sb
import slurm2sql
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans

from RC_styles import rc_styles as style

# script form of the `%matplotlib inline` line magic
get_ipython().run_line_magic('matplotlib', 'inline')

# %% [markdown]
# # Database Creation

# %%
# must run

# connects to the sqlite3 database and reads the whole `slurm` table into df (a dataframe);
# the connection is closed once the table has been read, even if the read fails
db = sqlite3.connect(DB_PATH)
try:
    df = pd.read_sql('SELECT * FROM slurm', db)
finally:
    db.close()

# %%
# must run

# df_1 is dataframe of all completed jobs
# na=False keeps the mask strictly boolean when a row has no State
df_1 = df[df.State.str.contains('COMPLETED', na=False)]
#df_1.head(5)

# %%
# must run

# dataset of needed columns for all graphs below, built as a new frame (not a view of
# df_1) so the conversions do not raise SettingWithCopyWarning and the cell can be re-run:
#   - rows with a missing value in any clustered column are dropped (int conversion and
#     KMeans both fail on NaN)
#   - ReqMemCPU is converted from bytes to gigs and rounded up to the nearest whole number
#   - Elapsed is converted from seconds to hours and rounded to 2 decimal places
df_completed = (
    df_1.loc[:, CLUSTER_COLUMNS]
    .dropna()
    .assign(
        ReqMemCPU=lambda jobs: np.ceil(jobs['ReqMemCPU'].div(1024**3)).astype(int),
        Elapsed=lambda jobs: jobs['Elapsed'].div(3600).round(2),
    )
)
#df_completed.head()

# %%
# must run

# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS for completed jobs using the
# min and max parameters created above (both limits are inclusive)
df_clustering = df_completed[
    df_completed['ReqMemCPU'].between(LowerlimitGB, UpperlimitGB)
    & df_completed['AllocCPUS'].between(LowerlimitAllocCPU, UpperlimitAllocCPU)
    & df_completed['Elapsed'].between(LowerlimitElapsed, UpperlimitElapsed)
]

# KMeans needs at least as many rows as clusters; fail here with a readable message
assert len(df_clustering) >= N_CLUSTERS, (
    "only %i jobs are inside the configured limits, need at least %i"
    % (len(df_clustering), N_CLUSTERS)
)
df_clustering.head(5)

# %% [markdown]
# # Normalizing the Data

# %%
# must run

if Data_Normalization_Choice == '0-1':
    # Scale every column by its own maximum. Dividing all columns by one global maximum
    # is a uniform rescale, which leaves the k-means assignment unchanged.
    column_max = df_clustering.max()
    # a column whose maximum is 0 is all zeros here; divide it by 1 instead of 0
    df_clustering_max = column_max.replace(0, 1)
    fit = df_clustering / df_clustering_max
    print("0-1")

elif Data_Normalization_Choice == 'log':
    # +1 keeps zero values finite under the logarithm
    fit = np.log10(df_clustering + 1)
    print("log")

else:
    fit = df_clustering
    print("none")

# %% [markdown]
# # kmeans Clustering

# %%
# must run

# fits the clusters and prints the cluster centers (in normalized units)
kmeans_cluster = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_STATE)
kmeans_cluster.fit(fit)
print(kmeans_cluster.cluster_centers_)

# %% [markdown]
# # Reverting Cluster Points Back to Align with Unnormalized Data

# %%
# must run

if Data_Normalization_Choice == '0-1':
    # inverse of the per-column scaling above (broadcasts one maximum per column)
    clusterpoints = kmeans_cluster.cluster_centers_ * df_clustering_max.to_numpy()
    print("0-1")

elif Data_Normalization_Choice == 'log':
    clusterpoints = 10 ** (kmeans_cluster.cluster_centers_) - 1
    print("log")

else:
    clusterpoints = kmeans_cluster.cluster_centers_
    print("none")

# centers labelled by column name so the plots below do not rely on positional indices
centroids = pd.DataFrame(clusterpoints, columns=df_clustering.columns)
centroids

# %% [markdown]
# # Cluster Plots

# %%
# must run

# axis label shown for each clustered column
AXIS_LABELS = {
    'ReqMemCPU': 'ReqMemCPU(gigs)',
    'Elapsed': 'Elapsed(hours)',
    'AllocCPUS': 'AllocCPUS',
}


def plot_clusters_2d(ax, jobs, centers, labels, x_col, y_col):
    """Scatter two columns of `jobs` colored by cluster label, with centroids in black.

    ax      -- 2D matplotlib axes to draw on
    jobs    -- dataframe of the clustered jobs
    centers -- dataframe of cluster centers with the same column names as `jobs`
    labels  -- cluster label of every row in `jobs`
    x_col, y_col -- column names plotted on the x and y axis
    """
    ax.scatter(jobs[x_col], jobs[y_col], c=labels, cmap='rainbow')
    ax.scatter(centers[x_col], centers[y_col], color='black')
    ax.set_xlabel(AXIS_LABELS[x_col])
    ax.set_ylabel(AXIS_LABELS[y_col])


def plot_clusters_3d(ax, jobs, centers, labels, x_col, y_col, z_col, alpha=None):
    """Scatter three columns of `jobs` colored by cluster label, with centroids in black.

    ax      -- 3D matplotlib axes (created with projection='3d')
    jobs    -- dataframe of the clustered jobs
    centers -- dataframe of cluster centers with the same column names as `jobs`
    labels  -- cluster label of every row in `jobs`
    x_col, y_col, z_col -- column names plotted on the x, y and z axis
    alpha   -- transparency of the job points (None keeps matplotlib's default)
    """
    ax.scatter(jobs[x_col], jobs[y_col], jobs[z_col], c=labels, cmap='rainbow', alpha=alpha)
    # all three coordinates are passed; with only x and y the centroids are drawn at z=0
    ax.scatter(centers[x_col], centers[y_col], centers[z_col], color='black')
    ax.set_xlabel(AXIS_LABELS[x_col])
    ax.set_ylabel(AXIS_LABELS[y_col])
    ax.set_zlabel(AXIS_LABELS[z_col])

    # sets size and color for gridlines by axis
    # NOTE: _axinfo is a private matplotlib attribute and may change between versions
    for axis in (ax.xaxis, ax.yaxis, ax.zaxis):
        axis._axinfo["grid"].update({"linewidth": .5, "color": "black"})


# %%
# must run

# (x, y) columns of the three 2D panels in the first row
PANELS_2D = [
    ('ReqMemCPU', 'Elapsed'),
    ('AllocCPUS', 'Elapsed'),
    ('ReqMemCPU', 'AllocCPUS'),
]

# (x, y, z) columns of the three 3D panels; drawn once opaque and once translucent
PANELS_3D = [
    ('ReqMemCPU', 'AllocCPUS', 'Elapsed'),
    ('AllocCPUS', 'ReqMemCPU', 'Elapsed'),
    ('ReqMemCPU', 'Elapsed', 'AllocCPUS'),
]

figure = plt.figure()
figure.set_size_inches(20, 20)

# row 1: 2D graphs (subplots 1-3)
for position, (x_col, y_col) in enumerate(PANELS_2D, start=1):
    panel = figure.add_subplot(3, 3, position)
    plot_clusters_2d(panel, df_clustering, centroids, kmeans_cluster.labels_, x_col, y_col)

# row 2: 3D graphs (subplots 4-6); row 3: same graphs with translucent points (subplots 7-9)
for first_position, point_alpha in ((4, None), (7, .08)):
    for position, (x_col, y_col, z_col) in enumerate(PANELS_3D, start=first_position):
        panel = figure.add_subplot(3, 3, position, projection='3d')
        plot_clusters_3d(panel, df_clustering, centroids, kmeans_cluster.labels_,
                         x_col, y_col, z_col, alpha=point_alpha)

# sets the spacing
# top = space between title and graphs - increase number to bring title down and decrease to bring title up
# left = space to the left
# wspace = padding on both sides of graphs
# hspace = padding on top and bottom of graphs
figure.subplots_adjust(left=0.0, wspace=0.2, top=.92, hspace=0.3)
figure.suptitle('Clusters', fontsize=20)

plt.show()