diff --git a/Runtime-and-CoreCount.ipynb b/Runtime-and-CoreCount.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..097a6bbd217c85aa7aa22a6d879bcbd940d2b3d6 --- /dev/null +++ b/Runtime-and-CoreCount.ipynb @@ -0,0 +1,395 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Notebook Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "import sqlite3\n", + "import slurm2sql\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "import seaborn as sns\n", + "import seaborn as sb\n", + "import plotly.express as px\n", + "import matplotlib.ticker as ticker\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "from RC_styles import rc_styles as style" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "from sklearn.cluster import KMeans" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# opens a connection to the existing sqlite3 database of info from March 2020\n", + "db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# df is the starting dataframe (every row of the slurm table)\n", + "df = pd.read_sql('SELECT * FROM slurm', db)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# voluntary\n", + "\n", + "# for displaying all available column options\n", + "pd.set_option('display.max_columns', None)\n", + "df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source":
[ + "# must run\n", + "\n", + "# converts units in ReqMemCPU column from bytes to gigs\n", + "df['ReqMemCPU'] = df['ReqMemCPU'].div(1024**3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# converts Elapsed time to hours (from seconds)\n", + "df['Elapsed'] = df['Elapsed'].div(3600)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# df_completed is dataframe of all completed jobs\n", + "df_completed = df[df.State.str.contains('COMPLETED')]\n", + "#df_completed.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ReqMemCPU, Core Count, Runtime" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# sets min and max parameters for ReqMemCPU\n", + "UpperlimitGB = 50\n", + "LowerlimitGB = 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# sets min and max parameters for AllocCPUS\n", + "UpperlimitAllocCPU = 20\n", + "LowerlimitAllocCPU = 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# dataset of needed columns for all graphs below\n", + "df_1 = df_completed.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']]\n", + "df_1.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# rounds ReqMemCPU up to nearest whole number\n", + "df_1['ReqMemCPU'] = df_1['ReqMemCPU'].apply(np.ceil)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# rounds Elapsed to 2 decimal places (nearest, not up)\n", + "df_1['Elapsed'] =
df_1['Elapsed'].round(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# sorts dataset by AllocCPUS for easy visualization\n", + "df_1_sorted = df_1.sort_values(by='AllocCPUS', ascending=True)\n", + "df_1_sorted.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the min and max parameters created above\n", + "df_runtime = df_1_sorted[(df_1_sorted['ReqMemCPU'] <= UpperlimitGB) & (df_1_sorted['ReqMemCPU'] >= LowerlimitGB) & (df_1_sorted['AllocCPUS'] <= UpperlimitAllocCPU) & (df_1_sorted['AllocCPUS'] >= LowerlimitAllocCPU)]\n", + "df_runtime.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# creates a facet grid from df_runtime dataset\n", + "# Elapsed time in hours and ReqMemCPU in gigs\n", + "style.default_axes_and_ticks()\n", + "style.figsize()\n", + "\n", + "full_facet = sb.PairGrid(df_runtime)\n", + "full_facet.map(plt.scatter);\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "style.default_axes_and_ticks()\n", + "style.figsize()\n", + "\n", + "runtime_graph = sns.scatterplot(x=\"ReqMemCPU\", y=\"AllocCPUS\",data=df_runtime)\n", + "\n", + "plt.title('Number of Cores used by Requested RAM %i gigs or less'%UpperlimitGB)\n", + "\n", + "plt.xlabel('ReqMemCPU(gigs)')\n", + "plt.ylabel('AllocCPUS')\n", + "#plt.yscale(\"log\")\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the min and max parameters created above for clustering\n", + 
"df_runtime_cluster = df_1_sorted[(df_1_sorted['ReqMemCPU'] <= UpperlimitGB) & (df_1_sorted['ReqMemCPU'] >= LowerlimitGB) & (df_1_sorted['AllocCPUS'] <= UpperlimitAllocCPU) & (df_1_sorted['AllocCPUS'] >= LowerlimitAllocCPU)]\n", + "df_runtime_cluster.tail(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# sets up info for plotting the optimal number of clusters - uses df_runtime_cluster datasaet\n", + "Sum_of_squared_distances = []\n", + "K = range(1,10)\n", + "for k in K:\n", + " km = KMeans(n_clusters=k)\n", + " km = km.fit(df_runtime_cluster)\n", + " Sum_of_squared_distances.append(km.inertia_)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# the bend in the graph is the optimal number of clusters for graphs using the df_runtime_cluster dataset\n", + "plt.plot(K, Sum_of_squared_distances, 'bx-')\n", + "plt.xlabel('k')\n", + "plt.ylabel('Sum_of_squared_distances')\n", + "plt.title('Elbow Method For Optimal k')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# sets to clusters and returns the cluster points\n", + "kmeans = KMeans(n_clusters=3, random_state=111)\n", + "kmeans.fit(df_runtime_cluster)\n", + "print(kmeans.cluster_centers_)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# facet grid of the two graphs being clustered using df_runtime_cluster dataset\n", + "style.default_axes_and_ticks()\n", + "style.figsize()\n", + "\n", + "reqmem_alloc = sns.PairGrid(df_runtime_cluster, y_vars=[\"Elapsed\"], x_vars=[\"ReqMemCPU\", \"AllocCPUS\"], height=4)\n", + "reqmem_alloc.map(sns.regplot, color=\"blue\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# clustered graph\n", + "style.default_axes_and_ticks()\n", + "style.figsize()\n", + "\n", + "runtime_cluster_graph = plt.scatter(df_runtime_cluster['ReqMemCPU'],df_runtime_cluster['Elapsed'], c=kmeans.labels_, cmap='rainbow')\n", + "plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='black')\n", + "\n", + "plt.xlabel('ReqMemCPU(gigs)')\n", + "plt.ylabel('Elapsed(hours)')\n", + "plt.title('Runtime per Requested gigs of RAM %i gigs or less'%UpperlimitGB)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# must run\n", + "\n", + "# clustered graph\n", + "style.default_axes_and_ticks()\n", + "style.figsize()\n", + "\n", + "alloc_cluster_graph = plt.scatter(df_runtime_cluster['AllocCPUS'],df_runtime_cluster['Elapsed'], c=kmeans.labels_, cmap='rainbow')\n", + "plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='black')\n", + "\n", + "plt.xlabel('AllocCPUS')\n", + "plt.ylabel('Elapsed(hours)')\n", + "plt.title('Runtime per Core %i cores or less'%UpperlimitAllocCPU)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "language_info": { + "name": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}