diff --git a/Jobs-and-Users-ReqMemCPU.ipynb b/Jobs-and-Users-ReqMemCPU.ipynb deleted file mode 100644 index 0d23eb1d7e0cb1feca5a5295a860f846259d2f56..0000000000000000000000000000000000000000 --- a/Jobs-and-Users-ReqMemCPU.ipynb +++ /dev/null @@ -1,615 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Notebook Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "import sqlite3\n", - "import slurm2sql\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "import seaborn as sns\n", - "import plotly.express as px\n", - "import matplotlib.ticker as ticker\n", - "import numpy as np" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from RC_styles import rc_styles as style" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.cluster import KMeans" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# creates database of info from March 2020 using sqlite 3\n", - "db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# df is starting database\n", - "df = pd.read_sql('SELECT * FROM slurm', db)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# voluntary\n", - "\n", - "# for displaying all available column options\n", - "pd.set_option('display.max_columns', None)\n", - "df.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# converts units in ReqMemCPU column from bytes to gigs\n", - "df['ReqMemCPU'] = df['ReqMemCPU'].div(1024**3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# df_completed is dataframe of all completed jobs\n", - "df_completed = df[df.State.str.contains('COMPLETED')]\n", - "#df_completed.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# df_batch is df with only batch jobs\n", - "df_batch = df[df.JobName.str.contains('batch')]\n", - "#df_batch.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Average RAM per CPU Requested by User" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# df_2 is database of completed jobs with only User and ReqMemCpu\n", - "# it is used for the user dataframes\n", - "\n", - "df_2 = df_completed.loc[:,['User','ReqMemCPU']]\n", - "#df_2.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_2['ReqMemCPU'] = df_2['ReqMemCPU'].apply(np.ceil)\n", - "#df_2.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# fills empty strings in User column with NaN and then filters them out to give a dataset of users with no empty strings \n", - "\n", - "nan_value = float(\"NaN\")\n", - "\n", - "df_2.replace(\"\", nan_value, inplace=True)\n", - "\n", - "df_2.dropna(subset = [\"User\"], inplace=True)\n", - "#df_2.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# count = count of jobs per user\n", - "# mean,std,min,25%,50%,75%, and max refers to the gigs of memory per cpu requested by that user for all their jobs\n", - "df_user = df_2.groupby('User')['ReqMemCPU'].describe().reset_index()\n", - "#df_user.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# voluntary\n", - "\n", - "# description of number of jobs run per user - can be used to choose the Upper Limit Job Count\n", - "df_user['count'].describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# variable for to be used in names of plots to describe the max job count per user\n", - "\n", - "# max = 367257\n", - "UpperlimitJobCount = 100" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# creates database from df_user that returns all jobs per user up to the UpperlimitJobCount defined above\n", - "jobscount_cutoff = df_user[(df_user['count'] <= UpperlimitJobCount)]\n", - "#jobscount_cutoff.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# df_user_graph is df_user sorted in ascending order by count for easy readibility of graph\n", - "df_user_graph_full = jobscount_cutoff.sort_values(by='count', ascending=True)\n", - "df_user_graph_full.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_user_graph = df_user_graph_full.loc[:,['User','count','mean']]\n", - "df_user_graph.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "style.default_axes_and_ticks()\n", - "style.figsize()\n", - "\n", - "user_graph1 = sns.scatterplot(x=\"count\", y=\"mean\",data=df_user_graph)\n", - "\n", - "plt.title('Average Requested RAM per CPU by User for all Users Running %i Jobs or less'%UpperlimitJobCount)\n", - "\n", - "plt.xlabel('Job Count Per User')\n", - "plt.ylabel('Average Requested RAM per CPU (Gigs)')\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "kmeans = KMeans(n_clusters=3)\n", - "model = kmeans.fit(df_user_graph[['count', 'mean']])\n", - "# Now, we can get the predicted model labels, or Centroids, in the form of an array:\n", - "model.cluster_centers_" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# attach predicted cluster to original points\n", - "df_user_graph['predicted'] = model.labels_\n", - "df_user_graph.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create a dataframe for cluster_centers (centroids)\n", - "centroids = pd.DataFrame(model.cluster_centers_, columns=[\"count\", \"mean\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "style.default_axes_and_ticks()\n", - "style.figsize()\n", - "\n", - "## Plot scatter by cluster / color, and centroids\n", - "colors = [\"red\", \"green\", \"blue\"]\n", - "df_user_graph['color'] = df_user_graph['predicted'].map(lambda p: colors[p])\n", - "ax = df_user_graph.plot( \n", - " kind=\"scatter\", \n", - " x=\"count\", y=\"mean\",\n", - " c = df_user_graph['color']\n", - ")\n", - "centroids.plot(\n", - " kind=\"scatter\", \n", - " x=\"count\", y=\"mean\", \n", - " marker=\"*\", c=[\"r\", \"g\", \"b\"], s=550,\n", - " ax=ax\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# trying the same above graph using diffrerent syntax" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_user_graph_cluster = df_user_graph_full.loc[:,['count','mean']]\n", - "#df_user_graph_cluster.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "kmeans = KMeans(n_clusters=3, random_state=111)\n", - "kmeans.fit(df_user_graph_cluster)\n", - "print(kmeans.cluster_centers_)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plt.scatter(df_user_graph_cluster['count'],df_user_graph_cluster['mean'], c=kmeans.labels_, cmap='rainbow')\n", - "plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='grey')\n", - "#plt.yscale(\"log\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Average RAM per CPU by Job" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# df_3 is database with only JobStep, User, JobName, ReqMemCpu, ArrayJob, and ArrayTaskID\n", - "# it is used to pull out needed information and create separate datasets to compare\n", - "df_3 = df_batch.loc[:,['ReqMemCPU','JobID']]\n", - "#df_3.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_3['ReqMemCPU'] = df_3['ReqMemCPU'].apply(np.ceil)\n", - "#df_3.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# variable for to be used in names of plots to describe the max gigs measured\n", - "UpperlimitGB = 50" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# creates database from df_3 that returns all RAM per CPU requested up to the UpperRAMlimit defined above\n", - "gig_cutoff = df_3[(df_3.ReqMemCPU <= UpperlimitGB)]\n", - "#gig_cutoff.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# renames JobID column to JobCount since that's what it is now\n", - "df_cpu_per_job = gig_cutoff.groupby('ReqMemCPU').count().rename(columns={'JobID': 'JobCount'}).reset_index()\n", - "#df_cpu_per_job.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_cpu_per_job['ReqMemCPU'].describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "style.default_axes_and_ticks()\n", - "style.figsize()\n", - "\n", - "cpu_per_job = sns.scatterplot(x=\"ReqMemCPU\", y=\"JobCount\",data=df_cpu_per_job)\n", - "\n", - "cpu_per_job.set_yscale('log')\n", - "\n", - "#cpu_per_job.yaxis.set_major_locator(ticker.MultipleLocator(100000))\n", - "#cpu_per_job.yaxis.set_major_formatter(ticker.ScalarFormatter())\n", - "\n", - "plt.title('Number of Jobs Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)\n", - "\n", - "plt.xlabel('Requested RAM per CPU (Gigs) per Job')\n", - "plt.ylabel('Job Count')\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_cpu_per_job_cluster = gig_cutoff.groupby('ReqMemCPU').count().rename(columns={'JobID': 'JobCount'}).reset_index()\n", - "df_cpu_per_job_cluster.head(30)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "kmeans = KMeans(n_clusters=3, random_state=111)\n", - "kmeans.fit(df_cpu_per_job_cluster)\n", - "print(kmeans.cluster_centers_)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(kmeans.labels_)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plt.scatter(df_cpu_per_job_cluster['ReqMemCPU'],df_cpu_per_job_cluster['JobCount'], c=kmeans.labels_, cmap='rainbow')\n", - "plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='grey')\n", - "plt.yscale(\"log\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# renames JobID column to JobCount since that's what it is now\n", - "job_count = df_3.groupby('ReqMemCPU').count().rename(columns={'JobID': 'JobCount'}).reset_index()\n", - "job_count.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "UpperlimitJobCount2 = 20" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# creates database from df_3 that returns all Jobs up to the UpperlimitJobCount2 defined above\n", - "df_job_count = job_count[(job_count.JobCount <= UpperlimitJobCount2)]\n", - "df_job_count.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "style.default_axes_and_ticks()\n", - "style.figsize()\n", - "\n", - "job_count_graph = sns.scatterplot(x=\"JobCount\", y=\"ReqMemCPU\",data=df_job_count)\n", - "\n", - "#job_count_graph.set_yscale('log')\n", - "\n", - "#job_count_graph.yaxis.set_major_locator(ticker.MultipleLocator(100000))\n", - "#job_count_graph.yaxis.set_major_formatter(ticker.ScalarFormatter())\n", - "\n", - "plt.title('Number of Jobs Requesting RAM per CPU for all Jobs counts of %i or less'%UpperlimitJobCount2)\n", - "\n", - "plt.xlabel('Job Count')\n", - "plt.ylabel('Average Requested RAM per CPU (Gigs) per Job')\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# creates database from df_3 that returns all Jobs up to the UpperlimitJobCount2 defined above\n", - "df_job_count_cluster = job_count[(job_count.JobCount <= UpperlimitJobCount2)]\n", - "df_job_count_cluster.head(50)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "kmeans2 = KMeans(n_clusters=4, random_state=111)\n", - "kmeans2.fit(df_job_count)\n", - "print(kmeans2.cluster_centers_)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(kmeans2.labels_)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plt.scatter(df_job_count['JobCount'],df_job_count['ReqMemCPU'], c=kmeans2.labels_, cmap='rainbow')\n", - "plt.scatter(kmeans2.cluster_centers_[:,1] ,kmeans2.cluster_centers_[:,0], color='grey')\n", - "#plt.yscale(\"log\")" - ] - } - ], - "metadata": { - "language_info": { - "name": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}