Commit 1a55e8c9 authored by Ryan Randles Jones

Delete Jobs-and-Users-ReqMemCPU.ipynb

parent 26c21463
%% Cell type:markdown id: tags:
# Notebook Setup
%% Cell type:code id: tags:
```
# must run
import sqlite3
import slurm2sql
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import matplotlib.ticker as ticker
import numpy as np
```
%% Cell type:code id: tags:
```
from RC_styles import rc_styles as style
```
%% Cell type:code id: tags:
```
from sklearn.cluster import KMeans
```
%% Cell type:code id: tags:
```
# must run
# connects to the SQLite database of Slurm job records collected since March 2020
db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')
```
%% Cell type:code id: tags:
```
# must run
# df is the starting dataframe, loaded from the slurm table
df = pd.read_sql('SELECT * FROM slurm', db)
```
%% Cell type:code id: tags:
```
# optional
# displays all available columns when showing the dataframe
pd.set_option('display.max_columns', None)
df.head(5)
```
%% Cell type:code id: tags:
```
# must run
# converts units in ReqMemCPU column from bytes to gigs
df['ReqMemCPU'] = df['ReqMemCPU'].div(1024**3)
```
%% Cell type:code id: tags:
```
# must run
# df_completed is dataframe of all completed jobs
df_completed = df[df.State.str.contains('COMPLETED')]
#df_completed.head(5)
```
%% Cell type:code id: tags:
```
# must run
# df_batch is df with only batch jobs
df_batch = df[df.JobName.str.contains('batch')]
#df_batch.head(5)
```
%% Cell type:markdown id: tags:
# Average RAM per CPU Requested by User
%% Cell type:code id: tags:
```
# must run
# df_2 is a dataframe of completed jobs with only the User and ReqMemCPU columns
# it is used for the user dataframes
df_2 = df_completed.loc[:,['User','ReqMemCPU']]
#df_2.head(5)
```
%% Cell type:code id: tags:
```
# rounds requested RAM per CPU up to whole gigs
df_2['ReqMemCPU'] = df_2['ReqMemCPU'].apply(np.ceil)
#df_2.head(5)
```
%% Cell type:code id: tags:
```
# must run
# replaces empty strings in the User column with NaN, then drops those rows so only jobs with a named user remain
nan_value = float("NaN")
df_2.replace("", nan_value, inplace=True)
df_2.dropna(subset = ["User"], inplace=True)
#df_2.head(5)
```
%% Cell type:code id: tags:
```
# must run
# count = count of jobs per user
# mean, std, min, 25%, 50%, 75%, and max refer to the gigs of memory per CPU requested by that user across all their jobs
df_user = df_2.groupby('User')['ReqMemCPU'].describe().reset_index()
#df_user.head(5)
```
%% Cell type:code id: tags:
```
# optional
# summary of the number of jobs run per user - can be used to choose UpperlimitJobCount below
df_user['count'].describe()
```
%% Cell type:code id: tags:
```
# must run
# cutoff on the number of jobs per user, used to filter the data and in plot titles
# max = 367257
UpperlimitJobCount = 100
```
%% Cell type:code id: tags:
```
# must run
# creates a dataframe from df_user keeping only users whose job count is at or below UpperlimitJobCount
jobscount_cutoff = df_user[(df_user['count'] <= UpperlimitJobCount)]
#jobscount_cutoff.head(5)
```
%% Cell type:code id: tags:
```
# must run
# df_user_graph_full is jobscount_cutoff sorted in ascending order by count for easy readability of the graph
df_user_graph_full = jobscount_cutoff.sort_values(by='count', ascending=True)
df_user_graph_full.head(5)
```
%% Cell type:code id: tags:
```
df_user_graph = df_user_graph_full.loc[:,['User','count','mean']]
df_user_graph.head(5)
```
%% Cell type:code id: tags:
```
style.default_axes_and_ticks()
style.figsize()
user_graph1 = sns.scatterplot(x="count", y="mean",data=df_user_graph)
plt.title('Average Requested RAM per CPU by User for all Users Running %i Jobs or less'%UpperlimitJobCount)
plt.xlabel('Job Count Per User')
plt.ylabel('Average Requested RAM per CPU (Gigs)')
plt.show()
```
%% Cell type:code id: tags:
```
kmeans = KMeans(n_clusters=3)
model = kmeans.fit(df_user_graph[['count', 'mean']])
# the fitted model's cluster centers (centroids), as an array:
model.cluster_centers_
```
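%% Cell type:markdown id: tags:
The choice of `n_clusters=3` above is an assumption rather than a derived value. A minimal sketch of the elbow method, assuming the `df_user_graph` dataframe built above, plots the KMeans inertia for a range of cluster counts so a reasonable k can be read off the bend in the curve. This is an optional check, not part of the original workflow.
%% Cell type:code id: tags:
```
# optional sanity check on the number of clusters
# fits KMeans for k = 1..9 on the same (count, mean) features and records the inertia
inertias = []
k_values = range(1, 10)
for k in k_values:
    km = KMeans(n_clusters=k, random_state=111)
    km.fit(df_user_graph[['count', 'mean']])
    inertias.append(km.inertia_)

plt.plot(list(k_values), inertias, marker='o')
plt.title('KMeans Inertia vs. Number of Clusters')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia (within-cluster sum of squares)')
plt.show()
```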
%% Cell type:code id: tags:
```
# attach predicted cluster to original points
df_user_graph['predicted'] = model.labels_
df_user_graph.head(5)
```
%% Cell type:code id: tags:
```
# Create a dataframe for cluster_centers (centroids)
centroids = pd.DataFrame(model.cluster_centers_, columns=["count", "mean"])
```
%% Cell type:code id: tags:
```
style.default_axes_and_ticks()
style.figsize()
## Plot scatter by cluster / color, and centroids
colors = ["red", "green", "blue"]
df_user_graph['color'] = df_user_graph['predicted'].map(lambda p: colors[p])
ax = df_user_graph.plot(
kind="scatter",
x="count", y="mean",
c = df_user_graph['color']
)
centroids.plot(
kind="scatter",
x="count", y="mean",
marker="*", c=["r", "g", "b"], s=550,
ax=ax
)
```
%% Cell type:markdown id: tags:
# Trying the same graph above using different syntax
%% Cell type:code id: tags:
```
df_user_graph_cluster = df_user_graph_full.loc[:,['count','mean']]
#df_user_graph_cluster.head(5)
```
%% Cell type:code id: tags:
```
kmeans = KMeans(n_clusters=3, random_state=111)
kmeans.fit(df_user_graph_cluster)
print(kmeans.cluster_centers_)
```
%% Cell type:code id: tags:
```
plt.scatter(df_user_graph_cluster['count'],df_user_graph_cluster['mean'], c=kmeans.labels_, cmap='rainbow')
plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='grey')
#plt.yscale("log")
```
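%% Cell type:markdown id: tags:
KMeans uses Euclidean distance, so features on very different scales (job counts vs. gigs of RAM per CPU) can let one column dominate the clustering. Below is a minimal sketch, assuming the `df_user_graph_cluster` dataframe above, that standardizes both columns before fitting; the use of `StandardScaler` is an illustrative variant, not part of the original notebook.
%% Cell type:code id: tags:
```
# optional variant: standardize features before clustering
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled = scaler.fit_transform(df_user_graph_cluster[['count', 'mean']])

kmeans_scaled = KMeans(n_clusters=3, random_state=111)
kmeans_scaled.fit(scaled)

# transform the centroids back to the original units for plotting
centers_orig = scaler.inverse_transform(kmeans_scaled.cluster_centers_)

plt.scatter(df_user_graph_cluster['count'], df_user_graph_cluster['mean'],
            c=kmeans_scaled.labels_, cmap='rainbow')
plt.scatter(centers_orig[:, 0], centers_orig[:, 1], color='grey')
plt.xlabel('Job Count Per User')
plt.ylabel('Average Requested RAM per CPU (Gigs)')
plt.show()
```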
%% Cell type:markdown id: tags:
# Average RAM per CPU by Job
%% Cell type:code id: tags:
```
# must run
# df_3 is a dataframe of batch jobs with only the ReqMemCPU and JobID columns
# it is used to count jobs per requested RAM-per-CPU value below
df_3 = df_batch.loc[:,['ReqMemCPU','JobID']]
#df_3.head(5)
```
%% Cell type:code id: tags:
```
# rounds requested RAM per CPU up to whole gigs
df_3['ReqMemCPU'] = df_3['ReqMemCPU'].apply(np.ceil)
#df_3.head(5)
```
%% Cell type:code id: tags:
```
# must run
# cutoff on requested RAM per CPU (in gigs), used to filter the data and in plot titles
UpperlimitGB = 50
```
%% Cell type:code id: tags:
```
# must run
# creates a dataframe from df_3 keeping only jobs that requested at most UpperlimitGB gigs of RAM per CPU
gig_cutoff = df_3[(df_3.ReqMemCPU <= UpperlimitGB)]
#gig_cutoff.head(5)
```
%% Cell type:code id: tags:
```
# renames JobID column to JobCount since that's what it is now
df_cpu_per_job = gig_cutoff.groupby('ReqMemCPU').count().rename(columns={'JobID': 'JobCount'}).reset_index()
#df_cpu_per_job.head(5)
```
%% Cell type:code id: tags:
```
df_cpu_per_job['ReqMemCPU'].describe()
```
%% Cell type:code id: tags:
```
style.default_axes_and_ticks()
style.figsize()
cpu_per_job = sns.scatterplot(x="ReqMemCPU", y="JobCount",data=df_cpu_per_job)
cpu_per_job.set_yscale('log')
#cpu_per_job.yaxis.set_major_locator(ticker.MultipleLocator(100000))
#cpu_per_job.yaxis.set_major_formatter(ticker.ScalarFormatter())
plt.title('Number of Jobs Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)
plt.xlabel('Requested RAM per CPU (Gigs) per Job')
plt.ylabel('Job Count')
plt.show()
```
%% Cell type:code id: tags:
```
df_cpu_per_job_cluster = gig_cutoff.groupby('ReqMemCPU').count().rename(columns={'JobID': 'JobCount'}).reset_index()
df_cpu_per_job_cluster.head(30)
```
%% Cell type:code id: tags:
```
kmeans = KMeans(n_clusters=3, random_state=111)
kmeans.fit(df_cpu_per_job_cluster)
print(kmeans.cluster_centers_)
```
%% Cell type:code id: tags:
```
print(kmeans.labels_)
```
%% Cell type:code id: tags:
```
plt.scatter(df_cpu_per_job_cluster['ReqMemCPU'],df_cpu_per_job_cluster['JobCount'], c=kmeans.labels_, cmap='rainbow')
plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='grey')
plt.yscale("log")
```
%% Cell type:code id: tags:
```
# renames JobID column to JobCount since that's what it is now
job_count = df_3.groupby('ReqMemCPU').count().rename(columns={'JobID': 'JobCount'}).reset_index()
job_count.head(5)
```
%% Cell type:code id: tags:
```
# cutoff on the job count per requested RAM-per-CPU value, used to filter the data below
UpperlimitJobCount2 = 20
```
%% Cell type:code id: tags:
```
# must run
# creates a dataframe from job_count keeping only rows with a job count at or below UpperlimitJobCount2
df_job_count = job_count[(job_count.JobCount <= UpperlimitJobCount2)]
df_job_count.head(5)
```
%% Cell type:code id: tags:
```
style.default_axes_and_ticks()
style.figsize()
job_count_graph = sns.scatterplot(x="JobCount", y="ReqMemCPU",data=df_job_count)
#job_count_graph.set_yscale('log')
#job_count_graph.yaxis.set_major_locator(ticker.MultipleLocator(100000))
#job_count_graph.yaxis.set_major_formatter(ticker.ScalarFormatter())
plt.title('Requested RAM per CPU for all Job Counts of %i or less'%UpperlimitJobCount2)
plt.xlabel('Job Count')
plt.ylabel('Average Requested RAM per CPU (Gigs) per Job')
plt.show()
```
%% Cell type:code id: tags:
```
# must run
# creates a dataframe from job_count keeping only rows with a job count at or below UpperlimitJobCount2
df_job_count_cluster = job_count[(job_count.JobCount <= UpperlimitJobCount2)]
df_job_count_cluster.head(50)
```
%% Cell type:code id: tags:
```
kmeans2 = KMeans(n_clusters=4, random_state=111)
kmeans2.fit(df_job_count_cluster)
print(kmeans2.cluster_centers_)
```
%% Cell type:code id: tags:
```
print(kmeans2.labels_)
```
%% Cell type:code id: tags:
```
plt.scatter(df_job_count_cluster['JobCount'],df_job_count_cluster['ReqMemCPU'], c=kmeans2.labels_, cmap='rainbow')
plt.scatter(kmeans2.cluster_centers_[:,1] ,kmeans2.cluster_centers_[:,0], color='grey')
#plt.yscale("log")
```
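%% Cell type:markdown id: tags:
A hedged way to compare the cluster counts used above (3 for the per-gig data, 4 for the job-count data) is the silhouette score, which rewards tight, well-separated clusters. This sketch assumes the `df_job_count` dataframe built earlier and is an optional add-on check, not part of the original workflow.
%% Cell type:code id: tags:
```
# optional: compare silhouette scores for a few candidate cluster counts
from sklearn.metrics import silhouette_score

features = df_job_count[['JobCount', 'ReqMemCPU']]
for k in range(2, 7):
    km = KMeans(n_clusters=k, random_state=111)
    labels = km.fit_predict(features)
    print(k, silhouette_score(features, labels))
```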