Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
C
createAndParseSACCT
Manage
Activity
Members
Labels
Plan
Issues
82
Issue boards
Milestones
Wiki
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Ryan Randles Jones
createAndParseSACCT
Commits
1a55e8c9
Commit
1a55e8c9
authored
4 years ago
by
Ryan Randles Jones
Browse files
Options
Downloads
Patches
Plain Diff
Delete Jobs-and-Users-ReqMemCPU.ipynb
parent
26c21463
No related branches found
Branches containing commit
No related tags found
1 merge request
!1
Kmeans clustering
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
Jobs-and-Users-ReqMemCPU.ipynb
+0
-615
0 additions, 615 deletions
Jobs-and-Users-ReqMemCPU.ipynb
with
0 additions
and
615 deletions
Jobs-and-Users-ReqMemCPU.ipynb
deleted
100644 → 0
+
0
−
615
View file @
26c21463
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Notebook Setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"import sqlite3\n",
"import slurm2sql\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"import seaborn as sns\n",
"import plotly.express as px\n",
"import matplotlib.ticker as ticker\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from RC_styles import rc_styles as style"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.cluster import KMeans"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# creates database of info from March 2020 using sqlite 3\n",
"db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# df is starting database\n",
"df = pd.read_sql('SELECT * FROM slurm', db)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# voluntary\n",
"\n",
"# for displaying all available column options\n",
"pd.set_option('display.max_columns', None)\n",
"df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# converts units in ReqMemCPU column from bytes to gigs\n",
"df['ReqMemCPU'] = df['ReqMemCPU'].div(1024**3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# df_completed is dataframe of all completed jobs\n",
"df_completed = df[df.State.str.contains('COMPLETED')]\n",
"#df_completed.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# df_batch is df with only batch jobs\n",
"df_batch = df[df.JobName.str.contains('batch')]\n",
"#df_batch.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Average RAM per CPU Requested by User"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# df_2 is database of completed jobs with only User and ReqMemCpu\n",
"# it is used for the user dataframes\n",
"\n",
"df_2 = df_completed.loc[:,['User','ReqMemCPU']]\n",
"#df_2.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_2['ReqMemCPU'] = df_2['ReqMemCPU'].apply(np.ceil)\n",
"#df_2.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# fills empty strings in User column with NaN and then filters them out to give a dataset of users with no empty strings \n",
"\n",
"nan_value = float(\"NaN\")\n",
"\n",
"df_2.replace(\"\", nan_value, inplace=True)\n",
"\n",
"df_2.dropna(subset = [\"User\"], inplace=True)\n",
"#df_2.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# count = count of jobs per user\n",
"# mean,std,min,25%,50%,75%, and max refers to the gigs of memory per cpu requested by that user for all their jobs\n",
"df_user = df_2.groupby('User')['ReqMemCPU'].describe().reset_index()\n",
"#df_user.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# voluntary\n",
"\n",
"# description of number of jobs run per user - can be used to choose the Upper Limit Job Count\n",
"df_user['count'].describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# variable for to be used in names of plots to describe the max job count per user\n",
"\n",
"# max = 367257\n",
"UpperlimitJobCount = 100"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# creates database from df_user that returns all jobs per user up to the UpperlimitJobCount defined above\n",
"jobscount_cutoff = df_user[(df_user['count'] <= UpperlimitJobCount)]\n",
"#jobscount_cutoff.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# df_user_graph_full is jobscount_cutoff sorted in ascending order by count for easy readability of graph\n",
"df_user_graph_full = jobscount_cutoff.sort_values(by='count', ascending=True)\n",
"df_user_graph_full.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_user_graph = df_user_graph_full.loc[:,['User','count','mean']]\n",
"df_user_graph.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"style.default_axes_and_ticks()\n",
"style.figsize()\n",
"\n",
"user_graph1 = sns.scatterplot(x=\"count\", y=\"mean\",data=df_user_graph)\n",
"\n",
"plt.title('Average Requested RAM per CPU by User for all Users Running %i Jobs or less'%UpperlimitJobCount)\n",
"\n",
"plt.xlabel('Job Count Per User')\n",
"plt.ylabel('Average Requested RAM per CPU (Gigs)')\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"kmeans = KMeans(n_clusters=3)\n",
"model = kmeans.fit(df_user_graph[['count', 'mean']])\n",
"# Now we can get the cluster centers (centroids) in the form of an array:\n",
"model.cluster_centers_"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# attach predicted cluster to original points\n",
"df_user_graph['predicted'] = model.labels_\n",
"df_user_graph.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create a dataframe for cluster_centers (centroids)\n",
"centroids = pd.DataFrame(model.cluster_centers_, columns=[\"count\", \"mean\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"style.default_axes_and_ticks()\n",
"style.figsize()\n",
"\n",
"## Plot scatter by cluster / color, and centroids\n",
"colors = [\"red\", \"green\", \"blue\"]\n",
"df_user_graph['color'] = df_user_graph['predicted'].map(lambda p: colors[p])\n",
"ax = df_user_graph.plot( \n",
" kind=\"scatter\", \n",
" x=\"count\", y=\"mean\",\n",
" c = df_user_graph['color']\n",
")\n",
"centroids.plot(\n",
" kind=\"scatter\", \n",
" x=\"count\", y=\"mean\", \n",
" marker=\"*\", c=[\"r\", \"g\", \"b\"], s=550,\n",
" ax=ax\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Trying the same graph as above using different syntax"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_user_graph_cluster = df_user_graph_full.loc[:,['count','mean']]\n",
"#df_user_graph_cluster.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"kmeans = KMeans(n_clusters=3, random_state=111)\n",
"kmeans.fit(df_user_graph_cluster)\n",
"print(kmeans.cluster_centers_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.scatter(df_user_graph_cluster['count'],df_user_graph_cluster['mean'], c=kmeans.labels_, cmap='rainbow')\n",
"plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='grey')\n",
"#plt.yscale(\"log\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Average RAM per CPU by Job"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# df_3 is database of batch jobs with only ReqMemCPU and JobID\n",
"# it is used to pull out needed information and create separate datasets to compare\n",
"df_3 = df_batch.loc[:,['ReqMemCPU','JobID']]\n",
"#df_3.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_3['ReqMemCPU'] = df_3['ReqMemCPU'].apply(np.ceil)\n",
"#df_3.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# variable for to be used in names of plots to describe the max gigs measured\n",
"UpperlimitGB = 50"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# creates database from df_3 that returns all RAM per CPU requested up to the UpperRAMlimit defined above\n",
"gig_cutoff = df_3[(df_3.ReqMemCPU <= UpperlimitGB)]\n",
"#gig_cutoff.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# renames JobID column to JobCount since that's what it is now\n",
"df_cpu_per_job = gig_cutoff.groupby('ReqMemCPU').count().rename(columns={'JobID': 'JobCount'}).reset_index()\n",
"#df_cpu_per_job.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_cpu_per_job['ReqMemCPU'].describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"style.default_axes_and_ticks()\n",
"style.figsize()\n",
"\n",
"cpu_per_job = sns.scatterplot(x=\"ReqMemCPU\", y=\"JobCount\",data=df_cpu_per_job)\n",
"\n",
"cpu_per_job.set_yscale('log')\n",
"\n",
"#cpu_per_job.yaxis.set_major_locator(ticker.MultipleLocator(100000))\n",
"#cpu_per_job.yaxis.set_major_formatter(ticker.ScalarFormatter())\n",
"\n",
"plt.title('Number of Jobs Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)\n",
"\n",
"plt.xlabel('Requested RAM per CPU (Gigs) per Job')\n",
"plt.ylabel('Job Count')\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_cpu_per_job_cluster = gig_cutoff.groupby('ReqMemCPU').count().rename(columns={'JobID': 'JobCount'}).reset_index()\n",
"df_cpu_per_job_cluster.head(30)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"kmeans = KMeans(n_clusters=3, random_state=111)\n",
"kmeans.fit(df_cpu_per_job_cluster)\n",
"print(kmeans.cluster_centers_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(kmeans.labels_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.scatter(df_cpu_per_job_cluster['ReqMemCPU'],df_cpu_per_job_cluster['JobCount'], c=kmeans.labels_, cmap='rainbow')\n",
"plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='grey')\n",
"plt.yscale(\"log\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# renames JobID column to JobCount since that's what it is now\n",
"job_count = df_3.groupby('ReqMemCPU').count().rename(columns={'JobID': 'JobCount'}).reset_index()\n",
"job_count.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"UpperlimitJobCount2 = 20"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# creates database from df_3 that returns all Jobs up to the UpperlimitJobCount2 defined above\n",
"df_job_count = job_count[(job_count.JobCount <= UpperlimitJobCount2)]\n",
"df_job_count.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"style.default_axes_and_ticks()\n",
"style.figsize()\n",
"\n",
"job_count_graph = sns.scatterplot(x=\"JobCount\", y=\"ReqMemCPU\",data=df_job_count)\n",
"\n",
"#job_count_graph.set_yscale('log')\n",
"\n",
"#job_count_graph.yaxis.set_major_locator(ticker.MultipleLocator(100000))\n",
"#job_count_graph.yaxis.set_major_formatter(ticker.ScalarFormatter())\n",
"\n",
"plt.title('Number of Jobs Requesting RAM per CPU for all Jobs counts of %i or less'%UpperlimitJobCount2)\n",
"\n",
"plt.xlabel('Job Count')\n",
"plt.ylabel('Average Requested RAM per CPU (Gigs) per Job')\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# creates database from df_3 that returns all Jobs up to the UpperlimitJobCount2 defined above\n",
"df_job_count_cluster = job_count[(job_count.JobCount <= UpperlimitJobCount2)]\n",
"df_job_count_cluster.head(50)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"kmeans2 = KMeans(n_clusters=4, random_state=111)\n",
"kmeans2.fit(df_job_count)\n",
"print(kmeans2.cluster_centers_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(kmeans2.labels_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.scatter(df_job_count['JobCount'],df_job_count['ReqMemCPU'], c=kmeans2.labels_, cmap='rainbow')\n",
"plt.scatter(kmeans2.cluster_centers_[:,1] ,kmeans2.cluster_centers_[:,0], color='grey')\n",
"#plt.yscale(\"log\")"
]
}
],
"metadata": {
"language_info": {
"name": "python",
"pygments_lexer": "ipython3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
%% Cell type:markdown id: tags:
# Notebook Setup
%% Cell type:code id: tags:
```
# must run
import sqlite3
import slurm2sql
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import matplotlib.ticker as ticker
import numpy as np
```
%% Cell type:code id: tags:
```
from RC_styles import rc_styles as style
```
%% Cell type:code id: tags:
```
from sklearn.cluster import KMeans
```
%% Cell type:code id: tags:
```
# must run
# creates database of info from March 2020 using sqlite 3
db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')
```
%% Cell type:code id: tags:
```
# must run
# df is starting database
df = pd.read_sql('SELECT * FROM slurm', db)
```
%% Cell type:code id: tags:
```
# voluntary
# for displaying all available column options
pd.set_option('display.max_columns', None)
df.head(5)
```
%% Cell type:code id: tags:
```
# must run
# converts units in ReqMemCPU column from bytes to gigs
df['ReqMemCPU'] = df['ReqMemCPU'].div(1024**3)
```
%% Cell type:code id: tags:
```
# must run
# df_completed is dataframe of all jobs whose State contains 'COMPLETED'
# na=False makes rows with a missing State count as non-matches; without it
# the mask would hold NaN for those rows and the boolean indexing would raise
df_completed = df[df.State.str.contains('COMPLETED', na=False)]
#df_completed.head(5)
```
%% Cell type:code id: tags:
```
# must run
# df_batch is df with only the jobs whose JobName contains 'batch'
# na=False makes rows with a missing JobName count as non-matches; without it
# the mask would hold NaN for those rows and the boolean indexing would raise
df_batch = df[df.JobName.str.contains('batch', na=False)]
#df_batch.head(5)
```
%% Cell type:markdown id: tags:
# Average RAM per CPU Requested by User
%% Cell type:code id: tags:
```
# must run
# df_2 is database of completed jobs with only User and ReqMemCpu
# it is used for the user dataframes
df_2 = df_completed.loc[:,['User','ReqMemCPU']]
#df_2.head(5)
```
%% Cell type:code id: tags:
```
df_2['ReqMemCPU'] = df_2['ReqMemCPU'].apply(np.ceil)
#df_2.head(5)
```
%% Cell type:code id: tags:
```
# must run
# replaces empty strings anywhere in df_2 with NaN, then drops the rows whose User is NaN,
# giving a dataset of users with no empty strings
nan_value = float("NaN")
# df_2 is rebound rather than modified with inplace=True: it was sliced out of
# df_completed, and in-place edits on such a frame can trigger pandas'
# SettingWithCopyWarning
df_2 = df_2.replace("", nan_value).dropna(subset=["User"])
#df_2.head(5)
```
%% Cell type:code id: tags:
```
# must run
# count = count of jobs per user
# mean,std,min,25%,50%,75%, and max refers to the gigs of memory per cpu requested by that user for all their jobs
df_user = df_2.groupby('User')['ReqMemCPU'].describe().reset_index()
#df_user.head(5)
```
%% Cell type:code id: tags:
```
# voluntary
# description of number of jobs run per user - can be used to choose the Upper Limit Job Count
df_user['count'].describe()
```
%% Cell type:code id: tags:
```
# must run
# variable used as the cutoff and in the names of plots to describe the max job count per user
# max = 367257
UpperlimitJobCount = 100
```
%% Cell type:code id: tags:
```
# must run
# creates database from df_user that returns all jobs per user up to the UpperlimitJobCount defined above
jobscount_cutoff = df_user[(df_user['count'] <= UpperlimitJobCount)]
#jobscount_cutoff.head(5)
```
%% Cell type:code id: tags:
```
# must run
# df_user_graph_full is jobscount_cutoff sorted in ascending order by count for easy readability of graph
df_user_graph_full = jobscount_cutoff.sort_values(by='count', ascending=True)
df_user_graph_full.head(5)
```
%% Cell type:code id: tags:
```
df_user_graph = df_user_graph_full.loc[:,['User','count','mean']]
df_user_graph.head(5)
```
%% Cell type:code id: tags:
```
style.default_axes_and_ticks()
style.figsize()
user_graph1 = sns.scatterplot(x="count", y="mean",data=df_user_graph)
plt.title('Average Requested RAM per CPU by User for all Users Running %i Jobs or less'%UpperlimitJobCount)
plt.xlabel('Job Count Per User')
plt.ylabel('Average Requested RAM per CPU (Gigs)')
plt.show()
```
%% Cell type:code id: tags:
```
# random_state is fixed to the same seed the other KMeans cells in this notebook use,
# so the cluster numbering (and the colors mapped from it) is reproducible between runs
kmeans = KMeans(n_clusters=3, random_state=111)
model = kmeans.fit(df_user_graph[['count', 'mean']])
# cluster_centers_ holds the centroids as an array: one row per cluster,
# columns in the same order as the fitted features (count, mean)
model.cluster_centers_
```
%% Cell type:code id: tags:
```
# attach predicted cluster to original points
df_user_graph['predicted'] = model.labels_
df_user_graph.head(5)
```
%% Cell type:code id: tags:
```
# Create a dataframe for cluster_centers (centroids)
centroids = pd.DataFrame(model.cluster_centers_, columns=["count", "mean"])
```
%% Cell type:code id: tags:
```
style.default_axes_and_ticks()
style.figsize()
## Plot scatter by cluster / color, and centroids
colors = ["red", "green", "blue"]
df_user_graph['color'] = df_user_graph['predicted'].map(lambda p: colors[p])
ax = df_user_graph.plot(
kind="scatter",
x="count", y="mean",
c = df_user_graph['color']
)
centroids.plot(
kind="scatter",
x="count", y="mean",
marker="*", c=["r", "g", "b"], s=550,
ax=ax
)
```
%% Cell type:markdown id: tags:
# Trying the same graph as above using different syntax
%% Cell type:code id: tags:
```
df_user_graph_cluster = df_user_graph_full.loc[:,['count','mean']]
#df_user_graph_cluster.head(5)
```
%% Cell type:code id: tags:
```
kmeans = KMeans(n_clusters=3, random_state=111)
kmeans.fit(df_user_graph_cluster)
print(kmeans.cluster_centers_)
```
%% Cell type:code id: tags:
```
plt.scatter(df_user_graph_cluster['count'],df_user_graph_cluster['mean'], c=kmeans.labels_, cmap='rainbow')
plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='grey')
#plt.yscale("log")
```
%% Cell type:markdown id: tags:
# Average RAM per CPU by Job
%% Cell type:code id: tags:
```
# must run
# df_3 is database of batch jobs with only ReqMemCPU and JobID
# it is used to pull out needed information and create separate datasets to compare
df_3 = df_batch.loc[:,['ReqMemCPU','JobID']]
#df_3.head(5)
```
%% Cell type:code id: tags:
```
df_3['ReqMemCPU'] = df_3['ReqMemCPU'].apply(np.ceil)
#df_3.head(5)
```
%% Cell type:code id: tags:
```
# must run
# variable used as the cutoff and in the names of plots to describe the max gigs measured
UpperlimitGB = 50
```
%% Cell type:code id: tags:
```
# must run
# creates database from df_3 that returns all RAM per CPU requested up to the UpperlimitGB defined above
gig_cutoff = df_3[(df_3.ReqMemCPU <= UpperlimitGB)]
#gig_cutoff.head(5)
```
%% Cell type:code id: tags:
```
# renames JobID column to JobCount since that's what it is now
df_cpu_per_job = gig_cutoff.groupby('ReqMemCPU').count().rename(columns={'JobID': 'JobCount'}).reset_index()
#df_cpu_per_job.head(5)
```
%% Cell type:code id: tags:
```
df_cpu_per_job['ReqMemCPU'].describe()
```
%% Cell type:code id: tags:
```
style.default_axes_and_ticks()
style.figsize()
cpu_per_job = sns.scatterplot(x="ReqMemCPU", y="JobCount",data=df_cpu_per_job)
cpu_per_job.set_yscale('log')
#cpu_per_job.yaxis.set_major_locator(ticker.MultipleLocator(100000))
#cpu_per_job.yaxis.set_major_formatter(ticker.ScalarFormatter())
plt.title('Number of Jobs Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)
plt.xlabel('Requested RAM per CPU (Gigs) per Job')
plt.ylabel('Job Count')
plt.show()
```
%% Cell type:code id: tags:
```
df_cpu_per_job_cluster = gig_cutoff.groupby('ReqMemCPU').count().rename(columns={'JobID': 'JobCount'}).reset_index()
df_cpu_per_job_cluster.head(30)
```
%% Cell type:code id: tags:
```
kmeans = KMeans(n_clusters=3, random_state=111)
kmeans.fit(df_cpu_per_job_cluster)
print(kmeans.cluster_centers_)
```
%% Cell type:code id: tags:
```
print(kmeans.labels_)
```
%% Cell type:code id: tags:
```
plt.scatter(df_cpu_per_job_cluster['ReqMemCPU'],df_cpu_per_job_cluster['JobCount'], c=kmeans.labels_, cmap='rainbow')
plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='grey')
plt.yscale("log")
```
%% Cell type:code id: tags:
```
# renames JobID column to JobCount since that's what it is now
job_count = df_3.groupby('ReqMemCPU').count().rename(columns={'JobID': 'JobCount'}).reset_index()
job_count.head(5)
```
%% Cell type:code id: tags:
```
UpperlimitJobCount2 = 20
```
%% Cell type:code id: tags:
```
# must run
# creates database from job_count that keeps only the ReqMemCPU values requested by UpperlimitJobCount2 jobs or fewer
df_job_count = job_count[(job_count.JobCount <= UpperlimitJobCount2)]
df_job_count.head(5)
```
%% Cell type:code id: tags:
```
style.default_axes_and_ticks()
style.figsize()
job_count_graph = sns.scatterplot(x="JobCount", y="ReqMemCPU",data=df_job_count)
#job_count_graph.set_yscale('log')
#job_count_graph.yaxis.set_major_locator(ticker.MultipleLocator(100000))
#job_count_graph.yaxis.set_major_formatter(ticker.ScalarFormatter())
plt.title('Number of Jobs Requesting RAM per CPU for all Jobs counts of %i or less'%UpperlimitJobCount2)
plt.xlabel('Job Count')
plt.ylabel('Average Requested RAM per CPU (Gigs) per Job')
plt.show()
```
%% Cell type:code id: tags:
```
# must run
# creates database from job_count that keeps only the ReqMemCPU values requested by UpperlimitJobCount2 jobs or fewer
df_job_count_cluster = job_count[(job_count.JobCount <= UpperlimitJobCount2)]
df_job_count_cluster.head(50)
```
%% Cell type:code id: tags:
```
kmeans2 = KMeans(n_clusters=4, random_state=111)
kmeans2.fit(df_job_count)
print(kmeans2.cluster_centers_)
```
%% Cell type:code id: tags:
```
print(kmeans2.labels_)
```
%% Cell type:code id: tags:
```
plt.scatter(df_job_count['JobCount'],df_job_count['ReqMemCPU'], c=kmeans2.labels_, cmap='rainbow')
plt.scatter(kmeans2.cluster_centers_[:,1] ,kmeans2.cluster_centers_[:,0], color='grey')
#plt.yscale("log")
```
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment