Commit 1a55e8c9 authored by Ryan Randles Jones

Delete Jobs-and-Users-ReqMemCPU.ipynb

parent 26c21463
%% Cell type:markdown id: tags:
# Notebook Setup
%% Cell type:code id: tags:
```
# must run
import sqlite3
import slurm2sql
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import matplotlib.ticker as ticker
import numpy as np
```
%% Cell type:code id: tags:
```
from RC_styles import rc_styles as style
```
%% Cell type:code id: tags:
```
from sklearn.cluster import KMeans
```
%% Cell type:code id: tags:
```
# must run
# connects to the SQLite database of Slurm job records collected since March 2020
db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')
```
%% Cell type:code id: tags:
```
# must run
# df is the starting dataframe, loaded from the slurm table
df = pd.read_sql('SELECT * FROM slurm', db)
```
%% Cell type:code id: tags:
```
# optional
# displays all available columns when showing the dataframe
pd.set_option('display.max_columns', None)
df.head(5)
```
%% Cell type:code id: tags:
```
# must run
# converts units in ReqMemCPU column from bytes to gigs
df['ReqMemCPU'] = df['ReqMemCPU'].div(1024**3)
```
%% Cell type:code id: tags:
```
# must run
# df_completed is dataframe of all completed jobs
df_completed = df[df.State.str.contains('COMPLETED')]
#df_completed.head(5)
```
%% Cell type:code id: tags:
```
# must run
# df_batch is df with only batch jobs
df_batch = df[df.JobName.str.contains('batch')]
#df_batch.head(5)
```
%% Cell type:markdown id: tags:
# Average RAM per CPU Requested by User
%% Cell type:code id: tags:
```
# must run
# df_2 is a dataframe of completed jobs with only the User and ReqMemCPU columns
# it is used for the user dataframes
df_2 = df_completed.loc[:,['User','ReqMemCPU']]
#df_2.head(5)
```
%% Cell type:code id: tags:
```
# rounds requested RAM per CPU up to whole gigs
df_2['ReqMemCPU'] = df_2['ReqMemCPU'].apply(np.ceil)
#df_2.head(5)
```
%% Cell type:code id: tags:
```
# must run
# replaces empty strings in the User column with NaN, then drops those rows so only jobs with a named user remain
nan_value = float("NaN")
df_2.replace("", nan_value, inplace=True)
df_2.dropna(subset = ["User"], inplace=True)
#df_2.head(5)
```
%% Cell type:code id: tags:
```
# must run
# count = count of jobs per user
# mean, std, min, 25%, 50%, 75%, and max refer to the gigs of memory per CPU requested by that user across all their jobs
df_user = df_2.groupby('User')['ReqMemCPU'].describe().reset_index()
#df_user.head(5)
```
%% Cell type:code id: tags:
```
# optional
# summary of the number of jobs run per user - can be used to choose UpperlimitJobCount below
df_user['count'].describe()
```
%% Cell type:code id: tags:
```
# must run
# cutoff on the number of jobs per user, used to filter the data and in plot titles
# max = 367257
UpperlimitJobCount = 100
```
%% Cell type:code id: tags:
```
# must run
# creates a dataframe from df_user keeping only users whose job count is at or below UpperlimitJobCount
jobscount_cutoff = df_user[(df_user['count'] <= UpperlimitJobCount)]
#jobscount_cutoff.head(5)
```
%% Cell type:code id: tags:
```
# must run
# df_user_graph_full is jobscount_cutoff sorted in ascending order by count for easy readability of the graph
df_user_graph_full = jobscount_cutoff.sort_values(by='count', ascending=True)
df_user_graph_full.head(5)
```
%% Cell type:code id: tags:
```
df_user_graph = df_user_graph_full.loc[:,['User','count','mean']]
df_user_graph.head(5)
```
%% Cell type:code id: tags:
```
style.default_axes_and_ticks()
style.figsize()
user_graph1 = sns.scatterplot(x="count", y="mean",data=df_user_graph)
plt.title('Average Requested RAM per CPU by User for all Users Running %i Jobs or less'%UpperlimitJobCount)
plt.xlabel('Job Count Per User')
plt.ylabel('Average Requested RAM per CPU (Gigs)')
plt.show()
```
%% Cell type:code id: tags:
```
kmeans = KMeans(n_clusters=3)
model = kmeans.fit(df_user_graph[['count', 'mean']])
# the fitted model's cluster centers (centroids), as an array:
model.cluster_centers_
```
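%% Cell type:markdown id: tags:
The choice of `n_clusters=3` above is an assumption rather than a derived value. A minimal sketch of the elbow method, assuming the `df_user_graph` dataframe built above, plots the KMeans inertia for a range of cluster counts so a reasonable k can be read off the bend in the curve. This is an optional check, not part of the original workflow.
%% Cell type:code id: tags:
```
# optional sanity check on the number of clusters
# fits KMeans for k = 1..9 on the same (count, mean) features and records the inertia
inertias = []
k_values = range(1, 10)
for k in k_values:
    km = KMeans(n_clusters=k, random_state=111)
    km.fit(df_user_graph[['count', 'mean']])
    inertias.append(km.inertia_)

plt.plot(list(k_values), inertias, marker='o')
plt.title('KMeans Inertia vs. Number of Clusters')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia (within-cluster sum of squares)')
plt.show()
```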
%% Cell type:code id: tags:
```
# attach predicted cluster to original points
df_user_graph['predicted'] = model.labels_
df_user_graph.head(5)
```
%% Cell type:code id: tags:
```
# Create a dataframe for cluster_centers (centroids)
centroids = pd.DataFrame(model.cluster_centers_, columns=["count", "mean"])
```
%% Cell type:code id: tags:
```
style.default_axes_and_ticks()
style.figsize()
## Plot scatter by cluster / color, and centroids
colors = ["red", "green", "blue"]
df_user_graph['color'] = df_user_graph['predicted'].map(lambda p: colors[p])
ax = df_user_graph.plot(
kind="scatter",
x="count", y="mean",
c = df_user_graph['color']
)
centroids.plot(
kind="scatter",
x="count", y="mean",
marker="*", c=["r", "g", "b"], s=550,
ax=ax
)
```
%% Cell type:markdown id: tags:
# Trying the same graph above using different syntax
%% Cell type:code id: tags:
```
df_user_graph_cluster = df_user_graph_full.loc[:,['count','mean']]
#df_user_graph_cluster.head(5)
```
%% Cell type:code id: tags:
```
kmeans = KMeans(n_clusters=3, random_state=111)
kmeans.fit(df_user_graph_cluster)
print(kmeans.cluster_centers_)
```
%% Cell type:code id: tags:
```
plt.scatter(df_user_graph_cluster['count'],df_user_graph_cluster['mean'], c=kmeans.labels_, cmap='rainbow')
plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='grey')
#plt.yscale("log")
```
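%% Cell type:markdown id: tags:
KMeans uses Euclidean distance, so features on very different scales (job counts vs. gigs of RAM per CPU) can let one column dominate the clustering. Below is a minimal sketch, assuming the `df_user_graph_cluster` dataframe above, that standardizes both columns before fitting; the use of `StandardScaler` is an illustrative variant, not part of the original notebook.
%% Cell type:code id: tags:
```
# optional variant: standardize features before clustering
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled = scaler.fit_transform(df_user_graph_cluster[['count', 'mean']])

kmeans_scaled = KMeans(n_clusters=3, random_state=111)
kmeans_scaled.fit(scaled)

# transform the centroids back to the original units for plotting
centers_orig = scaler.inverse_transform(kmeans_scaled.cluster_centers_)

plt.scatter(df_user_graph_cluster['count'], df_user_graph_cluster['mean'],
            c=kmeans_scaled.labels_, cmap='rainbow')
plt.scatter(centers_orig[:, 0], centers_orig[:, 1], color='grey')
plt.xlabel('Job Count Per User')
plt.ylabel('Average Requested RAM per CPU (Gigs)')
plt.show()
```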
%% Cell type:markdown id: tags:
# Average RAM per CPU by Job
%% Cell type:code id: tags:
```
# must run
# df_3 is a dataframe of batch jobs with only the ReqMemCPU and JobID columns
# it is used to count jobs per requested RAM-per-CPU value below
df_3 = df_batch.loc[:,['ReqMemCPU','JobID']]
#df_3.head(5)
```
%% Cell type:code id: tags:
```
# rounds requested RAM per CPU up to whole gigs
df_3['ReqMemCPU'] = df_3['ReqMemCPU'].apply(np.ceil)
#df_3.head(5)
```
%% Cell type:code id: tags:
```
# must run
# cutoff on requested RAM per CPU (in gigs), used to filter the data and in plot titles
UpperlimitGB = 50
```
%% Cell type:code id: tags:
```
# must run
# creates a dataframe from df_3 keeping only jobs that requested at most UpperlimitGB gigs of RAM per CPU
gig_cutoff = df_3[(df_3.ReqMemCPU <= UpperlimitGB)]
#gig_cutoff.head(5)
```
%% Cell type:code id: tags:
```
# renames JobID column to JobCount since that's what it is now
df_cpu_per_job = gig_cutoff.groupby('ReqMemCPU').count().rename(columns={'JobID': 'JobCount'}).reset_index()
#df_cpu_per_job.head(5)
```
%% Cell type:code id: tags:
```
df_cpu_per_job['ReqMemCPU'].describe()
```
%% Cell type:code id: tags:
```
style.default_axes_and_ticks()
style.figsize()
cpu_per_job = sns.scatterplot(x="ReqMemCPU", y="JobCount",data=df_cpu_per_job)
cpu_per_job.set_yscale('log')
#cpu_per_job.yaxis.set_major_locator(ticker.MultipleLocator(100000))
#cpu_per_job.yaxis.set_major_formatter(ticker.ScalarFormatter())
plt.title('Number of Jobs Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)
plt.xlabel('Requested RAM per CPU (Gigs) per Job')
plt.ylabel('Job Count')
plt.show()
```
%% Cell type:code id: tags:
```
df_cpu_per_job_cluster = gig_cutoff.groupby('ReqMemCPU').count().rename(columns={'JobID': 'JobCount'}).reset_index()
df_cpu_per_job_cluster.head(30)
```
%% Cell type:code id: tags:
```
kmeans = KMeans(n_clusters=3, random_state=111)
kmeans.fit(df_cpu_per_job_cluster)
print(kmeans.cluster_centers_)
```
%% Cell type:code id: tags:
```
print(kmeans.labels_)
```
%% Cell type:code id: tags:
```
plt.scatter(df_cpu_per_job_cluster['ReqMemCPU'],df_cpu_per_job_cluster['JobCount'], c=kmeans.labels_, cmap='rainbow')
plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='grey')
plt.yscale("log")
```
%% Cell type:code id: tags:
```
# renames JobID column to JobCount since that's what it is now
job_count = df_3.groupby('ReqMemCPU').count().rename(columns={'JobID': 'JobCount'}).reset_index()
job_count.head(5)
```
%% Cell type:code id: tags:
```
# cutoff on the job count per requested RAM-per-CPU value, used to filter the data below
UpperlimitJobCount2 = 20
```
%% Cell type:code id: tags:
```
# must run
# creates a dataframe from job_count keeping only rows with a job count at or below UpperlimitJobCount2
df_job_count = job_count[(job_count.JobCount <= UpperlimitJobCount2)]
df_job_count.head(5)
```
%% Cell type:code id: tags:
```
style.default_axes_and_ticks()
style.figsize()
job_count_graph = sns.scatterplot(x="JobCount", y="ReqMemCPU",data=df_job_count)
#job_count_graph.set_yscale('log')
#job_count_graph.yaxis.set_major_locator(ticker.MultipleLocator(100000))
#job_count_graph.yaxis.set_major_formatter(ticker.ScalarFormatter())
plt.title('Requested RAM per CPU for all Job Counts of %i or less'%UpperlimitJobCount2)
plt.xlabel('Job Count')
plt.ylabel('Average Requested RAM per CPU (Gigs) per Job')
plt.show()
```
%% Cell type:code id: tags:
```
# must run
# creates a dataframe from job_count keeping only rows with a job count at or below UpperlimitJobCount2
df_job_count_cluster = job_count[(job_count.JobCount <= UpperlimitJobCount2)]
df_job_count_cluster.head(50)
```
%% Cell type:code id: tags:
```
kmeans2 = KMeans(n_clusters=4, random_state=111)
kmeans2.fit(df_job_count_cluster)
print(kmeans2.cluster_centers_)
```
%% Cell type:code id: tags:
```
print(kmeans2.labels_)
```
%% Cell type:code id: tags:
```
plt.scatter(df_job_count_cluster['JobCount'],df_job_count_cluster['ReqMemCPU'], c=kmeans2.labels_, cmap='rainbow')
plt.scatter(kmeans2.cluster_centers_[:,1] ,kmeans2.cluster_centers_[:,0], color='grey')
#plt.yscale("log")
```
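%% Cell type:markdown id: tags:
A hedged way to compare the cluster counts used above (3 for the per-gig data, 4 for the job-count data) is the silhouette score, which rewards tight, well-separated clusters. This sketch assumes the `df_job_count` dataframe built earlier and is an optional add-on check, not part of the original workflow.
%% Cell type:code id: tags:
```
# optional: compare silhouette scores for a few candidate cluster counts
from sklearn.metrics import silhouette_score

features = df_job_count[['JobCount', 'ReqMemCPU']]
for k in range(2, 7):
    km = KMeans(n_clusters=k, random_state=111)
    labels = km.fit_predict(features)
    print(k, silhouette_score(features, labels))
```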