Skip to content
Snippets Groups Projects
Commit 26c21463 authored by Ryan Randles Jones's avatar Ryan Randles Jones
Browse files

added documentation

parent 384ff64c
No related branches found
No related tags found
1 merge request!1Kmeans clustering
%% Cell type:markdown id: tags:
# Notebook Setup
%% Cell type:code id: tags:
```
# must run
import sqlite3
import slurm2sql
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import seaborn as sb
import plotly.express as px
import matplotlib.ticker as ticker
import numpy as np
```
%% Cell type:code id: tags:
```
# must run
from RC_styles import rc_styles as style
```
%% Cell type:code id: tags:
```
# must run
from sklearn.cluster import KMeans
```
%% Cell type:code id: tags:
```
# must run
# opens a sqlite3 connection to the slurm accounting database file
# (file name suggests it holds jobs since March - TODO confirm the date range)
slurm_db_path = '/data/rc/rc-team/slurm-since-March.sqlite3'
db = sqlite3.connect(slurm_db_path)
```
%% Cell type:code id: tags:
```
# must run
# df is the starting dataframe: every row and column of the slurm table
df = pd.read_sql(sql='SELECT * FROM slurm', con=db)
```
%% Cell type:code id: tags:
```
# voluntary
# lifts the column display limit so all available columns are shown,
# then previews the first 5 rows of df
pd.options.display.max_columns = None
df.head()
```
%% Cell type:code id: tags:
```
# must run
# converts units in ReqMemCPU column from bytes to gigs (1 gig = 1024**3 bytes)
# NOTE: the column is overwritten, so re-running this cell divides it again
bytes_per_gig = 1024 ** 3
df['ReqMemCPU'] = df['ReqMemCPU'] / bytes_per_gig
```
%% Cell type:code id: tags:
```
# must run
# converts Elapsed time from seconds to hours
# NOTE: the column is overwritten, so re-running this cell divides it again
seconds_per_hour = 3600
df['Elapsed'] = df['Elapsed'] / seconds_per_hour
```
%% Cell type:code id: tags:
```
# must run
# df_completed is dataframe of all jobs whose State contains 'COMPLETED'
# na=False counts rows with a missing State as not completed; without it the
# mask would hold NaN for those rows and the boolean indexing would raise
df_completed = df[df.State.str.contains('COMPLETED', na=False)]
#df_completed.head(5)
```
%% Cell type:markdown id: tags:
# ReqMemCPU, Core Count, Runtime
%% Cell type:code id: tags:
```
# must run
# inclusive min and max bounds for ReqMemCPU, used by the filters below
LowerlimitGB, UpperlimitGB = 0, 50
```
%% Cell type:code id: tags:
```
# must run
# inclusive min and max bounds for AllocCPUS, used by the filters below
LowerlimitAllocCPU, UpperlimitAllocCPU = 0, 20
```
%% Cell type:code id: tags:
```
# must run
# dataset of needed columns for all graphs below
# .copy() makes df_1 an independent dataframe, so the column updates in the
# following cells cannot raise SettingWithCopyWarning or reach df_completed
df_1 = df_completed.loc[:, ['ReqMemCPU', 'Elapsed', 'AllocCPUS']].copy()
df_1.head(5)
```
%% Cell type:code id: tags:
```
# must run
# rounds every ReqMemCPU value up to the nearest whole number
df_1['ReqMemCPU'] = np.ceil(df_1['ReqMemCPU'])
```
%% Cell type:code id: tags:
```
# must run
# rounds Elapsed to 2 decimal places (to the nearest value, not always up)
df_1['Elapsed'] = df_1['Elapsed'].round(decimals=2)
```
%% Cell type:code id: tags:
```
# must run
# orders the dataset by ascending AllocCPUS for easy visualization,
# then previews the first 5 rows
df_1_sorted = df_1.sort_values('AllocCPUS')
df_1_sorted.head()
```
%% Cell type:code id: tags:
```
# must run
# keeps the rows of df_1_sorted whose ReqMemCPU and AllocCPUS both lie inside
# the min and max parameters created above (bounds are inclusive)
mem_in_range = (df_1_sorted['ReqMemCPU'] >= LowerlimitGB) & (df_1_sorted['ReqMemCPU'] <= UpperlimitGB)
cpu_in_range = (df_1_sorted['AllocCPUS'] >= LowerlimitAllocCPU) & (df_1_sorted['AllocCPUS'] <= UpperlimitAllocCPU)
df_runtime = df_1_sorted[mem_in_range & cpu_in_range]
df_runtime.head()
```
%% Cell type:code id: tags:
```
# must run
# pair grid of scatter plots built from the df_runtime dataset
# Elapsed time in hours and ReqMemCPU in gigs
style.default_axes_and_ticks()
style.figsize()
full_facet = sb.PairGrid(data=df_runtime)
full_facet.map(plt.scatter)
plt.show()
```
%% Cell type:code id: tags:
```
# scatter plot of AllocCPUS against ReqMemCPU for the df_runtime dataset
style.default_axes_and_ticks()
style.figsize()
runtime_graph = sns.scatterplot(data=df_runtime, x="ReqMemCPU", y="AllocCPUS")
# scatterplot returns the axes it drew on, so label them directly
runtime_graph.set_title('Number of Cores used by Requested RAM %i gigs or less' % UpperlimitGB)
runtime_graph.set_xlabel('ReqMemCPU(gigs)')
runtime_graph.set_ylabel('AllocCPUS')
# uncomment the next line for a log-scaled y axis
#plt.yscale("log")
plt.show()
```
%% Cell type:code id: tags:
```
# must run
# dataset used for clustering: rows of df_1_sorted whose ReqMemCPU and AllocCPUS
# both lie inside the min and max parameters created above (bounds are inclusive)
cluster_mem_ok = (df_1_sorted['ReqMemCPU'] >= LowerlimitGB) & (df_1_sorted['ReqMemCPU'] <= UpperlimitGB)
cluster_cpu_ok = (df_1_sorted['AllocCPUS'] >= LowerlimitAllocCPU) & (df_1_sorted['AllocCPUS'] <= UpperlimitAllocCPU)
df_runtime_cluster = df_1_sorted[cluster_mem_ok & cluster_cpu_ok]
df_runtime_cluster.tail()
```
%% Cell type:code id: tags:
```
# must run
# sets up info for plotting the optimal number of clusters - uses df_runtime_cluster dataset
# for every k from 1 to 9, fits k-means and records its inertia_
# (sum of squared distances of the samples to their closest cluster center)
Sum_of_squared_distances = []
K = range(1, 10)
for k in K:
    # fixed seed (the same one the final 3-cluster model uses) so the elbow
    # curve is identical from run to run
    km = KMeans(n_clusters=k, random_state=111)
    km = km.fit(df_runtime_cluster)
    Sum_of_squared_distances.append(km.inertia_)
```
%% Cell type:code id: tags:
```
# must run
# elbow plot of the sum of squared distances for each k: the bend in the graph is
# the optimal number of clusters for graphs using the df_runtime_cluster dataset
elbow_axes = plt.gca()
elbow_axes.plot(K, Sum_of_squared_distances, 'bx-')
elbow_axes.set_xlabel('k')
elbow_axes.set_ylabel('Sum_of_squared_distances')
elbow_axes.set_title('Elbow Method For Optimal k')
plt.show()
```
%% Cell type:code id: tags:
```
# must run
# fits k-means with 3 clusters (fixed seed) to df_runtime_cluster and prints the cluster centers
# each printed row is one center; its values follow the column order of df_runtime_cluster
kmeans = KMeans(n_clusters=3, random_state=111).fit(df_runtime_cluster)
print(kmeans.cluster_centers_)
```
%% Cell type:code id: tags:
```
# must run
# one-row grid of the two graphs being clustered (df_runtime_cluster dataset):
# Elapsed against ReqMemCPU and Elapsed against AllocCPUS, each with a regression fit
style.default_axes_and_ticks()
style.figsize()
reqmem_alloc = sns.PairGrid(
    data=df_runtime_cluster,
    x_vars=["ReqMemCPU", "AllocCPUS"],
    y_vars=["Elapsed"],
    height=4,
)
reqmem_alloc.map(sns.regplot, color="blue")
```
%% Cell type:code id: tags:
```
# must run
# clustered graph: Elapsed against ReqMemCPU, points colored by their k-means
# label, cluster centers drawn in black
style.default_axes_and_ticks()
style.figsize()
mem_values = df_runtime_cluster['ReqMemCPU']
elapsed_values = df_runtime_cluster['Elapsed']
runtime_cluster_graph = plt.scatter(mem_values, elapsed_values, c=kmeans.labels_, cmap='rainbow')
# columns 0 and 1 of the centers are read as ReqMemCPU and Elapsed; this assumes
# df_runtime_cluster keeps the column order ReqMemCPU, Elapsed, AllocCPUS
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], color='black')
plt.title('Runtime per Requested gigs of RAM %i gigs or less' % UpperlimitGB)
plt.xlabel('ReqMemCPU(gigs)')
plt.ylabel('Elapsed(hours)')
plt.show()
```
%% Cell type:code id: tags:
```
# must run
# clustered graph: Elapsed against AllocCPUS, points colored by their k-means
# label, cluster centers drawn in black
style.default_axes_and_ticks()
style.figsize()
alloc_cluster_graph = plt.scatter(df_runtime_cluster['AllocCPUS'], df_runtime_cluster['Elapsed'], c=kmeans.labels_, cmap='rainbow')
# cluster_centers_ columns follow the column order of the dataframe the model
# was fitted on, so the positions are looked up by name; hard-coding column 0
# would put the ReqMemCPU coordinate of each center on the AllocCPUS axis
alloc_col = df_runtime_cluster.columns.get_loc('AllocCPUS')
elapsed_col = df_runtime_cluster.columns.get_loc('Elapsed')
plt.scatter(kmeans.cluster_centers_[:, alloc_col], kmeans.cluster_centers_[:, elapsed_col], color='black')
plt.xlabel('AllocCPUS')
plt.ylabel('Elapsed(hours)')
plt.title('Runtime per Core %i cores or less' % UpperlimitAllocCPU)
plt.show()
```
%% Cell type:code id: tags:
```
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment