Skip to content
Snippets Groups Projects
Commit dbd28839 authored by Ryan Randles Jones's avatar Ryan Randles Jones
Browse files

added clustering graphs

parent fc6574e1
No related branches found
No related tags found
1 merge request!3Cores and runtime clustering
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Notebook Setup # Notebook Setup
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
import sqlite3 import sqlite3
import slurm2sql import slurm2sql
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
%matplotlib inline %matplotlib inline
import seaborn as sns import seaborn as sns
import seaborn as sb import seaborn as sb
import plotly.express as px import plotly.express as px
import matplotlib.ticker as ticker import matplotlib.ticker as ticker
import numpy as np import numpy as np
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
from RC_styles import rc_styles as style from RC_styles import rc_styles as style
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
from sklearn.cluster import KMeans from sklearn.cluster import KMeans
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# connects to the sqlite3 database of info from March 2020 # connects to the sqlite3 database of info from March 2020
db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3') db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# df is the starting dataframe # df is the starting dataframe
df = pd.read_sql('SELECT * FROM slurm', db) df = pd.read_sql('SELECT * FROM slurm', db)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# voluntary # voluntary
# for displaying all available column options # for displaying all available column options
pd.set_option('display.max_columns', None) pd.set_option('display.max_columns', None)
df.head(5) #df.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# converts units in ReqMemCPU column from bytes to gigs # converts units in ReqMemCPU column from bytes to gigs
df['ReqMemCPU'] = df['ReqMemCPU'].div(1024**3) df['ReqMemCPU'] = df['ReqMemCPU'].div(1024**3)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# converts Elapsed time to hours (from seconds) # converts Elapsed time to hours (from seconds)
df['Elapsed'] = df['Elapsed'].div(3600) df['Elapsed'] = df['Elapsed'].div(3600)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# df_completed is dataframe of all completed jobs # df_completed is dataframe of all completed jobs
df_completed = df[df.State.str.contains('COMPLETED')] df_completed = df[df.State.str.contains('COMPLETED')]
#df_completed.head(5) #df_completed.head(5)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# ReqMemCPU,Corecount,Runtime Clustering # ReqMemCPU,Corecount,Runtime Clustering
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# dataset of needed columns for all graphs below # dataset of needed columns for all graphs below
df_1 = df_completed.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']] df_1 = df_completed.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']]
df_1.head(5) df_1.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# rounds ReqMemCPU up to nearest whole number # rounds ReqMemCPU up to nearest whole number
df_1['ReqMemCPU'] = df_1['ReqMemCPU'].apply(np.ceil) df_1['ReqMemCPU'] = df_1['ReqMemCPU'].apply(np.ceil)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# rounds Elapsed to 2 decimal places # rounds Elapsed to 2 decimal places
df_1['Elapsed'] = df_1['Elapsed'].round(2) df_1['Elapsed'] = df_1['Elapsed'].round(2)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run df_1.ReqMemCPU = df_1.ReqMemCPU.apply(int)
df_1.head(5)
# sorts dataset by AllocCPUS for easy visualization
df_1_sorted = df_1.sort_values(by='AllocCPUS', ascending=True)
df_1_sorted.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# sets min and max parameters for ReqMemCPU # sets min and max parameters for ReqMemCPU
UpperlimitGB = 50 UpperlimitGB = 50
LowerlimitGB = 0 LowerlimitGB = 0
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# sets min and max parameters for AllocCPUS # sets min and max parameters for AllocCPUS
UpperlimitAllocCPU = 20 UpperlimitAllocCPU = 40
LowerlimitAllocCPU = 0 LowerlimitAllocCPU = 0
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# sets min and max parameters for Elapsed
UpperlimitElapsed = 150
LowerlimitElapsed = 0.5
```
%% Cell type:code id: tags:
```
# must run
# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the min and max parameters created above # creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the min and max parameters created above
df_facet = df_1_sorted[(df_1_sorted['ReqMemCPU'] <= UpperlimitGB) & (df_1_sorted['ReqMemCPU'] >= LowerlimitGB) & (df_1_sorted['AllocCPUS'] <= UpperlimitAllocCPU) & (df_1_sorted['AllocCPUS'] >= LowerlimitAllocCPU)] df_facet = df_1[(df_1['ReqMemCPU'] <= UpperlimitGB) &
(df_1['ReqMemCPU'] >= LowerlimitGB) &
(df_1['AllocCPUS'] <= UpperlimitAllocCPU) &
(df_1['AllocCPUS'] >= LowerlimitAllocCPU)
&
(df_1['Elapsed'] <= UpperlimitElapsed) &
(df_1['Elapsed'] >= LowerlimitElapsed)]
df_facet.head(5) df_facet.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# creates a facet grid from the df_facet dataset # creates a facet grid from the df_facet dataset
# Elapsed time in hours and ReqMemCPU in gigs # Elapsed time in hours and ReqMemCPU in gigs
style.default_axes_and_ticks() style.default_axes_and_ticks()
style.figsize() style.figsize()
full_facet = sb.PairGrid(df_facet) full_facet = sns.pairplot(df_facet, diag_kind = 'kde') # makes density plots - kernel density estimate
# y axis is count in the diagonal graphs
#full_facet = sb.PairGrid(df_facet)
full_facet.map(plt.scatter); full_facet.map(plt.scatter);
plt.show() plt.show()
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Detailed Look at Elapsed Time - In terms of Requested RAM and Cores # Detailed Look at Elapsed Time - In terms of Requested RAM and Cores
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# sets min and max parameters for ReqMemCPU for clustered Elapsed Time Graphs # sets min and max parameters for ReqMemCPU for clustered Elapsed Time Graphs
UpperlimitGB_elapsed = 50 UpperlimitGB_elapsed = 50
LowerlimitGB_elapsed = 0 LowerlimitGB_elapsed = 0
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# sets min and max parameters for AllocCPUS for clustered Elapsed Time Graphs # sets min and max parameters for AllocCPUS for clustered Elapsed Time Graphs
UpperlimitAllocCPU_elapsed = 20 UpperlimitAllocCPU_elapsed = 40
LowerlimitAllocCPU_elapsed = 0 LowerlimitAllocCPU_elapsed = 0
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the min and max parameters created above # creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the min and max parameters created above
df_runtime_cluster = df_1_sorted[(df_1_sorted['ReqMemCPU'] <= UpperlimitGB_elapsed) & (df_1_sorted['ReqMemCPU'] >= LowerlimitGB_elapsed) & (df_1_sorted['AllocCPUS'] <= UpperlimitAllocCPU_elapsed) & (df_1_sorted['AllocCPUS'] >= LowerlimitAllocCPU_elapsed)] df_runtime_cluster = df_1[(df_1['ReqMemCPU'] <= UpperlimitGB_elapsed) &
(df_1['ReqMemCPU'] >= LowerlimitGB_elapsed) &
(df_1['AllocCPUS'] <= UpperlimitAllocCPU_elapsed) &
(df_1['AllocCPUS'] >= LowerlimitAllocCPU_elapsed)]
df_runtime_cluster.head(5) df_runtime_cluster.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# facet grid of the two graphs being clustered using df_runtime_cluster dataset
style.default_axes_and_ticks()
style.figsize()
elapsed_reqmem_alloc = sns.PairGrid(df_runtime_cluster, y_vars=["Elapsed"], x_vars=["ReqMemCPU", "AllocCPUS"], height=4)
elapsed_reqmem_alloc.map(sns.regplot, color="blue")
```
%% Cell type:markdown id: tags:
# Elapsed/ReqMemCPU clustering
%% Cell type:code id: tags:
```
# must run
# sets up info for plotting the optimal number of clusters - uses df_runtime_cluster dataset # sets up info for plotting the optimal number of clusters - uses df_runtime_cluster dataset
Sum_of_squared_distances = [] Sum_of_squared_distances = []
K = range(1,10) K = range(1,10)
for k in K: for k in K:
km = KMeans(n_clusters=k) km = KMeans(n_clusters=k)
km = km.fit(df_runtime_cluster) km = km.fit(df_runtime_cluster)
Sum_of_squared_distances.append(km.inertia_) Sum_of_squared_distances.append(km.inertia_)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# the bend in the graph is the optimal number of clusters for graphs using the df_runtime_cluster dataset # the bend in the graph is the optimal number of clusters for graphs using the df_runtime_cluster dataset
plt.plot(K, Sum_of_squared_distances, 'bx-') plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k') plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances') plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k') plt.title('Elbow Method For Optimal k')
plt.show() plt.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# sets to clusters and returns the cluster points # sets to clusters and returns the cluster points
kmeans = KMeans(n_clusters=3, random_state=111) kmeans = KMeans(n_clusters=3, random_state=111)
kmeans.fit(df_runtime_cluster) kmeans.fit(df_runtime_cluster)
print(kmeans.cluster_centers_) print(kmeans.cluster_centers_)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# facet grid of the two graphs being clustered using df_runtime_cluster dataset # clustered graph
style.default_axes_and_ticks() style.default_axes_and_ticks()
style.figsize() style.figsize()
elapsed_reqmem_alloc = sns.PairGrid(df_runtime_cluster, y_vars=["Elapsed"], x_vars=["ReqMemCPU", "AllocCPUS"], height=4) elapsed_mem_cluster_graph = plt.scatter(df_runtime_cluster['ReqMemCPU'],df_runtime_cluster['Elapsed'], c=kmeans.labels_, cmap='rainbow')
elapsed_reqmem_alloc.map(sns.regplot, color="blue") plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='black')
plt.xlabel('ReqMemCPU(gigs)')
plt.ylabel('Elapsed(hours)')
plt.title('Runtime per Requested gigs of RAM %i gigs or less'%UpperlimitGB_elapsed)
plt.show()
```
%% Cell type:code id: tags:
```
df_2 = df_1.loc[:,['ReqMemCPU', 'Elapsed']]
```
%% Cell type:code id: tags:
```
# must run
# sets min and max parameters for ReqMemCPU for histogram of clustered Elapsed Time Graphs
#purple
LowerlimitGB_elapsed_1 = 0
UpperlimitGB_elapsed_1 = 20
LowerlimitElapsed_elapsed_1 = 0
UpperlimitElapsed_elapsed_1 = 30
#red
LowerlimitGB_elapsed_2 = 20
UpperlimitGB_elapsed_2 = 60
LowerlimitElapsed_elapsed_2 = 0
UpperlimitElapsed_elapsed_2 = 25
#green
LowerlimitGB_elapsed_3 = 0
UpperlimitGB_elapsed_3 = 50
LowerlimitElapsed_elapsed_3 = 30
UpperlimitElapsed_elapsed_3 = 150
```
%% Cell type:markdown id: tags:
# Graphing of the Purple Section of Runtime per Requested gigs of RAM Clusters
%% Cell type:code id: tags:
```
# must run
# creates dataset of ReqMemCPU and Elapsed using the min and max parameters created above
df_elapsed_1 = df_2[(df_2['ReqMemCPU'] <= UpperlimitGB_elapsed_1) &
(df_2['ReqMemCPU'] >= LowerlimitGB_elapsed_1) &
(df_2['Elapsed'] <= UpperlimitElapsed_elapsed_1) &
(df_2['Elapsed'] > LowerlimitElapsed_elapsed_1)]
df_elapsed_1.head(10)
#df_elapsed_1.ReqMemCPU.count()
```
%% Cell type:code id: tags:
```
df_swarmplot1 = df_elapsed_1.groupby(['ReqMemCPU','Elapsed']).sum().reset_index()
df_swarmplot1.head(10)
#df_swarmplot1.ReqMemCPU.count()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# clustered graph
style.default_axes_and_ticks() style.default_axes_and_ticks()
style.figsize() style.figsize()
elapsed_runtime_cluster_graph = plt.scatter(df_runtime_cluster['ReqMemCPU'],df_runtime_cluster['Elapsed'], c=kmeans.labels_, cmap='rainbow') elapsed_mem_cluster_graph_1 = plt.scatter(df_elapsed_1['ReqMemCPU'],df_elapsed_1['Elapsed'])
plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='black')
plt.xlabel('ReqMemCPU(gigs)') plt.xlabel('ReqMemCPU(gigs)')
plt.ylabel('Elapsed(hours)') plt.ylabel('Elapsed(hours)')
plt.title('Runtime per Requested gigs of RAM %i gigs or less'%UpperlimitGB_elapsed) plt.title('Runtime per Requested gigs of RAM %i gigs or less'%UpperlimitGB_elapsed_1)
plt.show() plt.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
style.default_axes_and_ticks()
style.figsize()
elapsed_mem_swarm_graph_1 = sns.swarmplot(data=df_swarmplot1, x='ReqMemCPU', y='Elapsed')
plt.margins(0.02)
plt.xlabel('ReqMemCPU(gigs)')
plt.ylabel('Elapsed(hours)')
plt.title('Runtime per Requested gigs of RAM %i gigs or less'%UpperlimitGB_elapsed_1)
plt.show()
```
%% Cell type:markdown id: tags:
# Graphing of the Red Section of Runtime per Requested gigs of RAM Clusters
%% Cell type:code id: tags:
```
# must run
# creates dataset of ReqMemCPU and Elapsed using the min and max parameters created above
df_elapsed_2 = df_2[(df_2['ReqMemCPU'] <= UpperlimitGB_elapsed_2) &
(df_2['ReqMemCPU'] >= LowerlimitGB_elapsed_2) &
(df_2['Elapsed'] <= UpperlimitElapsed_elapsed_2) &
(df_2['Elapsed'] > LowerlimitElapsed_elapsed_2)]
#df_elapsed_2.head(5)
```
%% Cell type:code id: tags:
```
df_swarmplot2 = df_elapsed_2.groupby(['ReqMemCPU','Elapsed']).sum().reset_index()
#df_swarmplot2.head(5)
```
%% Cell type:code id: tags:
```
# must run # must run
# clustered graph
style.default_axes_and_ticks() style.default_axes_and_ticks()
style.figsize() style.figsize()
elapsed_alloc_cluster_graph = plt.scatter(df_runtime_cluster['AllocCPUS'],df_runtime_cluster['Elapsed'], c=kmeans.labels_, cmap='rainbow') elapsed_mem_cluster_graph_2 = plt.scatter(df_elapsed_2['ReqMemCPU'],df_elapsed_2['Elapsed'])
plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='black')
plt.xlabel('AllocCPUS') plt.xlabel('ReqMemCPU(gigs)')
plt.ylabel('Elapsed(hours)') plt.ylabel('Elapsed(hours)')
plt.title('Runtime per Core %i cores or less'%UpperlimitAllocCPU_elapsed) plt.title('Runtime per Requested gigs of RAM %i gigs or less'%UpperlimitGB_elapsed_2)
plt.show()
```
%% Cell type:code id: tags:
```
style.default_axes_and_ticks()
style.figsize()
elapsed_mem_swarm_graph_2 = sns.swarmplot(data=df_swarmplot2, x='ReqMemCPU', y='Elapsed')
plt.margins(0.02)
plt.xlabel('ReqMemCPU(gigs)')
plt.ylabel('Elapsed(hours)')
plt.title('Runtime per Requested gigs of RAM %i gigs or less'%UpperlimitGB_elapsed_2)
plt.show() plt.show()
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Detailed Look at Elapsed Time - In terms of Requested RAM and Cores # Graphing of the Green Section of Runtime per Requested gigs of RAM Clusters
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# second set of min and max parameters for ReqMemCPU to use for AllocCPU/ReqMemCPU cluster graph # creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the min and max parameters created above
UpperlimitGB_alloc = 50 df_elapsed_3 = df_2[(df_2['ReqMemCPU'] <= UpperlimitGB_elapsed_3) &
LowerlimitGB_alloc = 0 (df_2['ReqMemCPU'] >= LowerlimitGB_elapsed_3) &
(df_2['Elapsed'] <= UpperlimitElapsed_elapsed_3) &
(df_2['Elapsed'] > LowerlimitElapsed_elapsed_3)]
#df_elapsed_3.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run df_swarmplot3 = df_elapsed_3.groupby(['ReqMemCPU','Elapsed']).sum().reset_index()
# sets min and max parameters for AllocCPUS #df_swarmplot3.head(5)
UpperlimitAllocCPU_alloc = 60
LowerlimitAllocCPU_alloc = 0
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the min and max parameters created above style.default_axes_and_ticks()
df_allocCPUS_cluster = df_1_sorted[(df_1_sorted['ReqMemCPU'] <= UpperlimitGB_alloc) & (df_1_sorted['ReqMemCPU'] >= LowerlimitGB_alloc) & (df_1_sorted['AllocCPUS'] <= UpperlimitAllocCPU_alloc) & (df_1_sorted['AllocCPUS'] >= LowerlimitAllocCPU_alloc)] style.figsize()
df_allocCPUS.head(5)
elapsed_mem_cluster_graph_3 = plt.scatter(df_elapsed_3['ReqMemCPU'],df_elapsed_3['Elapsed'])
plt.xlabel('ReqMemCPU(gigs)')
plt.ylabel('Elapsed(hours)')
plt.title('Runtime per Requested gigs of RAM %i gigs or less'%UpperlimitGB_elapsed_3)
plt.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
style.default_axes_and_ticks()
style.figsize()
elapsed_mem_swarm_graph_3 = sns.swarmplot(data=df_swarmplot3, x='ReqMemCPU', y='Elapsed')
plt.margins(0.02)
plt.xlabel('ReqMemCPU(gigs)')
plt.ylabel('Elapsed(hours)')
plt.title('Runtime per Requested gigs of RAM %i gigs or less'%UpperlimitGB_elapsed_3)
plt.show()
```
%% Cell type:markdown id: tags:
# Elapsed/AllocCPUS clustering
%% Cell type:code id: tags:
```
# must run # must run
# sets up info for plotting the optimal number of clusters - uses df_runtime_cluster dataset # sets up info for plotting the optimal number of clusters - uses df_runtime_cluster dataset
Sum_of_squared_distances = [] Sum_of_squared_distances = []
K = range(1,10) K = range(1,10)
for k in K: for k in K:
km = KMeans(n_clusters=k) km = KMeans(n_clusters=k)
km = km.fit(df_allocCPUS_cluster) km = km.fit(df_runtime_cluster)
Sum_of_squared_distances.append(km.inertia_) Sum_of_squared_distances.append(km.inertia_)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# the bend in the graph is the optimal number of clusters for graphs using the df_runtime_cluster dataset # the bend in the graph is the optimal number of clusters for graphs using the df_runtime_cluster dataset
plt.plot(K, Sum_of_squared_distances, 'bx-') plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k') plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances') plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k') plt.title('Elbow Method For Optimal k')
plt.show() plt.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# sets to clusters and returns the cluster points # sets to clusters and returns the cluster points
kmeans = KMeans(n_clusters=3, random_state=111) kmeans = KMeans(n_clusters=3, random_state=111)
kmeans.fit(df_allocCPUS_cluster) kmeans.fit(df_runtime_cluster)
print(kmeans.cluster_centers_) print(kmeans.cluster_centers_)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run
# clustered graph
style.default_axes_and_ticks() style.default_axes_and_ticks()
style.figsize() style.figsize()
alloc_reqmem_graph = sns.scatterplot(x="ReqMemCPU", y="AllocCPUS",data=df_allocCPUS_cluster) elapsed_alloc_cluster_graph = plt.scatter(df_runtime_cluster['AllocCPUS'],df_runtime_cluster['Elapsed'], c=kmeans.labels_, cmap='rainbow')
plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='black')
plt.xlabel('AllocCPUS')
plt.ylabel('Elapsed(hours)')
plt.title('Runtime per Core %i cores or less'%UpperlimitAllocCPU_elapsed)
plt.show()
```
%% Cell type:code id: tags:
```
df_3 = df_1.loc[:,['Elapsed', 'AllocCPUS']]
```
%% Cell type:code id: tags:
```
# must run
plt.title('Number of Cores used by Requested RAM %i gigs or less'%UpperlimitGB_alloc) # purple
LowerlimitElapsed_alloc1 = 0
UpperlimitElapsed_alloc1 = 35
LowerlimitAllocCPU_alloc1 = 0
UpperlimitAllocCPU_alloc1 = 40
#red
LowerlimitElapsed_alloc2 = 0
UpperlimitElapsed_alloc2 = 35
LowerlimitAllocCPU_alloc2 = 0
UpperlimitAllocCPU_alloc2 = 16
#green
LowerlimitElapsed_alloc3 = 35
UpperlimitElapsed_alloc3 = 160
LowerlimitAllocCPU_alloc3 = 0
UpperlimitAllocCPU_alloc3 = 32
```
%% Cell type:markdown id: tags:
# Graphing of the Purple Section of Runtime per Core Clusters
%% Cell type:code id: tags:
```
# must run
# creates dataset of Elapsed and AllocCPUS using the min and max parameters created above
df_allocCPUS_1 = df_3[(df_3['Elapsed'] <= UpperlimitElapsed_alloc1) &
(df_3['Elapsed'] >= LowerlimitElapsed_alloc1) &
(df_3['AllocCPUS'] <= UpperlimitAllocCPU_alloc1) &
(df_3['AllocCPUS'] >= LowerlimitAllocCPU_alloc1)]
df_allocCPUS_1.head(5)
```
%% Cell type:code id: tags:
```
df_swarmplot12 = df_allocCPUS_1.groupby(['AllocCPUS','Elapsed']).sum().reset_index()
#df_swarmplot12.head(5)
```
%% Cell type:code id: tags:
```
# must run
style.default_axes_and_ticks()
style.figsize()
elapsed_alloc_cluster_graph_1 = plt.scatter(df_allocCPUS_1['AllocCPUS'],df_allocCPUS_1['Elapsed'])
plt.xlabel('AllocCPUS')
plt.ylabel('Elapsed(hours)')
plt.title('Runtime per Core %i cores or less'%UpperlimitAllocCPU_alloc1)
plt.show()
```
%% Cell type:code id: tags:
```
style.default_axes_and_ticks()
style.figsize()
elapsed_alloc_swarm_graph_1 = sns.swarmplot(data=df_swarmplot12, x='AllocCPUS', y='Elapsed')
plt.margins(0.02)
plt.xlabel('AllocCPUS')
plt.ylabel('Elapsed(hours)')
plt.title('Runtime per Core %i cores or less'%UpperlimitAllocCPU_alloc1)
plt.show()
```
%% Cell type:markdown id: tags:
# Graphing of the Red Section of Runtime per Core Clusters
%% Cell type:code id: tags:
```
# must run
# creates dataset of Elapsed and AllocCPUS using the min and max parameters created above
df_allocCPUS_2 = df_3[(df_3['Elapsed'] <= UpperlimitElapsed_alloc2) &
(df_3['Elapsed'] >= LowerlimitElapsed_alloc2) &
(df_3['AllocCPUS'] <= UpperlimitAllocCPU_alloc2) &
(df_3['AllocCPUS'] >= LowerlimitAllocCPU_alloc2)]
df_allocCPUS_2.head(5)
```
%% Cell type:code id: tags:
```
df_swarmplot22 = df_allocCPUS_2.groupby(['AllocCPUS','Elapsed']).sum().reset_index()
#df_swarmplot22.head(5)
```
%% Cell type:code id: tags:
```
# must run
style.default_axes_and_ticks()
style.figsize()
elapsed_alloc_cluster_graph_2 = plt.scatter(df_allocCPUS_2['AllocCPUS'],df_allocCPUS_2['Elapsed'])
plt.xlabel('AllocCPUS')
plt.ylabel('Elapsed(hours)')
plt.title('Runtime per Core %i cores or less'%UpperlimitAllocCPU_alloc2)
plt.show()
```
%% Cell type:code id: tags:
```
style.default_axes_and_ticks()
style.figsize()
elapsed_alloc_swarm_graph_2 = sns.swarmplot(data=df_swarmplot22, x='AllocCPUS', y='Elapsed')
plt.margins(0.02)
plt.xlabel('AllocCPUS')
plt.ylabel('Elapsed(hours)')
plt.title('Runtime per Core %i cores or less'%UpperlimitAllocCPU_alloc2)
plt.show()
```
%% Cell type:markdown id: tags:
# Graphing of the Green Section of Runtime per Core Clusters
%% Cell type:code id: tags:
```
# must run
# creates the green-section dataset of Elapsed and AllocCPUS from df_3,
# keeping rows inside the *_alloc3 min and max parameters (bounds inclusive)
df_allocCPUS_3 = df_3[(df_3['Elapsed'] <= UpperlimitElapsed_alloc3) &
                      (df_3['Elapsed'] >= LowerlimitElapsed_alloc3) &
                      (df_3['AllocCPUS'] <= UpperlimitAllocCPU_alloc3) &
                      (df_3['AllocCPUS'] >= LowerlimitAllocCPU_alloc3)]
# preview the dataset built in this cell (the cell previously previewed
# df_allocCPUS_2, i.e. the red-section data, by mistake)
df_allocCPUS_3.head(5)
```
%% Cell type:code id: tags:
```
df_swarmplot32 = df_allocCPUS_3.groupby(['AllocCPUS','Elapsed']).sum().reset_index()
#df_swarmplot32.head(5)
```
%% Cell type:code id: tags:
```
# must run
style.default_axes_and_ticks()
style.figsize()
elapsed_alloc_cluster_graph_3 = plt.scatter(df_allocCPUS_3['AllocCPUS'],df_allocCPUS_3['Elapsed'])
plt.xlabel('AllocCPUS')
plt.ylabel('Elapsed(hours)')
plt.title('Runtime per Core %i cores or less'%UpperlimitAllocCPU_alloc3)
plt.show()
```
%% Cell type:code id: tags:
```
style.default_axes_and_ticks()
style.figsize()
elapsed_alloc_swarm_graph_3 = sns.swarmplot(data=df_swarmplot32, x='AllocCPUS', y='Elapsed')
plt.margins(0.02)
plt.xlabel('AllocCPUS')
plt.ylabel('Elapsed(hours)')
plt.title('Runtime per Core %i cores or less'%UpperlimitAllocCPU_alloc3)
plt.show()
```
%% Cell type:markdown id: tags:
# Detailed Look at Cores - In terms of Requested RAM
%% Cell type:code id: tags:
```
LowerlimitGB_core = 0
UpperlimitGB_core = 50
LowerlimitAllocCPU_core = 0
UpperlimitAllocCPU_core = 65
```
%% Cell type:code id: tags:
```
# must run
# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS from df_1, keeping rows
# inside the *_core min and max parameters created above (bounds inclusive)
df_cores_cluster = df_1[(df_1['ReqMemCPU'] <= UpperlimitGB_core) &
                        (df_1['ReqMemCPU'] >= LowerlimitGB_core) &
                        (df_1['AllocCPUS'] <= UpperlimitAllocCPU_core) &
                        (df_1['AllocCPUS'] >= LowerlimitAllocCPU_core)]
# preview the dataset built in this cell (the cell previously previewed
# df_runtime_cluster, which uses different AllocCPUS limits)
df_cores_cluster.head(5)
```
%% Cell type:code id: tags:
```
# must run
# clustered graph
style.default_axes_and_ticks()
style.figsize()
alloc_reqmem_cluster_graph = plt.scatter(df_cores_cluster['ReqMemCPU'],df_cores_cluster['AllocCPUS'])
plt.xlabel('ReqMemCPU(gigs)') plt.xlabel('ReqMemCPU(gigs)')
plt.ylabel('AllocCPUS') plt.ylabel('AllocCPUS')
#plt.yscale("log") plt.title('Number of Cores used by Requested RAM %i gigs or less'%UpperlimitGB_core)
plt.show()
```
%% Cell type:code id: tags:
```
# must run
# sets up info for plotting the optimal number of clusters - uses df_cores_cluster dataset
Sum_of_squared_distances = []
K = range(1,10)
for k in K:
km = KMeans(n_clusters=k)
km = km.fit(df_cores_cluster)
Sum_of_squared_distances.append(km.inertia_)
```
%% Cell type:code id: tags:
```
# must run
# the bend in the graph is the optimal number of clusters for graphs using the df_cores_cluster dataset
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show() plt.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# sets to clusters and returns the cluster points
kmeans = KMeans(n_clusters=5, random_state=111)
kmeans.fit(df_cores_cluster)
print(kmeans.cluster_centers_)
```
%% Cell type:code id: tags:
```
# must run
# clustered graph # clustered graph
style.default_axes_and_ticks() style.default_axes_and_ticks()
style.figsize() style.figsize()
alloc_reqmem_cluster_graph = plt.scatter(df_allocCPUS_cluster['ReqMemCPU'],df_allocCPUS_cluster['AllocCPUS'], c=kmeans.labels_, cmap='rainbow') alloc_reqmem_cluster_graph = plt.scatter(df_cores_cluster['ReqMemCPU'],df_cores_cluster['AllocCPUS'], c=kmeans.labels_, cmap='rainbow')
plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='black') plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='black')
plt.xlabel('ReqMemCPU(gigs)') plt.xlabel('ReqMemCPU(gigs)')
plt.ylabel('AllocCPUS') plt.ylabel('AllocCPUS')
plt.title('Number of Cores used by Requested RAM %i gigs or less'%UpperlimitGB_alloc) plt.title('Number of Cores used by Requested RAM %i gigs or less'%UpperlimitGB_core)
plt.show() plt.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
df_4 = df_1.loc[:,['ReqMemCPU', 'AllocCPUS']]
```
%% Cell type:code id: tags:
```
# must run
# purple
LowerlimitElapsed_alloc1 = 0
UpperlimitElapsed_alloc1 = 35
LowerlimitAllocCPU_alloc1 = 0
UpperlimitAllocCPU_alloc1 = 40
#red
LowerlimitElapsed_alloc2 = 0
UpperlimitElapsed_alloc2 = 35
LowerlimitAllocCPU_alloc2 = 0
UpperlimitAllocCPU_alloc2 = 16
#green
LowerlimitElapsed_alloc3 = 35
UpperlimitElapsed_alloc3 = 160
LowerlimitAllocCPU_alloc3 = 0
UpperlimitAllocCPU_alloc3 = 32
```
%% Cell type:code id: tags:
```
``` ```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment