Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
C
createAndParseSACCT
Manage
Activity
Members
Labels
Plan
Issues
82
Issue boards
Milestones
Wiki
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Ryan Randles Jones
createAndParseSACCT
Commits
26c21463
Commit
26c21463
authored
4 years ago
by
Ryan Randles Jones
Browse files
Options
Downloads
Patches
Plain Diff
added documentation
parent
384ff64c
No related branches found
No related tags found
1 merge request
!1
Kmeans clustering
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
Runtime-and-CoreCount.ipynb
+395
-0
395 additions, 0 deletions
Runtime-and-CoreCount.ipynb
with
395 additions
and
0 deletions
Runtime-and-CoreCount.ipynb
0 → 100644
+
395
−
0
View file @
26c21463
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Notebook Setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"import sqlite3\n",
"import slurm2sql\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"import seaborn as sns\n",
"import seaborn as sb\n",
"import plotly.express as px\n",
"import matplotlib.ticker as ticker\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"from RC_styles import rc_styles as style"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"from sklearn.cluster import KMeans"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# opens a connection to the existing sqlite3 database of info from March 2020\n",
"db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# df is starting database\n",
"df = pd.read_sql('SELECT * FROM slurm', db)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# voluntary\n",
"\n",
"# for displaying all available column options\n",
"pd.set_option('display.max_columns', None)\n",
"df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# converts units in ReqMemCPU column from bytes to gigs\n",
"df['ReqMemCPU'] = df['ReqMemCPU'].div(1024**3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# converts Elapsed time to hours (from seconds)\n",
"df['Elapsed'] = df['Elapsed'].div(3600)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# df_completed is dataframe of all completed jobs\n",
"df_completed = df[df.State.str.contains('COMPLETED')]\n",
"#df_completed.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ReqMemCPU,Corecount,Runtime"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# sets min and max parameters for ReqMemCPU\n",
"UpperlimitGB = 50\n",
"LowerlimitGB = 0"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# sets min and max parameters for AllocCPUS\n",
"UpperlimitAllocCPU = 20\n",
"LowerlimitAllocCPU = 0"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# dataset of needed columns for all graphs below\n",
"df_1 = df_completed.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']]\n",
"df_1.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# rounds ReqMemCPU up to nearest whole number\n",
"df_1['ReqMemCPU'] = df_1['ReqMemCPU'].apply(np.ceil)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# rounds Elapsed to 2 decimal places\n",
"df_1['Elapsed'] = df_1['Elapsed'].round(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# sorts dataset by AllocCPUS for easy visualization\n",
"df_1_sorted = df_1.sort_values(by='AllocCPUS', ascending=True)\n",
"df_1_sorted.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the min and max parameters created above\n",
"df_runtime = df_1_sorted[(df_1_sorted['ReqMemCPU'] <= UpperlimitGB) & (df_1_sorted['ReqMemCPU'] >= LowerlimitGB) & (df_1_sorted['AllocCPUS'] <= UpperlimitAllocCPU) & (df_1_sorted['AllocCPUS'] >= LowerlimitAllocCPU)]\n",
"df_runtime.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# creates a facet grid from df_runtime dataset\n",
"# Elapsed time in hours and ReqMemCPU in gigs\n",
"style.default_axes_and_ticks()\n",
"style.figsize()\n",
"\n",
"full_facet = sb.PairGrid(df_runtime)\n",
"full_facet.map(plt.scatter);\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"style.default_axes_and_ticks()\n",
"style.figsize()\n",
"\n",
"runtime_graph = sns.scatterplot(x=\"ReqMemCPU\", y=\"AllocCPUS\",data=df_runtime)\n",
"\n",
"plt.title('Number of Cores used by Requested RAM %i gigs or less'%UpperlimitGB)\n",
"\n",
"plt.xlabel('ReqMemCPU(gigs)')\n",
"plt.ylabel('AllocCPUS')\n",
"#plt.yscale(\"log\")\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the min and max parameters created above for clustering\n",
"df_runtime_cluster = df_1_sorted[(df_1_sorted['ReqMemCPU'] <= UpperlimitGB) & (df_1_sorted['ReqMemCPU'] >= LowerlimitGB) & (df_1_sorted['AllocCPUS'] <= UpperlimitAllocCPU) & (df_1_sorted['AllocCPUS'] >= LowerlimitAllocCPU)]\n",
"df_runtime_cluster.tail(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# sets up info for plotting the optimal number of clusters - uses df_runtime_cluster dataset\n",
"Sum_of_squared_distances = []\n",
"K = range(1,10)\n",
"for k in K:\n",
" km = KMeans(n_clusters=k)\n",
" km = km.fit(df_runtime_cluster)\n",
" Sum_of_squared_distances.append(km.inertia_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# the bend in the graph is the optimal number of clusters for graphs using the df_runtime_cluster dataset\n",
"plt.plot(K, Sum_of_squared_distances, 'bx-')\n",
"plt.xlabel('k')\n",
"plt.ylabel('Sum_of_squared_distances')\n",
"plt.title('Elbow Method For Optimal k')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# sets to clusters and returns the cluster points\n",
"kmeans = KMeans(n_clusters=3, random_state=111)\n",
"kmeans.fit(df_runtime_cluster)\n",
"print(kmeans.cluster_centers_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# facet grid of the two graphs being clustered using df_runtime_cluster dataset\n",
"style.default_axes_and_ticks()\n",
"style.figsize()\n",
"\n",
"reqmem_alloc = sns.PairGrid(df_runtime_cluster, y_vars=[\"Elapsed\"], x_vars=[\"ReqMemCPU\", \"AllocCPUS\"], height=4)\n",
"reqmem_alloc.map(sns.regplot, color=\"blue\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# clustered graph\n",
"style.default_axes_and_ticks()\n",
"style.figsize()\n",
"\n",
"runtime_cluster_graph = plt.scatter(df_runtime_cluster['ReqMemCPU'],df_runtime_cluster['Elapsed'], c=kmeans.labels_, cmap='rainbow')\n",
"plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='black')\n",
"\n",
"plt.xlabel('ReqMemCPU(gigs)')\n",
"plt.ylabel('Elapsed(hours)')\n",
"plt.title('Runtime per Requested gigs of RAM %i gigs or less'%UpperlimitGB)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# clustered graph\n",
"style.default_axes_and_ticks()\n",
"style.figsize()\n",
"\n",
"alloc_cluster_graph = plt.scatter(df_runtime_cluster['AllocCPUS'],df_runtime_cluster['Elapsed'], c=kmeans.labels_, cmap='rainbow')\n",
"plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='black')\n",
"\n",
"plt.xlabel('AllocCPUS')\n",
"plt.ylabel('Elapsed(hours)')\n",
"plt.title('Runtime per Core %i cores or less'%UpperlimitAllocCPU)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"language_info": {
"name": "python",
"pygments_lexer": "ipython3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
%% Cell type:markdown id: tags:
# Notebook Setup
%% Cell type:code id: tags:
```
# must run
import sqlite3
import slurm2sql
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import seaborn as sb
import plotly.express as px
import matplotlib.ticker as ticker
import numpy as np
```
%% Cell type:code id: tags:
```
# must run
from RC_styles import rc_styles as style
```
%% Cell type:code id: tags:
```
# must run
from sklearn.cluster import KMeans
```
%% Cell type:code id: tags:
```
# must run
# Opens a read-only connection to the existing sqlite3 database of info from
# March 2020. A plain sqlite3.connect(path) silently creates an empty database
# file when the path is wrong; the file: URI with mode=ro makes a missing file
# raise sqlite3.OperationalError instead, and prevents accidental writes to the
# shared data file (this notebook only reads from it).
db = sqlite3.connect('file:/data/rc/rc-team/slurm-since-March.sqlite3?mode=ro', uri=True)
```
%% Cell type:code id: tags:
```
# must run
# df is the starting dataframe: every column of every row in the `slurm` table,
# read through the open `db` connection.
slurm_query = 'SELECT * FROM slurm'
df = pd.read_sql(slurm_query, db)
```
%% Cell type:code id: tags:
```
# voluntary
# Lift the displayed-column limit so every available column is shown,
# then preview the first five rows (head() defaults to 5).
pd.set_option('display.max_columns', None)
df.head()
```
%% Cell type:code id: tags:
```
# must run
# Converts the ReqMemCPU column from bytes to gigs (1 gig = 1024**3 bytes).
# The column is overwritten in place, so re-running this cell divides again.
bytes_per_gig = 1024**3
df['ReqMemCPU'] = df['ReqMemCPU'] / bytes_per_gig
```
%% Cell type:code id: tags:
```
# must run
# Converts the Elapsed column from seconds to hours.
# The column is overwritten in place, so re-running this cell divides again.
seconds_per_hour = 3600
df['Elapsed'] = df['Elapsed'] / seconds_per_hour
```
%% Cell type:code id: tags:
```
# must run
# df_completed is the dataframe of all jobs whose State contains 'COMPLETED'.
# regex=False: 'COMPLETED' is matched as a plain substring.
# na=False: rows with a missing State count as "not completed"; without it the
# mask would contain NaN for those rows and boolean indexing would raise.
completed_mask = df['State'].str.contains('COMPLETED', regex=False, na=False)
df_completed = df[completed_mask]
```
%% Cell type:markdown id: tags:
# ReqMemCPU,Corecount,Runtime
%% Cell type:code id: tags:
```
# must run
# Inclusive bounds (in gigs) applied to ReqMemCPU when filtering jobs below.
LowerlimitGB, UpperlimitGB = 0, 50
```
%% Cell type:code id: tags:
```
# must run
# Inclusive bounds applied to AllocCPUS when filtering jobs below.
LowerlimitAllocCPU, UpperlimitAllocCPU = 0, 20
```
%% Cell type:code id: tags:
```
# must run
# df_1 keeps only the three columns needed by all graphs below, in this order:
# ReqMemCPU, Elapsed, AllocCPUS.
needed_columns = ['ReqMemCPU', 'Elapsed', 'AllocCPUS']
df_1 = df_completed.loc[:, needed_columns]
df_1.head()
```
%% Cell type:code id: tags:
```
# must run
# Rounds ReqMemCPU up to the next whole number (ceiling), applied to the
# whole column at once.
df_1['ReqMemCPU'] = np.ceil(df_1['ReqMemCPU'])
```
%% Cell type:code id: tags:
```
# must run
# Rounds Elapsed to 2 decimal places (round-to-nearest, not round-up).
df_1['Elapsed'] = df_1['Elapsed'].round(decimals=2)
```
%% Cell type:code id: tags:
```
# must run
# Orders the rows by AllocCPUS, smallest first (ascending is the default),
# for easy visualization, then previews the first five rows.
df_1_sorted = df_1.sort_values('AllocCPUS')
df_1_sorted.head()
```
%% Cell type:code id: tags:
```
# must run
# df_runtime: rows of df_1_sorted whose ReqMemCPU and AllocCPUS both fall
# inside the inclusive min/max limits set above.
req_mem = df_1_sorted['ReqMemCPU']
alloc_cpus = df_1_sorted['AllocCPUS']
mem_in_range = (req_mem >= LowerlimitGB) & (req_mem <= UpperlimitGB)
cpus_in_range = (alloc_cpus >= LowerlimitAllocCPU) & (alloc_cpus <= UpperlimitAllocCPU)
df_runtime = df_1_sorted[mem_in_range & cpus_in_range]
df_runtime.head()
```
%% Cell type:code id: tags:
```
# must run
# Pairwise scatter grid over every column of df_runtime
# (Elapsed is in hours, ReqMemCPU in gigs).
# The two style calls come from the project's RC_styles helper
# (presumably they set the shared axes/tick and figure-size defaults).
style.default_axes_and_ticks()
style.figsize()

full_facet = sb.PairGrid(df_runtime)
full_facet.map(plt.scatter)

plt.show()
```
%% Cell type:code id: tags:
```
# Scatter of allocated cores against requested memory per CPU for df_runtime.
# The two style calls come from the project's RC_styles helper
# (presumably they set the shared axes/tick and figure-size defaults).
style.default_axes_and_ticks()
style.figsize()

runtime_graph = sns.scatterplot(data=df_runtime, x="ReqMemCPU", y="AllocCPUS")

# The title reports the memory cap used when building df_runtime.
cores_title = 'Number of Cores used by Requested RAM %i gigs or less' % UpperlimitGB
plt.title(cores_title)
plt.xlabel('ReqMemCPU(gigs)')
plt.ylabel('AllocCPUS')
# Optional: uncomment for a logarithmic y axis.
# plt.yscale("log")

plt.show()
```
%% Cell type:code id: tags:
```
# must run
# df_runtime_cluster: the dataset used for clustering. Same inclusive
# ReqMemCPU / AllocCPUS limits as above, applied to df_1_sorted.
cluster_row_mask = (
    (df_1_sorted['ReqMemCPU'] >= LowerlimitGB)
    & (df_1_sorted['ReqMemCPU'] <= UpperlimitGB)
    & (df_1_sorted['AllocCPUS'] >= LowerlimitAllocCPU)
    & (df_1_sorted['AllocCPUS'] <= UpperlimitAllocCPU)
)
df_runtime_cluster = df_1_sorted[cluster_row_mask]
df_runtime_cluster.tail()
```
%% Cell type:code id: tags:
```
# must run
# Gathers the data for the elbow plot: for each candidate cluster count
# k = 1..9, fit k-means on df_runtime_cluster and record the model's inertia_.
# random_state=111 matches the seed used for the final model below, so the
# elbow curve is reproducible from run to run.
K = range(1, 10)
Sum_of_squared_distances = [
    KMeans(n_clusters=k, random_state=111).fit(df_runtime_cluster).inertia_
    for k in K
]
```
%% Cell type:code id: tags:
```
# must run
# Elbow plot: the bend in the curve marks the optimal number of clusters
# for graphs that use the df_runtime_cluster dataset.
elbow_format = 'bx-'  # blue line with x markers
plt.plot(K, Sum_of_squared_distances, elbow_format)

plt.title('Elbow Method For Optimal k')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')

plt.show()
```
%% Cell type:code id: tags:
```
# must run
# Fits the final 3-cluster model on df_runtime_cluster (fixed seed) and
# prints the coordinates of the cluster centres.
kmeans = KMeans(n_clusters=3, random_state=111).fit(df_runtime_cluster)
cluster_centres = kmeans.cluster_centers_
print(cluster_centres)
```
%% Cell type:code id: tags:
```
# must run
# One-row grid of the two relationships being clustered: Elapsed against
# ReqMemCPU and against AllocCPUS, each drawn as a regression plot.
# The two style calls come from the project's RC_styles helper
# (presumably they set the shared axes/tick and figure-size defaults).
style.default_axes_and_ticks()
style.figsize()

reqmem_alloc = sns.PairGrid(
    df_runtime_cluster,
    x_vars=["ReqMemCPU", "AllocCPUS"],
    y_vars=["Elapsed"],
    height=4,
)
reqmem_alloc.map(sns.regplot, color="blue")
```
%% Cell type:code id: tags:
```
# must run
# Clustered graph: Elapsed against ReqMemCPU, one colour per k-means label,
# with the cluster centres overlaid in black.
# The two style calls come from the project's RC_styles helper
# (presumably they set the shared axes/tick and figure-size defaults).
style.default_axes_and_ticks()
style.figsize()

runtime_cluster_graph = plt.scatter(
    df_runtime_cluster['ReqMemCPU'],
    df_runtime_cluster['Elapsed'],
    c=kmeans.labels_,
    cmap='rainbow',
)

# df_runtime_cluster's columns are ReqMemCPU, Elapsed, AllocCPUS, so centre
# columns 0 and 1 are the ReqMemCPU and Elapsed coordinates.
centres = kmeans.cluster_centers_
plt.scatter(centres[:, 0], centres[:, 1], color='black')

plt.title('Runtime per Requested gigs of RAM %i gigs or less' % UpperlimitGB)
plt.xlabel('ReqMemCPU(gigs)')
plt.ylabel('Elapsed(hours)')
plt.show()
```
%% Cell type:code id: tags:
```
# must run
# Clustered graph: Elapsed against AllocCPUS, one colour per k-means label,
# with the cluster centres overlaid in black.
# The two style calls come from the project's RC_styles helper
# (presumably they set the shared axes/tick and figure-size defaults).
style.default_axes_and_ticks()
style.figsize()

alloc_cluster_graph = plt.scatter(
    df_runtime_cluster['AllocCPUS'],
    df_runtime_cluster['Elapsed'],
    c=kmeans.labels_,
    cmap='rainbow',
)

# The centres have one column per feature, in the column order of the frame
# the model was fitted on. Look the positions up by name: the previous code
# reused column 0 (ReqMemCPU) as the x coordinate, which put the centres at
# the wrong place on this AllocCPUS axis.
feature_names = list(df_runtime_cluster.columns)
alloc_col = feature_names.index('AllocCPUS')
elapsed_col = feature_names.index('Elapsed')
centres = kmeans.cluster_centers_
plt.scatter(centres[:, alloc_col], centres[:, elapsed_col], color='black')

plt.xlabel('AllocCPUS')
plt.ylabel('Elapsed(hours)')
plt.title('Runtime per Core %i cores or less' % UpperlimitAllocCPU)
plt.show()
```
%% Cell type:code id: tags:
```
```
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment