diff --git a/slurm-2sql.ipynb b/slurm-2sql.ipynb
index 56fbb0c673e1dc5ce5f7d18afefc10e29076b7f6..fb8cecd6b06aefa1de61267f19fd0b676714caac 100644
--- a/slurm-2sql.ipynb
+++ b/slurm-2sql.ipynb
@@ -88,8 +88,9 @@
"source": [
"# must run\n",
"\n",
- "# df_2 is database with only ReqMemCpu and ReqMemNode, and ArrayTaskID\n",
- "df_2 = df_1.loc[:,['JobStep','User', 'JobName','ReqMemCPU', 'ReqMemNode', 'ArrayJobID','ArrayTaskID']]\n",
+ "# df_2 is a dataset with only the JobStep, User, JobName, ReqMemCPU, ArrayJobID, and ArrayTaskID columns\n",
+ "# it is used to pull out needed information and create separate datasets to compare\n",
+ "df_2 = df_1.loc[:,['JobStep','User', 'JobName','ReqMemCPU', 'ArrayJobID','ArrayTaskID']]\n",
"#df_2.head(5)"
]
},
@@ -101,19 +102,19 @@
"source": [
"# must run\n",
"\n",
- "# df_user is df_2 with only user defined jobs\n",
- "df_3 = df_2[df_2['JobStep'].isnull()] # jobs where jobstep is None\n",
- "df_3"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "df_user = df_3.loc[:,['User', 'JobName', 'ReqMemCPU', 'ReqMemNode', 'ArrayJobID','ArrayTaskID']]\n",
- "df_user"
+ "# replaces empty strings with NaN, then drops the rows whose User is NaN so only jobs with a named user remain\n",
+ "\n",
+ "nan_value = float(\"NaN\")\n",
+ "\n",
+ "df_3 = df_1.loc[:,['JobStep','User', 'JobName','ReqMemCPU', 'ArrayJobID','ArrayTaskID']]\n",
+ "\n",
+ "df_3.replace(\"\", nan_value, inplace=True)\n",
+ "\n",
+ "df_3.dropna(subset = [\"User\"], inplace=True)\n",
+ " \n",
+ "# df_user is a dataset consisting of each user and the total amount of RAM per CPU they have requested over all jobs they have run\n",
+ "df_user = df_3.groupby(['User']).sum().reset_index()\n",
+ "#df_user.head(5)"
]
},
{
@@ -126,7 +127,7 @@
"\n",
"# df_batch is df_2 with only batch jobs\n",
"df_batch = df_2.JobName.str.contains('batch')\n",
- "#df_batch"
+ "#df_2[df_batch].head(5)"
]
},
{
@@ -137,16 +138,13 @@
"source": [
"# must run\n",
"\n",
- "# creates database from df_batch of ReqMemCPU batch jobs that are < or = a given point\n",
- "JobsCPU_cutoff = df_2[df_batch][(df_2[df_batch].ReqMemCPU <= upperRAMlimit)]\n",
- "#JobsCPU_cutoff\n",
- "JobsNode_cutoff = df_2[df_batch][(df_2[df_batch].ReqMemNode <= upperRAMlimit)]\n",
+ "# batch_cutoff is the subset of batch jobs in df_2 whose requested RAM per CPU is at most upperRAMlimit (defined above)\n",
+ "batch_cutoff = df_2[df_batch][(df_2[df_batch].ReqMemCPU <= upperRAMlimit)]\n",
+ "#print(batch_cutoff.head(5))\n",
"\n",
- "UsersCPU_cutoff = df_user[(df_user.ReqMemCPU <= upperRAMlimit)]\n",
- "#UsersCPU_cutoff\n",
- "UsersNode_cutoff = df_user[(df_user.ReqMemCPU <= upperRAMlimit)]\n",
- "\n",
- "\n"
+ "# user_cutoff is the subset of df_user whose total requested RAM per CPU is at most upperRAMlimit (defined above)\n",
+ "user_cutoff = df_user[(df_user.ReqMemCPU <= upperRAMlimit)]\n",
+ "#user_cutoff.head(5)"
]
},
{
@@ -155,24 +153,15 @@
"metadata": {},
"outputs": [],
"source": [
- "# voluntary\n",
+ "# voluntary\n",
"\n",
- "# gives mean, min, max, std, and 3 percentiles for cutoff data\n",
- "# can change what to include or exclude\n",
- "JobsCPU_cutoff.describe(include=None, exclude=None)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# voluntary\n",
+ "# all the users who run array jobs, before the 5 gig cutoff is applied - every column of df_user (including ArrayJobID and ArrayTaskID) is nonzero - 16 users out of 230\n",
+ "arrayjobs = df_user[(df_user != 0).all(1)]\n",
+ "print(arrayjobs.head(5))\n",
"\n",
- "# gives mean, min, max, std, and 3 percentiles for cutoff data\n",
- "# can change what to include or exclude\n",
- "JobsNode_cutoff.describe(include=None, exclude=None)"
+ "# all the users who run array jobs, after the 5 gig cutoff is applied - every column of user_cutoff (including ArrayJobID and ArrayTaskID) is nonzero - 1 to 2 users out of 230\n",
+ "arrayjobs_after_cutoff = user_cutoff[(user_cutoff != 0).all(1)]\n",
+ "arrayjobs_after_cutoff"
]
},
{
@@ -185,7 +174,7 @@
"\n",
"# gives mean, min, max, std, and 3 percentiles for cutoff data\n",
"# can change what to include or exclude\n",
- "UsersCPU_cutoff.describe(include=None, exclude=None)"
+ "batch_cutoff.describe(include=None, exclude=None)"
]
},
{
@@ -198,41 +187,7 @@
"\n",
"# gives mean, min, max, std, and 3 percentiles for cutoff data\n",
"# can change what to include or exclude\n",
- "UsersNode_cutoff.describe(include=None, exclude=None)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# msut run\n",
- "\n",
- "# creates databases of Requested Ram per CPU and per Node that have an array task id using the upper RAM limit cutoff\n",
- "JobsCPU_arraytask = JobsCPU_cutoff.dropna(subset=['ArrayTaskID'])\n",
- "JobsNode_arraytask = JobsNode_cutoff.dropna(subset=['ArrayTaskID'])\n",
- "\n",
- "UsersCPU_arraytask = UsersCPU_cutoff.dropna(subset=['ArrayTaskID'])\n",
- "UsersNode_arraytask = UsersNode_cutoff.dropna(subset=['ArrayTaskID'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# must run\n",
- "\n",
- "# creates databases of Requested Ram per CPU and per Node that do not have an array task id using the upper RAM limit cutoff\n",
- "JobsCPU_nonarraytask = JobsCPU_cutoff[JobsCPU_cutoff['ArrayTaskID'].isnull()]\n",
- "JobsNode_nonarraytask = JobsNode_cutoff[JobsNode_cutoff['ArrayTaskID'].isnull()]\n",
- "#JobsCPU_nonarraytask.head(5)\n",
- "\n",
- "UsersCPU_nonarraytask = UsersCPU_cutoff[UsersCPU_cutoff['ArrayTaskID'].isnull()]\n",
- "UsersNode_nonarraytask = UsersNode_cutoff[UsersNode_cutoff['ArrayTaskID'].isnull()]\n",
- "#UsersCPU_nonarraytask.head(5)"
+ "user_cutoff.describe(include=None, exclude=None)"
]
},
{
@@ -247,18 +202,18 @@
"metadata": {},
"source": [
"Graphs: <br>\n",
- " Jobs Requesting RAM per CPU for all Jobs\n",
+ " Number of Jobs Requesting RAM per CPU for all Jobs\n",
" <br>\n",
- " Users Requesting RAM per CPU for all Jobs\n",
+ " Number of Users Requesting RAM per CPU for all Jobs\n",
" <br>\n",
- " Jobs Requesting RAM per CPU for Array Jobs vs Not Array Jobs\n",
+ " Number of Jobs vs Number of Users Requesting RAM per CPU for all Jobs\n",
" <br>\n",
- " Users Requesting RAM per CPU for Array Jobs vs Not Array Jobs\n",
+ " Detailed look at Users Requesting RAM per CPU for All Jobs\n",
" <br>\n",
"\n",
"These graphs create histograms using the data for the month of March 2020.\n",
"The x axis measures the amount of requested RAM in gigs per CPU, from 0 to the max declared in the upperRAMlimit variable above - 5 gigs.\n",
- "The y axis measures how many jobs requested that amount RAM per CPU."
+ "The y axis measures how many jobs/users requested that amount of RAM per CPU."
]
},
{
@@ -267,11 +222,12 @@
"metadata": {},
"outputs": [],
"source": [
- "# shows all user requested cpu memory for array and non array jobs\n",
- "Jobs_fig = sns.distplot(JobsCPU_cutoff['ReqMemCPU'], kde=False, label='Jobs Requesting RAM per CPU for Array and Non Array Jobs', color = \"green\")\n",
+ "# shows the number of jobs requesting cpu memory for all jobs (array and non array jobs)\n",
+ "Jobs_fig = sns.distplot(batch_cutoff['ReqMemCPU'], kde=False, label='Number of Jobs Requesting RAM per CPU for all Jobs', color = \"green\")\n",
"Jobs_fig.set_yscale('log')\n",
+ "\n",
"plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1)\n",
- "plt.title('Jobs Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)\n",
+ "plt.title('Number of Jobs Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)\n",
"plt.xlabel('Requested Gigs of RAM')\n",
"plt.ylabel('Number of Jobs Requesting')"
]
@@ -282,11 +238,12 @@
"metadata": {},
"outputs": [],
"source": [
- "# shows all user requested cpu memory for array and non array jobs\n",
- "Users_fig = sns.distplot(UsersCPU_cutoff['ReqMemCPU'], kde=False, label='Users Requesting RAM per CPU for Array and Non Array Jobs', color = \"green\")\n",
+ "# shows number of users requesting cpu memory for all jobs (array and non array jobs)\n",
+ "Users_fig = sns.distplot(user_cutoff['ReqMemCPU'], kde=False, label='Number of Users Requesting RAM per CPU for all Jobs', color = \"green\")\n",
"Users_fig.set_yscale('log')\n",
+ "\n",
"plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1)\n",
- "plt.title('Users Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)\n",
+ "plt.title('Number of Users Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)\n",
"plt.xlabel('Requested Gigs of RAM')\n",
"plt.ylabel('Number of Users Requesting')"
]
@@ -297,15 +254,15 @@
"metadata": {},
"outputs": [],
"source": [
- "#shows requested cpu memory for array jobs alongside requested cpu memory for non array jobs for easy comparison.\n",
- "Jobs_arraytask_fig = sns.distplot(JobsCPU_arraytask['ReqMemCPU'], kde=False, label='Jobs Requesting RAM per CPU for Array Jobs', color = \"green\")\n",
- "Jobs_arraytask_fig.set_yscale('log')\n",
+ "# shows the number of jobs vs users requesting cpu memory for all jobs (array and non array jobs)\n",
+ "Jobs_fig = sns.distplot(batch_cutoff['ReqMemCPU'], kde=False, label='Number of Jobs Requesting RAM per CPU for all Jobs', color = \"green\")\n",
+ "Jobs_fig.set_yscale('log')\n",
"\n",
- "Jobs_nonarraytask_fig = sns.distplot(JobsCPU_nonarraytask['ReqMemCPU'], kde=False, label='Jobs Requesting RAM per CPU for Non Array Jobs')\n",
- "Jobs_nonarraytask_fig.set_yscale('log')\n",
+ "Users_fig = sns.distplot(user_cutoff['ReqMemCPU'], kde=False, label='Number of Users Requesting RAM per CPU for for all Jobs')\n",
+ "Users_fig.set_yscale('log')\n",
"\n",
- "plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.05, 1.0),ncol=1)\n",
- "plt.title('Jobs Requesting RAM per CPU for Array Jobs vs Not Array Jobs %i gigs or less'%UpperlimitGB)\n",
+ "plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1)\n",
+ "plt.title('Number of Jobs vs Number of Users Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)\n",
"plt.xlabel('Requested Gigs of RAM')\n",
"plt.ylabel('Number of Jobs Requesting')"
]
@@ -316,18 +273,27 @@
"metadata": {},
"outputs": [],
"source": [
- "#shows requested cpu memory for array jobs alongside requested cpu memory for non array jobs for easy comparison.\n",
- "Users_arraytask_fig = sns.distplot(UsersCPU_arraytask['ReqMemCPU'], kde=False, label='Jobs Requesting RAM per CPU for Array Jobs', color = \"green\")\n",
- "Users_arraytask_fig.set_yscale('log')\n",
+ "# shows a more detailed, interactive view of the number of users requesting cpu memory for all jobs (array and non array jobs)\n",
"\n",
- "Users_nonarraytask_fig = sns.distplot(UsersCPU_nonarraytask['ReqMemCPU'], kde=False, label='Jobs Requesting RAM per CPU for Non Array Jobs')\n",
- "Users_nonarraytask_fig.set_yscale('log')\n",
- "\n",
- "plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.05, 1.0),ncol=1)\n",
- "plt.title('Users Requesting RAM per CPU for Array Jobs vs Not Array Jobs %i gigs or less'%UpperlimitGB)\n",
- "plt.xlabel('Requested Gigs of RAM')\n",
- "plt.ylabel('Number of Jobs Requesting')"
+ "Users_fig = px.histogram(user_cutoff, x=\"ReqMemCPU\",\n",
+ " title='Detailed look at Users Requesting RAM per CPU for All Jobs %i gigs or less'%UpperlimitGB,\n",
+ " labels={'ReqMemCPU':'ReqMemCPU'}, # can specify one label per df column\n",
+ " opacity=0.8,\n",
+ " log_y=True, # represent bars with log scale\n",
+ " marginal=\"box\", # can be `box`, `violin`\n",
+ " hover_data=user_cutoff.columns,\n",
+ " nbins=30,\n",
+ " color_discrete_sequence=['goldenrod'] # color of histogram bars\n",
+ " )\n",
+ "Users_fig.show()"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {