diff --git a/slurm-2sql.ipynb b/slurm-2sql.ipynb index 56fbb0c673e1dc5ce5f7d18afefc10e29076b7f6..fb8cecd6b06aefa1de61267f19fd0b676714caac 100644 --- a/slurm-2sql.ipynb +++ b/slurm-2sql.ipynb @@ -88,8 +88,9 @@ "source": [ "# must run\n", "\n", - "# df_2 is database with only ReqMemCpu and ReqMemNode, and ArrayTaskID\n", - "df_2 = df_1.loc[:,['JobStep','User', 'JobName','ReqMemCPU', 'ReqMemNode', 'ArrayJobID','ArrayTaskID']]\n", + "# df_2 is database with only JobStep, User, JobName, ReqMemCPU, ArrayJobID, and ArrayTaskID\n", + "# it is used to pull out needed information and create separate datasets to compare\n", + "df_2 = df_1.loc[:,['JobStep','User', 'JobName','ReqMemCPU', 'ArrayJobID','ArrayTaskID']]\n", "#df_2.head(5)" ] }, @@ -101,19 +102,19 @@ "source": [ "# must run\n", "\n", - "# df_user is df_2 with only user defined jobs\n", - "df_3 = df_2[df_2['JobStep'].isnull()] # jobs where jobstep is None\n", - "df_3" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_user = df_3.loc[:,['User', 'JobName', 'ReqMemCPU', 'ReqMemNode', 'ArrayJobID','ArrayTaskID']]\n", - "df_user" + "# fills empty strings in User column with NaN and then filters them out to give a dataset of users with no empty strings \n", + "\n", + "nan_value = float(\"NaN\")\n", + "\n", + "df_3 = df_1.loc[:,['JobStep','User', 'JobName','ReqMemCPU', 'ArrayJobID','ArrayTaskID']]\n", + "\n", + "df_3.replace(\"\", nan_value, inplace=True)\n", + "\n", + "df_3.dropna(subset = [\"User\"], inplace=True)\n", + " \n", + "# df_user is a dataset consisting of each user and the total amount of RAM per CPU they have requested over all jobs they have run \n", + "df_user = df_3.groupby(['User']).sum().reset_index()\n", + "#df_user.head(5)" ] }, { @@ -126,7 +127,7 @@ "\n", "# df_batch is df_2 with only batch jobs\n", "df_batch = df_2.JobName.str.contains('batch')\n", - "#df_batch" + "#df_2[df_batch].head(5)" ] }, { @@ -137,16 +138,13 @@ "source": [ "# must 
run\n", "\n", - "# creates database from df_batch of ReqMemCPU batch jobs that are < or = a given point\n", - "JobsCPU_cutoff = df_2[df_batch][(df_2[df_batch].ReqMemCPU <= upperRAMlimit)]\n", - "#JobsCPU_cutoff\n", - "JobsNode_cutoff = df_2[df_batch][(df_2[df_batch].ReqMemNode <= upperRAMlimit)]\n", + "# creates database from df_batch that returns all RAM per CPU requested up to the UpperRAMlimit defined above\n", + "batch_cutoff = df_2[df_batch][(df_2[df_batch].ReqMemCPU <= upperRAMlimit)]\n", + "#print(batch_cutoff.head(5))\n", "\n", - "UsersCPU_cutoff = df_user[(df_user.ReqMemCPU <= upperRAMlimit)]\n", - "#UsersCPU_cutoff\n", - "UsersNode_cutoff = df_user[(df_user.ReqMemCPU <= upperRAMlimit)]\n", - "\n", - "\n" + "# creates database from df_user that returns all RAM per CPU requested up to the upperRAMlimit defined above\n", + "user_cutoff = df_user[(df_user.ReqMemCPU <= upperRAMlimit)]\n", + "#user_cutoff.head(5)" ] }, { @@ -155,24 +153,15 @@ "metadata": {}, "outputs": [], "source": [ - "# voluntary\n", + "#voluntary\n", "\n", - "# gives mean, min, max, std, and 3 percentiles for cutoff data\n", - "# can change what to include or exclude\n", - "JobsCPU_cutoff.describe(include=None, exclude=None)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# voluntary\n", + "# all the users who run array jobs before the 5 gig cutoff - ArrayJobID and ArrayTaskID > 0 - 16 users out of 230\n", + "arrayjobs = df_user[(df_user != 0).all(1)]\n", + "print(arrayjobs.head(5))\n", "\n", - "# gives mean, min, max, std, and 3 percentiles for cutoff data\n", - "# can change what to include or exclude\n", - "JobsNode_cutoff.describe(include=None, exclude=None)" + "# all the users who run array jobs after the 5 gig cutoff - ArrayJobID and ArrayTaskID > 0 - 1 to 2 users out of 230\n", + "arrayjobs_after_cutoff = user_cutoff[(user_cutoff != 0).all(1)]\n", + "arrayjobs_after_cutoff" ] }, { @@ -185,7 +174,7 @@ "\n", "# gives 
mean, min, max, std, and 3 percentiles for cutoff data\n", "# can change what to include or exclude\n", - "UsersCPU_cutoff.describe(include=None, exclude=None)" + "batch_cutoff.describe(include=None, exclude=None)" ] }, { @@ -198,41 +187,7 @@ "\n", "# gives mean, min, max, std, and 3 percentiles for cutoff data\n", "# can change what to include or exclude\n", - "UsersNode_cutoff.describe(include=None, exclude=None)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# msut run\n", - "\n", - "# creates databases of Requested Ram per CPU and per Node that have an array task id using the upper RAM limit cutoff\n", - "JobsCPU_arraytask = JobsCPU_cutoff.dropna(subset=['ArrayTaskID'])\n", - "JobsNode_arraytask = JobsNode_cutoff.dropna(subset=['ArrayTaskID'])\n", - "\n", - "UsersCPU_arraytask = UsersCPU_cutoff.dropna(subset=['ArrayTaskID'])\n", - "UsersNode_arraytask = UsersNode_cutoff.dropna(subset=['ArrayTaskID'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# must run\n", - "\n", - "# creates databases of Requested Ram per CPU and per Node that do not have an array task id using the upper RAM limit cutoff\n", - "JobsCPU_nonarraytask = JobsCPU_cutoff[JobsCPU_cutoff['ArrayTaskID'].isnull()]\n", - "JobsNode_nonarraytask = JobsNode_cutoff[JobsNode_cutoff['ArrayTaskID'].isnull()]\n", - "#JobsCPU_nonarraytask.head(5)\n", - "\n", - "UsersCPU_nonarraytask = UsersCPU_cutoff[UsersCPU_cutoff['ArrayTaskID'].isnull()]\n", - "UsersNode_nonarraytask = UsersNode_cutoff[UsersNode_cutoff['ArrayTaskID'].isnull()]\n", - "#UsersCPU_nonarraytask.head(5)" + "user_cutoff.describe(include=None, exclude=None)" ] }, { @@ -247,18 +202,18 @@ "metadata": {}, "source": [ "Graphs: <br>\n", - " Jobs Requesting RAM per CPU for all Jobs\n", + " Number of Jobs Requesting RAM per CPU for all Jobs\n", " <br>\n", - " Users Requesting RAM per CPU for all Jobs\n", + " Number 
of Users Requesting RAM per CPU for all Jobs\n", " <br>\n", - " Jobs Requesting RAM per CPU for Array Jobs vs Not Array Jobs\n", + " Number of Jobs vs Number of Users Requesting RAM per CPU for all Jobs\n", " <br>\n", - " Users Requesting RAM per CPU for Array Jobs vs Not Array Jobs\n", + " Detailed look at Users Requesting RAM per CPU for All Jobs\n", " <br>\n", "\n", "These graphs create histograms using the data for the month of March 2020.\n", "The x axis measures the amount of requested RAM in gigs per CPU, from 0 to the max declared in the upperRAMlimit variable above - 5 gigs.\n", - "The y axis measures how many jobs requested that amount RAM per CPU." + "The y axis measures how many jobs/users requested that amount of RAM per CPU." ] }, { @@ -267,11 +222,12 @@ "metadata": {}, "outputs": [], "source": [ - "# shows all user requested cpu memory for array and non array jobs\n", - "Jobs_fig = sns.distplot(JobsCPU_cutoff['ReqMemCPU'], kde=False, label='Jobs Requesting RAM per CPU for Array and Non Array Jobs', color = \"green\")\n", + "# shows the number of jobs requesting cpu memory for all jobs (array and non array jobs)\n", + "Jobs_fig = sns.distplot(batch_cutoff['ReqMemCPU'], kde=False, label='Number of Jobs Requesting RAM per CPU for all Jobs', color = \"green\")\n", "Jobs_fig.set_yscale('log')\n", + "\n", "plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1)\n", - "plt.title('Jobs Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)\n", + "plt.title('Number of Jobs Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)\n", "plt.xlabel('Requested Gigs of RAM')\n", "plt.ylabel('Number of Jobs Requesting')" ] @@ -282,11 +238,12 @@ "metadata": {}, "outputs": [], "source": [ - "# shows all user requested cpu memory for array and non array jobs\n", - "Users_fig = sns.distplot(UsersCPU_cutoff['ReqMemCPU'], kde=False, label='Users Requesting RAM per CPU for Array and Non Array Jobs', color = \"green\")\n", + "# 
shows number of users requesting cpu memory for all jobs (array and non array jobs)\n", + "Users_fig = sns.distplot(user_cutoff['ReqMemCPU'], kde=False, label='Number of Users Requesting RAM per CPU for all Jobs', color = \"green\")\n", "Users_fig.set_yscale('log')\n", + "\n", "plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1)\n", - "plt.title('Users Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)\n", + "plt.title('Number of Users Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)\n", "plt.xlabel('Requested Gigs of RAM')\n", "plt.ylabel('Number of Users Requesting')" ] @@ -297,15 +254,15 @@ "metadata": {}, "outputs": [], "source": [ - "#shows requested cpu memory for array jobs alongside requested cpu memory for non array jobs for easy comparison.\n", - "Jobs_arraytask_fig = sns.distplot(JobsCPU_arraytask['ReqMemCPU'], kde=False, label='Jobs Requesting RAM per CPU for Array Jobs', color = \"green\")\n", - "Jobs_arraytask_fig.set_yscale('log')\n", + "# shows the number of jobs vs users requesting cpu memory for all jobs (array and non array jobs)\n", + "Jobs_fig = sns.distplot(batch_cutoff['ReqMemCPU'], kde=False, label='Number of Jobs Requesting RAM per CPU for all Jobs', color = \"green\")\n", + "Jobs_fig.set_yscale('log')\n", "\n", - "Jobs_nonarraytask_fig = sns.distplot(JobsCPU_nonarraytask['ReqMemCPU'], kde=False, label='Jobs Requesting RAM per CPU for Non Array Jobs')\n", - "Jobs_nonarraytask_fig.set_yscale('log')\n", + "Users_fig = sns.distplot(user_cutoff['ReqMemCPU'], kde=False, label='Number of Users Requesting RAM per CPU for for all Jobs')\n", + "Users_fig.set_yscale('log')\n", "\n", - "plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.05, 1.0),ncol=1)\n", - "plt.title('Jobs Requesting RAM per CPU for Array Jobs vs Not Array Jobs %i gigs or less'%UpperlimitGB)\n", + "plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1)\n", + "plt.title('Number 
of Jobs vs Number of Users Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)\n", "plt.xlabel('Requested Gigs of RAM')\n", "plt.ylabel('Number of Jobs Requesting')" ] @@ -316,18 +273,27 @@ "metadata": {}, "outputs": [], "source": [ - "#shows requested cpu memory for array jobs alongside requested cpu memory for non array jobs for easy comparison.\n", - "Users_arraytask_fig = sns.distplot(UsersCPU_arraytask['ReqMemCPU'], kde=False, label='Jobs Requesting RAM per CPU for Array Jobs', color = \"green\")\n", - "Users_arraytask_fig.set_yscale('log')\n", + "# shows a more detailed, interactive view of the number of users requesting cpu memory for all jobs (array and non array jobs)\n", "\n", - "Users_nonarraytask_fig = sns.distplot(UsersCPU_nonarraytask['ReqMemCPU'], kde=False, label='Jobs Requesting RAM per CPU for Non Array Jobs')\n", - "Users_nonarraytask_fig.set_yscale('log')\n", - "\n", - "plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.05, 1.0),ncol=1)\n", - "plt.title('Users Requesting RAM per CPU for Array Jobs vs Not Array Jobs %i gigs or less'%UpperlimitGB)\n", - "plt.xlabel('Requested Gigs of RAM')\n", - "plt.ylabel('Number of Jobs Requesting')" + "Users_fig = px.histogram(user_cutoff, x=\"ReqMemCPU\",\n", + " title='Detailed look at Users Requesting RAM per CPU for All Jobs %i gigs or less'%UpperlimitGB,\n", + " labels={'ReqMemCPU':'ReqMemCPU'}, # can specify one label per df column\n", + " opacity=0.8,\n", + " log_y=True, # represent bars with log scale\n", + " marginal=\"box\", # can be `box`, `violin`\n", + " hover_data=user_cutoff.columns,\n", + " nbins=30,\n", + " color_discrete_sequence=['goldenrod'] # color of histogram bars\n", + " )\n", + "Users_fig.show()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {