diff --git a/Cluster_Analysis.ipynb b/Cluster_Analysis.ipynb index 45b46e4b3937ab5983871abd60d5cb3a87b6df30..6cca51e9f0f184a7b8d4ad842030f27ce3b98fae 100644 --- a/Cluster_Analysis.ipynb +++ b/Cluster_Analysis.ipynb @@ -217,7 +217,7 @@ "outputs": [], "source": [ "#creating a database based on the start date\n", - "slurm2sql.slurm2sql(db, ['-S',start_date, '-E', end_date,'-a'])" + "slurm2sql.slurm2sql(db, ['-S',start_date, '-E', end_date,'-X', '-a']) #-X is allocations, -a is all users" ] }, { @@ -256,7 +256,7 @@ "source": [ "# df_1 is dataframe of all completed jobs\n", "df_1 = df[df.State.str.contains('COMPLETED')]\n", - "df_1.head(5)" + "df_1.head(20)" ] }, { @@ -304,8 +304,7 @@ " (df_completed['AllocCPUS'] >= LowerlimitAllocCPU)\n", " & \n", " (df_completed['Elapsed'] <= UpperlimitElapsed) & \n", - " (df_completed['Elapsed'] >= LowerlimitElapsed)]\n", - "df_clustering.head(5)" + " (df_completed['Elapsed'] >= LowerlimitElapsed)]" ] }, { @@ -415,7 +414,7 @@ "else:\n", " clusterpoints = kmeans_cluster.cluster_centers_\n", " print(\"none\")\n", - " print(clusterpoints[:,0],clusterpoints[:,1])\n" + " print(clusterpoints[:,0],clusterpoints[:,1])" ] }, { @@ -434,16 +433,16 @@ "# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the parameters in the labels shown above\n", "\n", "#Purple\n", - "df_0 = df_clustering[kmeans_cluster.labels_ == 0]\n", + "cluster_0 = df_clustering[kmeans_cluster.labels_ == 0]\n", "\n", "#Green\n", - "df_1 = df_clustering[kmeans_cluster.labels_ == 1]\n", + "cluster_1 = df_clustering[kmeans_cluster.labels_ == 1]\n", "\n", "#Yellow\n", - "df_2 = df_clustering[kmeans_cluster.labels_ == 2]\n", + "cluster_2 = df_clustering[kmeans_cluster.labels_ == 2]\n", "\n", "#Red\n", - "df_3 = df_clustering[kmeans_cluster.labels_ == 3]" + "cluster_3 = df_clustering[kmeans_cluster.labels_ == 3]" ] }, { @@ -455,24 +454,24 @@ "# returns the min and max ReqMemCPU, Elapsed, and AllocCPUS for each cluster using the datasets created above. \n", "# These are the parameters for the scatter plots of each cluster\n", "print(\"Purple Cluster\")\n", - "print(\"ReqMemCPU:\", \"min =\",df_0.ReqMemCPU.min(),\" \",\"max =\",df_0.ReqMemCPU.max())\n", - "print(\"Elapsed:\", \"min =\",df_0.Elapsed.min(),\" \",\"max =\",df_0.Elapsed.max())\n", - "print(\"AllocCPUS:\", \"min =\",df_0.AllocCPUS.min(),\" \",\"max =\",df_0.AllocCPUS.max())\n", + "print(\"ReqMemCPU:\", \"min =\",cluster_0.ReqMemCPU.min(),\" \",\"max =\",cluster_0.ReqMemCPU.max())\n", + "print(\"Elapsed:\", \"min =\",cluster_0.Elapsed.min(),\" \",\"max =\",cluster_0.Elapsed.max())\n", + "print(\"AllocCPUS:\", \"min =\",cluster_0.AllocCPUS.min(),\" \",\"max =\",cluster_0.AllocCPUS.max())\n", "\n", "print(\"\\nBlue Cluster\")\n", - "print(\"ReqMemCPU:\", \"min =\",df_1.ReqMemCPU.min(),\" \",\"max =\",df_1.ReqMemCPU.max())\n", - "print(\"Elapsed:\", \"min =\",df_1.Elapsed.min(),\" \",\"max =\",df_1.Elapsed.max())\n", - "print(\"AllocCPUS:\", \"min =\",df_1.AllocCPUS.min(),\" \",\"max =\",df_1.AllocCPUS.max())\n", + "print(\"ReqMemCPU:\", \"min =\",cluster_1.ReqMemCPU.min(),\" \",\"max =\",cluster_1.ReqMemCPU.max())\n", + "print(\"Elapsed:\", \"min =\",cluster_1.Elapsed.min(),\" \",\"max =\",cluster_1.Elapsed.max())\n", + "print(\"AllocCPUS:\", \"min =\",cluster_1.AllocCPUS.min(),\" \",\"max =\",cluster_1.AllocCPUS.max())\n", "\n", "print(\"\\nYellow Cluster\")\n", - "print(\"ReqMemCPU:\", \"min =\",df_2.ReqMemCPU.min(),\" \",\"max =\",df_2.ReqMemCPU.max())\n", - "print(\"Elapsed:\", \"min =\",df_2.Elapsed.min(),\" \",\"max =\",df_2.Elapsed.max())\n", - "print(\"AllocCPUS:\", \"min =\",df_2.AllocCPUS.min(),\" \",\"max =\",df_2.AllocCPUS.max())\n", + "print(\"ReqMemCPU:\", \"min =\",cluster_2.ReqMemCPU.min(),\" \",\"max =\",cluster_2.ReqMemCPU.max())\n", + "print(\"Elapsed:\", \"min =\",cluster_2.Elapsed.min(),\" \",\"max =\",cluster_2.Elapsed.max())\n", + "print(\"AllocCPUS:\", \"min =\",cluster_2.AllocCPUS.min(),\" \",\"max =\",cluster_2.AllocCPUS.max())\n", "\n", "print(\"\\nRed Cluster\")\n", - "print(\"ReqMemCPU:\", \"min =\",df_3.ReqMemCPU.min(),\" \",\"max =\",df_3.ReqMemCPU.max())\n", - "print(\"Elapsed:\", \"min =\",df_3.Elapsed.min(),\" \",\"max =\",df_3.Elapsed.max())\n", - "print(\"AllocCPUS:\", \"min =\",df_3.AllocCPUS.min(),\" \",\"max =\",df_3.AllocCPUS.max())" + "print(\"ReqMemCPU:\", \"min =\",cluster_3.ReqMemCPU.min(),\" \",\"max =\",cluster_3.ReqMemCPU.max())\n", + "print(\"Elapsed:\", \"min =\",cluster_3.Elapsed.min(),\" \",\"max =\",cluster_3.Elapsed.max())\n", + "print(\"AllocCPUS:\", \"min =\",cluster_3.AllocCPUS.min(),\" \",\"max =\",cluster_3.AllocCPUS.max())" ] }, { @@ -485,24 +484,24 @@ "# The groupby does not change the data, but it does make a small enough dataset\n", "\n", "# for purple cluster \n", - "df_0_2d1 = df_0.groupby(['ReqMemCPU','Elapsed']).sum().reset_index()\n", - "df_0_2d2 = df_0.groupby(['AllocCPUS','Elapsed']).sum().reset_index()\n", - "df_0_2d3 = df_0.groupby(['ReqMemCPU','AllocCPUS']).sum().reset_index()\n", + "df_0_2d1 = cluster_0.groupby(['ReqMemCPU','Elapsed']).sum().reset_index()\n", + "df_0_2d2 = cluster_0.groupby(['AllocCPUS','Elapsed']).sum().reset_index()\n", + "df_0_2d3 = cluster_0.groupby(['ReqMemCPU','AllocCPUS']).sum().reset_index()\n", "\n", "# for blue cluster\n", - "df_1_2d1 = df_1.groupby(['ReqMemCPU','Elapsed']).sum().reset_index()\n", - "df_1_2d2 = df_1.groupby(['AllocCPUS','Elapsed']).sum().reset_index()\n", - "df_1_2d3 = df_1.groupby(['ReqMemCPU','AllocCPUS']).sum().reset_index()\n", + "df_1_2d1 = cluster_1.groupby(['ReqMemCPU','Elapsed']).sum().reset_index()\n", + "df_1_2d2 = cluster_1.groupby(['AllocCPUS','Elapsed']).sum().reset_index()\n", + "df_1_2d3 = cluster_1.groupby(['ReqMemCPU','AllocCPUS']).sum().reset_index()\n", "\n", "# for yellow cluster\n", - "df_2_2d1 = df_2.groupby(['ReqMemCPU','Elapsed']).sum().reset_index()\n", - "df_2_2d2 = df_2.groupby(['AllocCPUS','Elapsed']).sum().reset_index()\n", - "df_2_2d3 = df_2.groupby(['ReqMemCPU','AllocCPUS']).sum().reset_index()\n", + "df_2_2d1 = cluster_2.groupby(['ReqMemCPU','Elapsed']).sum().reset_index()\n", + "df_2_2d2 = cluster_2.groupby(['AllocCPUS','Elapsed']).sum().reset_index()\n", + "df_2_2d3 = cluster_2.groupby(['ReqMemCPU','AllocCPUS']).sum().reset_index()\n", "\n", "# for red cluster\n", - "df_3_2d1 = df_3.groupby(['ReqMemCPU','Elapsed']).sum().reset_index()\n", - "df_3_2d2 = df_3.groupby(['AllocCPUS','Elapsed']).sum().reset_index()\n", - "df_3_2d3 = df_3.groupby(['ReqMemCPU','AllocCPUS']).sum().reset_index()" + "df_3_2d1 = cluster_3.groupby(['ReqMemCPU','Elapsed']).sum().reset_index()\n", + "df_3_2d2 = cluster_3.groupby(['AllocCPUS','Elapsed']).sum().reset_index()\n", + "df_3_2d3 = cluster_3.groupby(['ReqMemCPU','AllocCPUS']).sum().reset_index()" ] }, { @@ -514,14 +513,14 @@ "# Creating bins \n", "\n", "####Purple\n", - "purple_rqmem_min = np.min(df_0.ReqMemCPU.min())\n", - "purple_rqmem_max = np.max(df_0.ReqMemCPU.max())\n", + "purple_rqmem_min = np.min(cluster_0.ReqMemCPU.min())\n", + "purple_rqmem_max = np.max(cluster_0.ReqMemCPU.max())\n", " \n", - "purple_elapsed_min = np.min(df_0.Elapsed.min()) \n", - "purple_elapsed_max = np.max(df_0.Elapsed.max()) \n", + "purple_elapsed_min = np.min(cluster_0.Elapsed.min()) \n", + "purple_elapsed_max = np.max(cluster_0.Elapsed.max()) \n", "\n", - "purple_alloc_min = np.min(df_0.AllocCPUS.min()) \n", - "purple_alloc_max = np.max(df_0.AllocCPUS.max())\n", + "purple_alloc_min = np.min(cluster_0.AllocCPUS.min()) \n", + "purple_alloc_max = np.max(cluster_0.AllocCPUS.max())\n", " \n", " \n", "x_purple_rqmem_elapsed_bins = list(range(purple_rqmem_max))\n", @@ -535,14 +534,14 @@ "\n", "\n", "####Blue\n", - "blue_rqmem_min = np.min(df_1.ReqMemCPU.min())\n", - "blue_rqmem_max = np.max(df_1.ReqMemCPU.max())\n", + "blue_rqmem_min = np.min(cluster_1.ReqMemCPU.min())\n", + "blue_rqmem_max = np.max(cluster_1.ReqMemCPU.max())\n", " \n", - "blue_elapsed_min = np.min(df_1.Elapsed.min()) \n", - "blue_elapsed_max = np.max(df_1.Elapsed.max()) \n", + "blue_elapsed_min = np.min(cluster_1.Elapsed.min()) \n", + "blue_elapsed_max = np.max(cluster_1.Elapsed.max()) \n", "\n", - "blue_alloc_min = np.min(df_1.AllocCPUS.min()) \n", - "blue_alloc_max = np.max(df_1.AllocCPUS.max())\n", + "blue_alloc_min = np.min(cluster_1.AllocCPUS.min()) \n", + "blue_alloc_max = np.max(cluster_1.AllocCPUS.max())\n", " \n", " \n", "x_blue_rqmem_elapsed_bins = list(range(blue_rqmem_max))\n", @@ -555,14 +554,14 @@ "y_blue_reqmem_alloc_bins = list(range(int(blue_alloc_max))) \n", "\n", "####Yellow\n", - "yellow_rqmem_min = np.min(df_2.ReqMemCPU.min())\n", - "yellow_rqmem_max = np.max(df_2.ReqMemCPU.max())\n", + "yellow_rqmem_min = np.min(cluster_2.ReqMemCPU.min())\n", + "yellow_rqmem_max = np.max(cluster_2.ReqMemCPU.max())\n", " \n", - "yellow_elapsed_min = np.min(df_2.Elapsed.min()) \n", - "yellow_elapsed_max = np.max(df_2.Elapsed.max()) \n", + "yellow_elapsed_min = np.min(cluster_2.Elapsed.min()) \n", + "yellow_elapsed_max = np.max(cluster_2.Elapsed.max()) \n", "\n", - "yellow_alloc_min = np.min(df_2.AllocCPUS.min()) \n", - "yellow_alloc_max = np.max(df_2.AllocCPUS.max())\n", + "yellow_alloc_min = np.min(cluster_2.AllocCPUS.min()) \n", + "yellow_alloc_max = np.max(cluster_2.AllocCPUS.max())\n", " \n", " \n", "x_yellow_rqmem_elapsed_bins = list(range(yellow_rqmem_max))\n", @@ -576,14 +575,14 @@ "\n", "\n", "####Red\n", - "red_rqmem_min = np.min(df_3.ReqMemCPU.min())\n", - "red_rqmem_max = np.max(df_3.ReqMemCPU.max())\n", + "red_rqmem_min = np.min(cluster_3.ReqMemCPU.min())\n", + "red_rqmem_max = np.max(cluster_3.ReqMemCPU.max())\n", " \n", - "red_elapsed_min = np.min(df_3.Elapsed.min()) \n", - "red_elapsed_max = np.max(df_3.Elapsed.max()) \n", + "red_elapsed_min = np.min(cluster_3.Elapsed.min()) \n", + "red_elapsed_max = np.max(cluster_3.Elapsed.max()) \n", "\n", - "red_alloc_min = np.min(df_3.AllocCPUS.min()) \n", - "red_alloc_max = np.max(df_3.AllocCPUS.max())\n", + "red_alloc_min = np.min(cluster_3.AllocCPUS.min()) \n", + "red_alloc_max = np.max(cluster_3.AllocCPUS.max())\n", " \n", " \n", "x_red_rqmem_elapsed_bins = list(range(red_rqmem_max))\n", @@ -596,6 +595,50 @@ "y_red_reqmem_alloc_bins = list(range(red_alloc_max)) # list range gives one bin per cpu" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Summary Stats" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# number of purple cluster jobs and users\n", + "cluster_0_jobs = cluster_0.shape[0]\n", + "users_0 = cluster_0.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']]\n", + "users_0['user'] = pd.Series(df_1['User'])\n", + "cluster_0_users = users_0.drop_duplicates(subset=['user'])\n", + "\n", + "# number of green cluster jobs and users\n", + "cluster_1_jobs = cluster_1.shape[0]\n", + "users_1 = cluster_1.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']]\n", + "users_1['user'] = pd.Series(df_1['User'])\n", + "cluster_1_users = users_1.drop_duplicates(subset=['user'])\n", + "\n", + "# number of yellow cluster jobs and users\n", + "cluster_2_jobs = cluster_2.shape[0]\n", + "users_2 = cluster_2.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']]\n", + "users_2['user'] = pd.Series(df_1['User'])\n", + "cluster_2_users = users_2.drop_duplicates(subset=['user'])\n", + "\n", + "# number of red cluster jobs and users\n", + "cluster_3_jobs = cluster_3.shape[0]\n", + "users_3 = cluster_3.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']]\n", + "users_3['user'] = pd.Series(df_1['User'])\n", + "cluster_3_users = users_3.drop_duplicates(subset=['user'])\n", + "\n", + "\n", + "summary_stats = pd.DataFrame({'Job Count': [cluster_0_jobs, cluster_1_jobs, cluster_2_jobs, cluster_3_jobs],\n", + " 'User Count': [cluster_0_users.shape[0], cluster_1_users.shape[0], cluster_2_users.shape[0], cluster_3_users.shape[0]]},\n", + " index=['Purple Cluster','Blue Cluster', 'Yellow Cluster', 'Red Cluster'])\n", + "summary_stats.head()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -609,6 +652,8 @@ "metadata": {}, "outputs": [], "source": [ + "print(summary_stats)\n", + "\n", "figure = plt.figure()\n", "\n", "figure.set_size_inches(20,15)\n", @@ -863,6 +908,13 @@ "\n", "plt.show()\n" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {