From 50d5dac71ca3f62837837f10d9876aa5021a149a Mon Sep 17 00:00:00 2001 From: Niklas Halle Date: Mon, 23 Jun 2025 07:00:25 +0000 Subject: [PATCH] filter extreme outliers before plotting the boxplots --- batch_analysis_analysis.py | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/batch_analysis_analysis.py b/batch_analysis_analysis.py index 7bc42a6..ed194c8 100644 --- a/batch_analysis_analysis.py +++ b/batch_analysis_analysis.py @@ -44,11 +44,18 @@ def main(): for idx, (col, color) in enumerate(zip(['Mean', 'Std', 'Min', 'Max', 'Count'], colors)): ax = axs[idx] + # Prepare the data for the current column + current_plot_data = plot_data[col].dropna() + # Remove outliers for better visualization + filtered_plot_data = current_plot_data[current_plot_data.between(current_plot_data.quantile(.03), current_plot_data.quantile(0.97))] + + filtered_count = current_plot_data.count() - filtered_plot_data.count() + # Create boxplots - sns.boxplot(data=plot_data[col], ax=ax, color=color, showfliers=True, width=0.4) + sns.boxplot(data=filtered_plot_data, ax=ax, color=color, showfliers=False, width=0.4) # type: ignore # Add individual data points - sns.swarmplot(data=plot_data[col], ax=ax, color='black', size=3, alpha=0.6) + sns.swarmplot(data=filtered_plot_data, ax=ax, color='black', size=3, alpha=0.6) # type: ignore # Set labels and title ax.set_title(f'{col} Distribution', fontsize=14, fontweight='bold') @@ -56,13 +63,22 @@ def main(): ax.set_xlabel('') # No x-label needed ax.set_ylabel('Latency (ms)' if col != 'Count' else 'Count', fontsize=12) - # Calculate statistics of the statistics + # Calculate statistics of the statistics - here based on the original data with outliers! 
data_values = plot_data[col] + first_line_length = len(f"Mean: {data_values.mean():.2f}") + second_line_length = len(f"Std: {data_values.std():.2f}") + third_line_length = len(f"Min: {data_values.min():.2f}") + fourth_line_length = len(f"Max: {data_values.max():.2f}") + fivth_line_length = len(f"Filtered: {filtered_count}") + max_length = max(first_line_length, second_line_length, third_line_length, fourth_line_length, fivth_line_length) + 1 + # Prepare the text for the legend + stats_text = ( - f"Mean: {data_values.mean():.2f}\n" - f"Std: {data_values.std():.2f}\n" - f"Min: {data_values.min():.2f}\n" - f"Max: {data_values.max():.2f}" + f"Mean:{' ' * (max_length - first_line_length)}{data_values.mean():.2f}\n" + f"Std:{' ' * (max_length - second_line_length)}{data_values.std():.2f}\n" + f"Min:{' ' * (max_length - third_line_length)}{data_values.min():.2f}\n" + f"Max:{' ' * (max_length - fourth_line_length)}{data_values.max():.2f}\n" + f"Filtered:{' ' * (max_length - fivth_line_length)}{filtered_count}" ) # --- Place legend in the top right using axes fraction coordinates --- @@ -73,6 +89,7 @@ def main(): verticalalignment='top', horizontalalignment='right', fontsize=10, + fontfamily='monospace', bbox=dict(facecolor='white', alpha=0.9, boxstyle='round,pad=0.3', edgecolor='gray') ) @@ -86,7 +103,6 @@ def main(): ) # Save the figure with a filename that includes the chain name - plt.tight_layout() output_file = args.input.replace('.csv', f'_chain_{chain_name_fs}_analysis.png') plt.savefig(output_file, dpi=300) plt.close()