added count, beautified boxplot output

This commit is contained in:
Niklas Halle 2025-06-16 11:00:18 +00:00
parent b5b0f2f84b
commit 9bf91d654d
3 changed files with 71 additions and 43 deletions

View file

@ -1,19 +1,16 @@
import pandas as pd import pandas as pd
import numpy as np
import argparse import argparse
import seaborn as sns import seaborn as sns
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
def parse_arguments(): def parse_arguments():
parser = argparse.ArgumentParser(description='Analyze chain data from CSV file.') parser = argparse.ArgumentParser(description='Analyze chain data from CSV file.')
parser.add_argument('--input', '-i', required=True, help='Path to the input CSV file') parser.add_argument('--input', '-i', required=True, help='Path to the input CSV file')
return parser.parse_args() return parser.parse_args()
def main(): def main():
args = parse_arguments() args = parse_arguments()
# Load the CSV file from the input argument # Load the CSV file from the input argument
df = pd.read_csv(args.input) df = pd.read_csv(args.input)
@ -21,56 +18,85 @@ def main():
if 'experiment_name' not in df.columns: if 'experiment_name' not in df.columns:
raise ValueError("Input CSV must contain 'experiment_name' column.") raise ValueError("Input CSV must contain 'experiment_name' column.")
experiment_name = df['experiment_name'].iloc[0] experiment_name = df['experiment_name'].iloc[0]
# Strip timestamp from experiment_name if it exists # Strip timestamp from experiment_name if it exists
experiment_name = experiment_name.split('-')[0] if '-' in experiment_name else experiment_name experiment_name = experiment_name.split('-')[0] if '-' in experiment_name else experiment_name
# Group data by chain # Group data by chain
chain_groups = df.groupby('chain') chain_groups = df.groupby('chain')
# For each chain, create a plot with four boxplots (mean, std, min, max) # For each chain, create a figure with five subplots for boxplots (mean, std, min, max, count)
for chain_name, chain_data in chain_groups: for chain_name, chain_data in chain_groups:
# Create a figure for this chain fig, axs = plt.subplots(1, 5, figsize=(18, 6), constrained_layout=True)
plt.figure(figsize=(12, 8))
# Normalize chain name for filename # Normalize chain name for filename
chain_name_fs = str(chain_name).replace('--> /', '-').replace('/', '_').replace(' ', '') chain_name_fs = str(chain_name).replace('--> /', '-').replace('/', '_').replace(' ', '')
# Create a DataFrame with the columns we want to plot # Create a DataFrame with the columns we want to plot
plot_data = pd.DataFrame({ plot_data = chain_data[['mean', 'std', 'min', 'max', 'count']].copy()
'Mean': chain_data['mean'], plot_data.columns = ['Mean', 'Std', 'Min', 'Max', 'Count']
'Std': chain_data['std'],
'Min': chain_data['min'], # Make all plots have the same color palette
'Max': chain_data['max'] palette = sns.color_palette("husl", 4)
}) # Add a distinct color for the 'Count' plot, as it is a different metric
colors = palette + ['lightcoral']
# Create boxplots
ax = sns.boxplot(data=plot_data, palette='Set3') for idx, (col, color) in enumerate(zip(['Mean', 'Std', 'Min', 'Max', 'Count'], colors)):
ax = axs[idx]
# Add individual data points
sns.stripplot(data=plot_data, color='black', alpha=0.5, size=4, jitter=True) # Create boxplots
sns.boxplot(data=plot_data[col], ax=ax, color=color, showfliers=True, width=0.4)
# Set labels and title
plt.title(f'Statistics for Chain: {chain_name}\nAcross {len(chain_data)} Experiment Runs\n{experiment_name}', fontsize=14) # Add individual data points
plt.ylabel('Latency (ms)', fontsize=12) sns.swarmplot(data=plot_data[col], ax=ax, color='black', size=3, alpha=0.6)
plt.xlabel('Statistic Type', fontsize=12)
# Set labels and title
# Add grid for better readability ax.set_title(f'{col} Distribution', fontsize=14, fontweight='bold')
plt.grid(axis='y', linestyle='--', alpha=0.7) ax.set_xticks([]) # Remove x-ticks for clarity
ax.set_xlabel('') # No x-label needed
# Tighten layout and save the figure ax.set_ylabel('Latency (ms)' if col != 'Count' else 'Count', fontsize=12)
# Calculate statistics of the statistics
data_values = plot_data[col]
stats_text = (
f"Mean: {data_values.mean():.2f}\n"
f"Std: {data_values.std():.2f}\n"
f"Min: {data_values.min():.2f}\n"
f"Max: {data_values.max():.2f}"
)
# --- Place the statistics text box in the top right using axes fraction coordinates ---
ax.text(
0.95, 0.98, # axes fraction: 95% right, 98% up
stats_text,
transform=ax.transAxes,
verticalalignment='top',
horizontalalignment='right',
fontsize=10,
bbox=dict(facecolor='white', alpha=0.9, boxstyle='round,pad=0.3', edgecolor='gray')
)
# Add grid for better readability
ax.grid(axis='y', linestyle='--', alpha=0.4)
# Set the overall title for the figure
plt.suptitle(
f'Statistics for Chain: {chain_name}\nAcross {len(chain_data)} Experiment Runs - {experiment_name}',
fontsize=18, fontweight='bold'
)
# Save the figure with a filename that includes the chain name
plt.tight_layout() plt.tight_layout()
output_file = args.input.replace('.csv', f'_chain_{chain_name_fs}_analysis.png') output_file = args.input.replace('.csv', f'_chain_{chain_name_fs}_analysis.png')
plt.savefig(output_file, dpi=300) plt.savefig(output_file, dpi=300)
plt.close() plt.close()
# Also calculate and print summary statistics for this chain # Print summary statistics for the chain
summary = chain_data.describe() summary = chain_data.describe()
print(f"\nSummary for chain: {chain_name}") print(f"\nSummary for chain: {chain_name}")
print(summary[['mean', 'std', 'min', 'max']]) print(summary[['mean', 'std', 'min', 'max', 'count']])
print(f"\nAnalysis complete. Plots saved with base name: {args.input.replace('.csv', '_chain_*_analysis.png')}")
print(f"\nAnalysis complete. Plots saved with base name: {args.input.replace('.csv', '_chain_*_analysis.png')}")
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View file

@ -51,7 +51,7 @@ def main(base_dir, name_filter):
pm.execute_notebook( pm.execute_notebook(
"./trace-analysis.ipynb", "./trace-analysis.ipynb",
os.path.join(current_artifact, "output", "trace-analysis.ipynb"), os.path.join(current_artifact, "output", "trace-analysis.ipynb"),
log_output=True log_output=False
) )
except Exception as e: except Exception as e:
LOGGER.exception(e) LOGGER.exception(e)

View file

@ -683,6 +683,7 @@
" std_latency = np.std(e2e_latencies)\n", " std_latency = np.std(e2e_latencies)\n",
" min_latency = np.min(e2e_latencies)\n", " min_latency = np.min(e2e_latencies)\n",
" max_latency = np.max(e2e_latencies)\n", " max_latency = np.max(e2e_latencies)\n",
" count_latencies = len(e2e_latencies)\n",
" ax.axvline(mean_latency, c=\"red\", linewidth=2)\n", " ax.axvline(mean_latency, c=\"red\", linewidth=2)\n",
" _, max_ylim = ax.get_ylim()\n", " _, max_ylim = ax.get_ylim()\n",
" # Create a multi-line string with all stats\n", " # Create a multi-line string with all stats\n",
@ -690,7 +691,8 @@
" f\"Mean: {mean_latency:.2f} ms\\n\"\n", " f\"Mean: {mean_latency:.2f} ms\\n\"\n",
" f\"Std: {std_latency:.2f} ms\\n\"\n", " f\"Std: {std_latency:.2f} ms\\n\"\n",
" f\"Min: {min_latency:.2f} ms\\n\"\n", " f\"Min: {min_latency:.2f} ms\\n\"\n",
" f\"Max: {max_latency:.2f} ms\"\n", " f\"Max: {max_latency:.2f} ms\\n\"\n",
" f\"Count: {count_latencies}\"\n",
" )\n", " )\n",
" # Place text near top right of plot\n", " # Place text near top right of plot\n",
" ax.text(\n", " ax.text(\n",
@ -703,10 +705,10 @@
" bbox=dict(facecolor='white', alpha=0.7, boxstyle='round,pad=0.3')\n", " bbox=dict(facecolor='white', alpha=0.7, boxstyle='round,pad=0.3')\n",
" )\n", " )\n",
" plt.savefig(os.path.join(OUT_PATH, f\"plot_e2es_{name}.png\"))\n", " plt.savefig(os.path.join(OUT_PATH, f\"plot_e2es_{name}.png\"))\n",
" result_strings.append(f\"Chain {topics[0]} --> {topics[-1]} E2E stats: Mean: {mean_latency:.2f} ms, Std: {std_latency:.2f} ms, Min: {min_latency:.2f} ms, Max: {max_latency:.2f} ms\")\n", " result_strings.append(f\"Chain {topics[0]} --> {topics[-1]} E2E stats: Mean: {mean_latency:.2f} ms, Std: {std_latency:.2f} ms, Min: {min_latency:.2f} ms, Max: {max_latency:.2f} ms, Count: {count_latencies}\")\n",
" # also do it as csv of order: exepriment_name, chain, mean, std, min, max\n", " # also do it as csv of order: exepriment_name, chain, mean, std, min, max\n",
" result_strings_csv.append(\n", " result_strings_csv.append(\n",
" f\"{EXPERIMENT_NAME},{topics[0]} --> {topics[-1]},{mean_latency:.2f},{std_latency:.2f},{min_latency:.2f},{max_latency:.2f}\"\n", " f\"{EXPERIMENT_NAME},{topics[0]} --> {topics[-1]},{mean_latency:.2f},{std_latency:.2f},{min_latency:.2f},{max_latency:.2f},{count_latencies}\"\n",
" )\n", " )\n",
"\n", "\n",
" ##################################################\n", " ##################################################\n",