From a24aeeffe7601b39479a100edfea346469ddecc6 Mon Sep 17 00:00:00 2001
From: Niklas Halle
Date: Tue, 5 Aug 2025 10:54:44 +0000
Subject: [PATCH] added a bunch of helper scripts

---
Review notes (free-form area, ignored when the patch is applied):
 * run_batch_analyze.sh: the grep-based type extraction is guarded with
   "|| true"; under "set -euo pipefail" a non-matching directory name
   aborted the script before the warning/continue branch could run.
 * run_batch_analysis_analysis.sh: batch_analysis_analysis.py stays at mode
   100644 in this patch, so it is invoked through python3 instead of being
   executed directly.
 * batch_analysis_analysis.py: the summary path is derived by rewriting only
   a trailing ".csv", so an input without that suffix is never overwritten.
 * csv2table.py: underscores in chain names are escaped for LaTeX text mode.
 * csvfix.sh: the usage line names its argument.
 * Every fix is a one-for-one line replacement: hunk sizes and the diffstat
   are unchanged. The blob ids on the "index" lines still name the blobs of
   the original commit.

 add_csv_header.sh              | 30 +++++
 batch_analysis_analysis.py     | 20 +++
 csv2table.py                   | 214 +++++++++++++++++++++++++++++++++
 csvfix.sh                      | 29 +++++
 run_batch_analysis_analysis.sh | 23 ++++
 run_batch_analyze.sh           | 31 +++++
 6 files changed, 347 insertions(+)
 create mode 100755 add_csv_header.sh
 create mode 100755 csv2table.py
 create mode 100755 csvfix.sh
 create mode 100755 run_batch_analysis_analysis.sh
 create mode 100755 run_batch_analyze.sh

diff --git a/add_csv_header.sh b/add_csv_header.sh
new file mode 100755
index 0000000..0460927
--- /dev/null
+++ b/add_csv_header.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+
+# Usage: ./add_csv_header.sh /path/to/trace_root
+
+set -euo pipefail
+
+if [[ $# -ne 1 ]]; then
+    echo "Usage: $0 /path/to/trace_root"
+    exit 1
+fi
+
+TRACE_ROOT="$1"
+HEADER="experiment_name,chain,mean,std,min,max,count"
+
+if [[ ! -d "$TRACE_ROOT" ]]; then
+    echo "Error: '$TRACE_ROOT' is not a directory."
+    exit 1
+fi
+
+# Find all results.csv files one level below the trace root
+find "$TRACE_ROOT" -mindepth 2 -maxdepth 2 -type f -name results.csv | while IFS= read -r csvfile; do
+    # Insert header only if not already present
+    first_line=$(head -n 1 "$csvfile")
+    if [[ "$first_line" != "$HEADER" ]]; then
+        echo "Adding header to $csvfile"
+        sed -i "1i$HEADER" "$csvfile"
+    else
+        echo "Header already present in $csvfile, skipping."
+    fi
+done
diff --git a/batch_analysis_analysis.py b/batch_analysis_analysis.py
index ff48b38..543f7cf 100644
--- a/batch_analysis_analysis.py
+++ b/batch_analysis_analysis.py
@@ -27,6 +27,9 @@ def main():
     # Group data by chain
     chain_groups = df.groupby('chain')
 
+    # Prepare list to collect summary data for CSV export
+    summary_data = []
+
     # For each chain, create a figure with five subplots for boxplots (mean, std, min, max, count)
     for chain_name, chain_data in chain_groups:
         fig, axs = plt.subplots(1, 5, figsize=(18, 6), constrained_layout=True)
@@ -38,6 +41,17 @@ def main():
         plot_data = chain_data[['mean', 'std', 'min', 'max', 'count']].copy()
         plot_data.columns = ['Mean', 'Std', 'Min', 'Max', 'Count']
 
+        # Calculate summary statistics for CSV export
+        chain_summary = {
+            'chain': chain_name,
+            'mean_count': plot_data['Count'].mean(),
+            'mean_mean': plot_data['Mean'].mean(),
+            'mean_std': plot_data['Std'].mean(),
+            'mean_min': plot_data['Min'].mean(),
+            'mean_max': plot_data['Max'].mean()
+        }
+        summary_data.append(chain_summary)
+
         # Make all plots have the same color palette
         palette = sns.color_palette("husl", 4)
         # Add a distinct color for the 'Count' plot, as it is a different metric
@@ -114,7 +128,13 @@ def main():
         print(f"\nSummary for chain: {chain_name}")
         print(summary[['mean', 'std', 'min', 'max', 'count']])
 
+    # Create and save the summary CSV (only a trailing '.csv' is rewritten, so the input file is never overwritten)
+    summary_df = pd.DataFrame(summary_data)
+    summary_csv_file = (args.input[:-4] if args.input.endswith('.csv') else args.input) + '_summary.csv'
+    summary_df.to_csv(summary_csv_file, index=False)
+
     print(f"\nAnalysis complete. Plots saved with base name: {args.input.replace('.csv', '_chain_*_analysis.png')}")
+    print(f"Summary CSV saved as: {summary_csv_file}")
 
 if __name__ == "__main__":
     main()
\ No newline at end of file
diff --git a/csv2table.py b/csv2table.py
new file mode 100755
index 0000000..2b139ea
--- /dev/null
+++ b/csv2table.py
@@ -0,0 +1,214 @@
+#!/usr/bin/env python3
+
+import pandas as pd
+import numpy as np
+
+def csv_to_latex_table(csv_file_path, output_file_path=None, decimal_places=2):
+    """
+    Convert a CSV file to a LaTeX table with proper formatting.
+
+    Args:
+        csv_file_path (str): Path to the input CSV file
+        output_file_path (str, optional): Path to save the LaTeX output
+        decimal_places (int): Number of decimal places for numeric values
+
+    Returns:
+        str: LaTeX table code
+    """
+
+    # Read the CSV file
+    df = pd.read_csv(csv_file_path)
+
+    # Create a more readable version of the chain column ('_' is escaped so LaTeX text mode accepts it)
+    df['chain'] = df['chain'].str.replace('/input/', '').str.replace('/output/', '→ ')
+    df['chain'] = df['chain'].str.replace('/', ' ', regex=False).str.replace('_', '\\_', regex=False)
+
+    # Round numeric columns to specified decimal places
+    numeric_columns = df.select_dtypes(include=[np.number]).columns
+    df[numeric_columns] = df[numeric_columns].round(decimal_places)
+
+    # Start building the LaTeX table
+    latex_code = []
+
+    # Document setup (optional - can be removed if embedding in existing document)
+    latex_code.append("\\documentclass{article}")
+    latex_code.append("\\usepackage{booktabs}")
+    latex_code.append("\\usepackage{array}")
+    latex_code.append("\\usepackage{longtable}")
+    latex_code.append("\\begin{document}")
+    latex_code.append("")
+
+    # Table setup
+    num_cols = len(df.columns)
+    col_spec = "l" + "r" * (num_cols - 1)  # Left align first column, right align others
+
+    latex_code.append("\\begin{longtable}{" + col_spec + "}")
+    latex_code.append("\\toprule")
+
+    # Create header
+    headers = []
+    for col in df.columns:
+        if col == 'chain':
+            headers.append("Chain")
+        else:
+            # Convert column names to more readable format
+            readable_name = col.replace('_', ' ').title()
+            headers.append(readable_name)
+
+    latex_code.append(" & ".join(headers) + " \\\\")
+    latex_code.append("\\midrule")
+    latex_code.append("\\endfirsthead")
+    latex_code.append("")
+
+    # Header for continuation pages
+    latex_code.append("\\multicolumn{" + str(num_cols) + "}{c}")
+    latex_code.append("{\\tablename\\ \\thetable{} -- continued from previous page} \\\\")
+    latex_code.append("\\toprule")
+    latex_code.append(" & ".join(headers) + " \\\\")
+    latex_code.append("\\midrule")
+    latex_code.append("\\endhead")
+    latex_code.append("")
+
+    # Footer for non-final pages
+    latex_code.append("\\midrule")
+    latex_code.append("\\multicolumn{" + str(num_cols) + "}{r}{Continued on next page} \\\\")
+    latex_code.append("\\endfoot")
+    latex_code.append("")
+
+    # Final footer
+    latex_code.append("\\bottomrule")
+    latex_code.append("\\endlastfoot")
+    latex_code.append("")
+
+    # Add data rows
+    for _, row in df.iterrows():
+        row_data = []
+        for i, value in enumerate(row):
+            if i == 0:  # Chain column - use texttt for monospace
+                # Split long chains for better formatting
+                chain_parts = str(value).split('→')
+                if len(chain_parts) == 2:
+                    formatted_chain = f"\\texttt{{{chain_parts[0].strip()}}} → \\texttt{{{chain_parts[1].strip()}}}"
+                else:
+                    formatted_chain = f"\\texttt{{{str(value)}}}"
+                row_data.append(formatted_chain)
+            else:
+                row_data.append(str(value))
+
+        latex_code.append(" & ".join(row_data) + " \\\\")
+
+    latex_code.append("\\end{longtable}")
+    latex_code.append("")
+    latex_code.append("\\end{document}")
+
+    # Join all lines
+    latex_output = "\n".join(latex_code)
+
+    # Save to file if path is provided
+    if output_file_path:
+        with open(output_file_path, 'w', encoding='utf-8') as f:
+            f.write(latex_output)
+        print(f"LaTeX table saved to {output_file_path}")
+
+    return latex_output
+
+def csv_to_latex_table_simple(csv_file_path, decimal_places=2):
+    """
+    Convert CSV to LaTeX table without document wrapper (for embedding).
+
+    Args:
+        csv_file_path (str): Path to the input CSV file
+        decimal_places (int): Number of decimal places for numeric values
+
+    Returns:
+        str: LaTeX table code only
+    """
+
+    # Read the CSV file
+    df = pd.read_csv(csv_file_path)
+
+    # Create a more readable version of the chain column ('_' is escaped so LaTeX text mode accepts it)
+    df['chain'] = df['chain'].str.replace('/input/', '').str.replace('/output/', ' → ')
+    df['chain'] = df['chain'].str.replace('/', ' ', regex=False).str.replace('_', '\\_', regex=False)
+
+    # Round numeric columns
+    numeric_columns = df.select_dtypes(include=[np.number]).columns
+    df[numeric_columns] = df[numeric_columns].round(decimal_places)
+
+    # Build table
+    latex_code = []
+    num_cols = len(df.columns)
+    col_spec = "l" + "r" * (num_cols - 1)
+
+    latex_code.append("\\begin{tabular}{" + col_spec + "}")
+    latex_code.append(" \\toprule")
+
+    # Headers
+    headers = []
+    for col in df.columns:
+        if col == 'chain':
+            headers.append("Chain")
+        else:
+            readable_name = col.replace('_', ' ').title()
+            headers.append(readable_name)
+
+    latex_code.append(" & ".join(headers) + " \\\\")
+    latex_code.append("\\midrule")
+
+    # Data rows
+    for _, row in df.iterrows():
+        row_data = []
+        for i, value in enumerate(row):
+            if i == 0:  # Chain column
+                chain_parts = str(value).split(' → ')
+                if len(chain_parts) == 2:
+                    formatted_chain = f"\\texttt{{{chain_parts[0].strip()}}} → \\texttt{{{chain_parts[1].strip()}}}"
+                else:
+                    formatted_chain = f"\\texttt{{{str(value)}}}"
+                row_data.append(formatted_chain)
+            else:
+                row_data.append(str(value))
+
+        latex_code.append(" & ".join(row_data) + " \\\\")
+
+    latex_code.append("\\bottomrule")
+    latex_code.append("\\end{tabular}")
+
+    return "\n".join(latex_code)
+
+if __name__ == "__main__":
+    import argparse
+    import sys
+
+    parser = argparse.ArgumentParser(description='Convert CSV file to LaTeX table')
+    parser.add_argument('csv_file', help='Path to the input CSV file')
+    parser.add_argument('-o', '--output', help='Output LaTeX file path (optional)')
+    parser.add_argument('-d', '--decimals', type=int, default=2,
+                        help='Number of decimal places for numeric values (default: 2)')
+    parser.add_argument('-s', '--simple', action='store_true',
+                        help='Generate simple table only (no document wrapper)')
+
+    args = parser.parse_args()
+
+    try:
+        if args.simple:
+            # Generate simple table for embedding
+            latex_output = csv_to_latex_table_simple(args.csv_file, args.decimals)
+            print(latex_output)
+
+            if args.output:
+                with open(args.output, 'w', encoding='utf-8') as f:
+                    f.write(latex_output)
+                print(f"\nSimple LaTeX table saved to {args.output}", file=sys.stderr)
+        else:
+            # Generate complete LaTeX document
+            latex_output = csv_to_latex_table(args.csv_file, args.output, args.decimals)
+            if not args.output:
+                print(latex_output)
+
+    except FileNotFoundError:
+        print(f"Error: CSV file '{args.csv_file}' not found.", file=sys.stderr)
+        sys.exit(1)
+    except Exception as e:
+        print(f"Error: {str(e)}", file=sys.stderr)
+        sys.exit(1)
\ No newline at end of file
diff --git a/csvfix.sh b/csvfix.sh
new file mode 100755
index 0000000..78b2679
--- /dev/null
+++ b/csvfix.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+# Check if filename parameter is provided
+if [ $# -eq 0 ]; then
+    echo "Usage: $0 <csv_file>"
+    echo "Example: $0 data.csv"
+    exit 1
+fi
+
+CSV_FILE="$1"
+
+# Check if file exists
+if [ ! -f "$CSV_FILE" ]; then
+    echo "Error: File '$CSV_FILE' not found!"
+    exit 1
+fi
+
+echo "Processing file: $CSV_FILE"
+echo "========================================"
+
+# Count prefixes from first column
+echo -e "\n=== Total number of unique prefixes ==="
+cut -d',' -f1 "$CSV_FILE" | sort | uniq | wc -l
+
+# Readable output format
+echo "=== Formatted output ==="
+echo "Prefix -> Count"
+echo "---------------"
+cut -d',' -f1 "$CSV_FILE" | sort | uniq -c | sort -nr | awk '{printf "%-20s -> %d\n", $2, $1}'
diff --git a/run_batch_analysis_analysis.sh b/run_batch_analysis_analysis.sh
new file mode 100755
index 0000000..1ebb4d3
--- /dev/null
+++ b/run_batch_analysis_analysis.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+
+# Usage: ./run_batch_analysis_analysis.sh /path/to/target_dir
+
+set -euo pipefail
+
+if [[ $# -ne 1 ]]; then
+    echo "Usage: $0 /path/to/target_dir"
+    exit 1
+fi
+
+TARGET_DIR="$1"
+
+if [[ ! -d "$TARGET_DIR" ]]; then
+    echo "Error: '$TARGET_DIR' is not a directory."
+    exit 1
+fi
+
+# Find all results.csv files directly under subdirectories
+find "$TARGET_DIR" -mindepth 2 -maxdepth 2 -type f -name results.csv | while IFS= read -r csvfile; do
+    echo "Analyzing $csvfile"
+    python3 ./batch_analysis_analysis.py -i "$csvfile"
+done
diff --git a/run_batch_analyze.sh b/run_batch_analyze.sh
new file mode 100755
index 0000000..6dbc4d0
--- /dev/null
+++ b/run_batch_analyze.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+
+# Usage: ./run_batch_analyze.sh /path/to/trace_root
+
+set -euo pipefail
+
+if [[ $# -ne 1 ]]; then
+    echo "Usage: $0 /path/to/trace_root"
+    exit 1
+fi
+
+TRACE_ROOT="$1"
+
+if [[ ! -d "$TRACE_ROOT" ]]; then
+    echo "Error: '$TRACE_ROOT' is not a directory."
+    exit 1
+fi
+
+for dir in "$TRACE_ROOT"/*; do
+    if [[ -d "$dir" ]]; then
+        dirname=$(basename "$dir")
+        # Extract everything before first underscore as type ("|| true": a non-match must not trip set -e/pipefail)
+        type=$(echo "$dirname" | grep -oP '^[a-z]+(?=_)' || true)
+        if [[ -z "$type" ]]; then
+            echo "Warning: Could not extract type from '$dirname', skipping."
+            continue
+        fi
+        echo "Running batch_analyze.py on $dir with filter ${type}*"
+        ./batch_analyze.py -d "$dir" -f "${type}*"
+    fi
+done