From a24aeeffe7601b39479a100edfea346469ddecc6 Mon Sep 17 00:00:00 2001
From: Niklas Halle <niklas@niklashalle.net>
Date: Tue, 5 Aug 2025 10:54:44 +0000
Subject: [PATCH] added a bunch of helper scripts

---
 add_csv_header.sh              |  30 +++++
 batch_analysis_analysis.py     |  20 +++
 csv2table.py                   | 214 +++++++++++++++++++++++++++++++++
 csvfix.sh                      |  29 +++++
 run_batch_analysis_analysis.sh |  23 ++++
 run_batch_analyze.sh           |  31 +++++
 6 files changed, 347 insertions(+)
 create mode 100755 add_csv_header.sh
 create mode 100755 csv2table.py
 create mode 100755 csvfix.sh
 create mode 100755 run_batch_analysis_analysis.sh
 create mode 100755 run_batch_analyze.sh

diff --git a/add_csv_header.sh b/add_csv_header.sh
new file mode 100755
index 0000000..0460927
--- /dev/null
+++ b/add_csv_header.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+#
+# Prepend the standard results header to every results.csv found exactly one
+# directory below the trace root, unless the file already starts with it.
+#
+# Usage: ./add_csv_header.sh /path/to/trace_root
+
+set -euo pipefail
+
+if [[ $# -ne 1 ]]; then
+  echo "Usage: $0 /path/to/trace_root" >&2
+  exit 1
+fi
+
+TRACE_ROOT="$1"
+HEADER="experiment_name,chain,mean,std,min,max,count"
+
+if [[ ! -d "$TRACE_ROOT" ]]; then
+  echo "Error: '$TRACE_ROOT' is not a directory." >&2
+  exit 1
+fi
+
+# NUL-delimited so paths with unusual characters are passed through intact.
+find "$TRACE_ROOT" -mindepth 2 -maxdepth 2 -type f -name results.csv -print0 |
+  while IFS= read -r -d '' csvfile; do
+    first_line=$(head -n 1 "$csvfile")
+    # Tolerate CRLF line endings when comparing against the header.
+    first_line="${first_line%$'\r'}"
+
+    if [[ "$first_line" == "$HEADER" ]]; then
+      echo "Header already present in $csvfile, skipping."
+    elif [[ ! -s "$csvfile" ]]; then
+      # sed's "1i" never fires on an empty file (there is no line 1), so the
+      # header has to be written directly.
+      echo "Adding header to $csvfile"
+      printf '%s\n' "$HEADER" > "$csvfile"
+    else
+      echo "Adding header to $csvfile"
+      sed -i "1i$HEADER" "$csvfile"
+    fi
+  done
diff --git a/batch_analysis_analysis.py b/batch_analysis_analysis.py
index ff48b38..543f7cf 100644
--- a/batch_analysis_analysis.py
+++ b/batch_analysis_analysis.py
@@ -27,6 +27,9 @@ def main():
     # Group data by chain
     chain_groups = df.groupby('chain')
 
+    # Prepare list to collect summary data for CSV export
+    summary_data = []
+
     # For each chain, create a figure with five subplots for boxplots (mean, std, min, max, count)
     for chain_name, chain_data in chain_groups:
         fig, axs = plt.subplots(1, 5, figsize=(18, 6), constrained_layout=True)
@@ -38,6 +41,17 @@ def main():
         plot_data = chain_data[['mean', 'std', 'min', 'max', 'count']].copy()
         plot_data.columns = ['Mean', 'Std', 'Min', 'Max', 'Count']
 
+        # Calculate summary statistics for CSV export
+        chain_summary = {
+            'chain': chain_name,
+            'mean_count': plot_data['Count'].mean(),
+            'mean_mean': plot_data['Mean'].mean(),
+            'mean_std': plot_data['Std'].mean(),
+            'mean_min': plot_data['Min'].mean(),
+            'mean_max': plot_data['Max'].mean()
+        }
+        summary_data.append(chain_summary)
+
         # Make all plots have the same color palette
         palette = sns.color_palette("husl", 4)
         # Add a distinct color for the 'Count' plot, as it is a different metric
@@ -114,7 +128,13 @@ def main():
         print(f"\nSummary for chain: {chain_name}")
         print(summary[['mean', 'std', 'min', 'max', 'count']])
 
+    # Create and save the summary CSV
+    summary_df = pd.DataFrame(summary_data)
+    summary_csv_file = args.input.replace('.csv', '_summary.csv')
+    summary_df.to_csv(summary_csv_file, index=False)
+
     print(f"\nAnalysis complete. Plots saved with base name: {args.input.replace('.csv', '_chain_*_analysis.png')}")
+    print(f"Summary CSV saved as: {summary_csv_file}")
 
 if __name__ == "__main__":
     main()
\ No newline at end of file
diff --git a/csv2table.py b/csv2table.py
new file mode 100755
index 0000000..2b139ea
--- /dev/null
+++ b/csv2table.py
@@ -0,0 +1,209 @@
+#!/usr/bin/env python3
+"""Convert a per-chain statistics CSV into a LaTeX table.
+
+The input must contain a ``chain`` column. Numeric columns are rounded, all
+cell text is escaped for LaTeX, and the chain column is typeset in monospace.
+"""
+
+import argparse
+import sys
+
+import numpy as np
+import pandas as pd
+
+# Marker placed between the two halves of a chain while it is still plain text.
+_CHAIN_SEPARATOR = "→"
+# What the marker becomes in the LaTeX output. The raw Unicode arrow is not
+# defined by pdflatex's default UTF-8 support, the math macro works everywhere.
+_LATEX_ARROW = "$\\rightarrow$"
+
+# Characters with a special meaning in LaTeX text mode and their escapes.
+_LATEX_ESCAPES = str.maketrans({
+    "\\": "\\textbackslash{}",
+    "&": "\\&",
+    "%": "\\%",
+    "$": "\\$",
+    "#": "\\#",
+    "_": "\\_",
+    "{": "\\{",
+    "}": "\\}",
+    "~": "\\textasciitilde{}",
+    "^": "\\textasciicircum{}",
+})
+
+
+def _escape_latex(text):
+    """Return *text* as a string with LaTeX special characters escaped."""
+    return str(text).translate(_LATEX_ESCAPES)
+
+
+def _load_table(csv_file_path, decimal_places):
+    """Read the CSV, shorten the chain names and round numeric columns."""
+    df = pd.read_csv(csv_file_path)
+
+    # regex=False keeps the patterns literal on every pandas version.
+    chain = df["chain"].str.replace("/input/", "", regex=False)
+    chain = chain.str.replace("/output/", f" {_CHAIN_SEPARATOR} ", regex=False)
+    df["chain"] = chain.str.replace("/", " ", regex=False)
+
+    numeric_columns = df.select_dtypes(include=[np.number]).columns
+    df[numeric_columns] = df[numeric_columns].round(decimal_places)
+    return df
+
+
+def _header_cells(df):
+    """Return the escaped, human-readable title of every column."""
+    return [
+        "Chain" if col == "chain"
+        else _escape_latex(str(col).replace("_", " ").title())
+        for col in df.columns
+    ]
+
+
+def _format_chain(value):
+    """Typeset each part of a chain in monospace, joined by a LaTeX arrow."""
+    parts = (part.strip() for part in str(value).split(_CHAIN_SEPARATOR))
+    return f" {_LATEX_ARROW} ".join(
+        f"\\texttt{{{_escape_latex(part)}}}" for part in parts
+    )
+
+
+def _body_rows(df):
+    """Return one LaTeX table line per data row."""
+    lines = []
+    for _, row in df.iterrows():
+        # The chain column is found by name, so it need not come first.
+        cells = [
+            _format_chain(value) if col == "chain" else _escape_latex(value)
+            for col, value in row.items()
+        ]
+        lines.append(" & ".join(cells) + " \\\\")
+    return lines
+
+
+def csv_to_latex_table(csv_file_path, output_file_path=None, decimal_places=2):
+    """
+    Convert a CSV file to a standalone LaTeX document holding a longtable.
+
+    Args:
+        csv_file_path (str): Path to the input CSV file
+        output_file_path (str, optional): Path to save the LaTeX output
+        decimal_places (int): Number of decimal places for numeric values
+
+    Returns:
+        str: LaTeX document code
+    """
+    df = _load_table(csv_file_path, decimal_places)
+    num_cols = len(df.columns)
+    header_line = " & ".join(_header_cells(df)) + " \\\\"
+
+    latex_code = [
+        # Document setup (drop it when embedding into an existing document)
+        "\\documentclass{article}",
+        "\\usepackage{booktabs}",
+        "\\usepackage{array}",
+        "\\usepackage{longtable}",
+        "\\begin{document}",
+        "",
+        # Left align first column, right align others
+        "\\begin{longtable}{l" + "r" * (num_cols - 1) + "}",
+        # Header on the first page
+        "\\toprule",
+        header_line,
+        "\\midrule",
+        "\\endfirsthead",
+        "",
+        # Header for continuation pages
+        "\\multicolumn{" + str(num_cols) + "}{c}",
+        "{\\tablename\\ \\thetable{} -- continued from previous page} \\\\",
+        "\\toprule",
+        header_line,
+        "\\midrule",
+        "\\endhead",
+        "",
+        # Footer for non-final pages
+        "\\midrule",
+        "\\multicolumn{" + str(num_cols) + "}{r}{Continued on next page} \\\\",
+        "\\endfoot",
+        "",
+        # Final footer
+        "\\bottomrule",
+        "\\endlastfoot",
+        "",
+        *_body_rows(df),
+        "\\end{longtable}",
+        "",
+        "\\end{document}",
+    ]
+    latex_output = "\n".join(latex_code)
+
+    # Save to file if path is provided
+    if output_file_path:
+        with open(output_file_path, "w", encoding="utf-8") as f:
+            f.write(latex_output)
+        print(f"LaTeX table saved to {output_file_path}")
+
+    return latex_output
+
+
+def csv_to_latex_table_simple(csv_file_path, decimal_places=2):
+    """
+    Convert CSV to LaTeX table without document wrapper (for embedding).
+
+    Args:
+        csv_file_path (str): Path to the input CSV file
+        decimal_places (int): Number of decimal places for numeric values
+
+    Returns:
+        str: LaTeX table code only
+    """
+    df = _load_table(csv_file_path, decimal_places)
+
+    latex_code = [
+        "\\begin{tabular}{l" + "r" * (len(df.columns) - 1) + "}",
+        "    \\toprule",
+        " & ".join(_header_cells(df)) + " \\\\",
+        "\\midrule",
+        *_body_rows(df),
+        "\\bottomrule",
+        "\\end{tabular}",
+    ]
+    return "\n".join(latex_code)
+
+
+def main():
+    """Command-line entry point."""
+    parser = argparse.ArgumentParser(description='Convert CSV file to LaTeX table')
+    parser.add_argument('csv_file', help='Path to the input CSV file')
+    parser.add_argument('-o', '--output', help='Output LaTeX file path (optional)')
+    parser.add_argument('-d', '--decimals', type=int, default=2,
+                        help='Number of decimal places for numeric values (default: 2)')
+    parser.add_argument('-s', '--simple', action='store_true',
+                        help='Generate simple table only (no document wrapper)')
+    args = parser.parse_args()
+
+    try:
+        if args.simple:
+            # Table only, for embedding; always echoed to stdout
+            latex_output = csv_to_latex_table_simple(args.csv_file, args.decimals)
+            print(latex_output)
+
+            if args.output:
+                with open(args.output, 'w', encoding='utf-8') as f:
+                    f.write(latex_output)
+                print(f"\nSimple LaTeX table saved to {args.output}", file=sys.stderr)
+        else:
+            # Complete document; echoed only when no output file is given
+            latex_output = csv_to_latex_table(args.csv_file, args.output, args.decimals)
+            if not args.output:
+                print(latex_output)
+    except FileNotFoundError:
+        print(f"Error: CSV file '{args.csv_file}' not found.", file=sys.stderr)
+        sys.exit(1)
+    except Exception as e:
+        print(f"Error: {str(e)}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/csvfix.sh b/csvfix.sh
new file mode 100755
index 0000000..78b2679
--- /dev/null
+++ b/csvfix.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+#
+# Report how often each value of the first CSV column ("prefix") occurs.
+#
+# Usage: ./csvfix.sh <csv_filename>
+#
+# NOTE: the first column is extracted with `cut`, so every line (including a
+# header row, if any) is counted and quoted fields containing commas are not
+# understood.
+
+set -euo pipefail
+
+# Check if filename parameter is provided
+if [[ $# -eq 0 ]]; then
+    echo "Usage: $0 <csv_filename>" >&2
+    echo "Example: $0 data.csv" >&2
+    exit 1
+fi
+
+CSV_FILE="$1"
+
+# Check if file exists
+if [[ ! -f "$CSV_FILE" ]]; then
+    echo "Error: File '$CSV_FILE' not found!" >&2
+    exit 1
+fi
+
+echo "Processing file: $CSV_FILE"
+echo "========================================"
+
+# Count distinct prefixes in the first column
+printf '\n=== Total number of unique prefixes ===\n'
+cut -d',' -f1 "$CSV_FILE" | sort -u | wc -l
+
+# Readable output format, most frequent prefix first
+echo "=== Formatted output ==="
+echo "Prefix -> Count"
+echo "---------------"
+cut -d',' -f1 "$CSV_FILE" | sort | uniq -c | sort -nr |
+    awk '{
+        count = $1
+        # Strip the count that `uniq -c` prepends and keep the rest of the
+        # line verbatim, so prefixes containing whitespace are not truncated.
+        sub(/^ *[0-9]+ /, "")
+        printf "%-20s -> %d\n", $0, count
+    }'
diff --git a/run_batch_analysis_analysis.sh b/run_batch_analysis_analysis.sh
new file mode 100755
index 0000000..1ebb4d3
--- /dev/null
+++ b/run_batch_analysis_analysis.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+#
+# Run batch_analysis_analysis.py on every results.csv found exactly one
+# directory below the target directory.
+#
+# Usage: ./run_batch_analysis_analysis.sh /path/to/target_dir
+#
+# The interpreter can be overridden via the PYTHON environment variable.
+
+set -euo pipefail
+
+if [[ $# -ne 1 ]]; then
+  echo "Usage: $0 /path/to/target_dir" >&2
+  exit 1
+fi
+
+TARGET_DIR="$1"
+
+if [[ ! -d "$TARGET_DIR" ]]; then
+  echo "Error: '$TARGET_DIR' is not a directory." >&2
+  exit 1
+fi
+
+# batch_analysis_analysis.py is tracked without the executable bit, so it is
+# run through the interpreter and located next to this script rather than
+# relative to the caller's working directory.
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
+ANALYSIS_SCRIPT="$SCRIPT_DIR/batch_analysis_analysis.py"
+PYTHON="${PYTHON:-python3}"
+
+# NUL-delimited so paths with unusual characters are passed through intact.
+find "$TARGET_DIR" -mindepth 2 -maxdepth 2 -type f -name results.csv -print0 |
+  while IFS= read -r -d '' csvfile; do
+    echo "Analyzing $csvfile"
+    # stdin is detached so the child cannot consume the remaining file list.
+    "$PYTHON" "$ANALYSIS_SCRIPT" -i "$csvfile" < /dev/null
+  done
diff --git a/run_batch_analyze.sh b/run_batch_analyze.sh
new file mode 100755
index 0000000..6dbc4d0
--- /dev/null
+++ b/run_batch_analyze.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+#
+# Run batch_analyze.py once per subdirectory of the trace root, passing the
+# lowercase prefix before the first underscore of its name as "<prefix>*" to -f.
+#
+# Usage: ./run_batch_analyze.sh /path/to/trace_root
+
+set -euo pipefail
+
+if [[ $# -ne 1 ]]; then
+  echo "Usage: $0 /path/to/trace_root" >&2
+  exit 1
+fi
+
+TRACE_ROOT="$1"
+
+if [[ ! -d "$TRACE_ROOT" ]]; then
+  echo "Error: '$TRACE_ROOT' is not a directory." >&2
+  exit 1
+fi
+
+for dir in "$TRACE_ROOT"/*; do
+  [[ -d "$dir" ]] || continue
+
+  dir_name=$(basename -- "$dir")
+  # Leading run of lowercase letters that is followed by an underscore.
+  # grep exits non-zero when nothing matches; "|| true" keeps errexit/pipefail
+  # from aborting the script so the warning below is actually reachable.
+  trace_type=$(grep -oP '^[a-z]+(?=_)' <<< "$dir_name" || true)
+  if [[ -z "$trace_type" ]]; then
+    echo "Warning: Could not extract type from '$dir_name', skipping." >&2
+    continue
+  fi
+
+  echo "Running batch_analyze.py on $dir with filter ${trace_type}*"
+  ./batch_analyze.py -d "$dir" -f "${trace_type}*"
+done