added a bunch of helper scripts

This commit is contained in:
Niklas Halle 2025-08-05 10:54:44 +00:00
parent f72408cd88
commit a24aeeffe7
6 changed files with 347 additions and 0 deletions

30
add_csv_header.sh Executable file
View file

@ -0,0 +1,30 @@
#!/usr/bin/env bash
# Prepend the standard results header to every results.csv found exactly one
# directory level below the given trace root, unless it is already present.
#
# Usage: ./add_csv_header.sh /path/to/trace_root
set -euo pipefail

if [[ $# -ne 1 ]]; then
    echo "Usage: $0 /path/to/trace_root"
    exit 1
fi

TRACE_ROOT="$1"
HEADER="experiment_name,chain,mean,std,min,max,count"

if [[ ! -d "$TRACE_ROOT" ]]; then
    echo "Error: '$TRACE_ROOT' is not a directory."
    exit 1
fi

# Find all results.csv files one level below the trace root
find "$TRACE_ROOT" -mindepth 2 -maxdepth 2 -type f -name results.csv | while IFS= read -r csvfile; do
    # Insert header only if not already present
    first_line=$(head -n 1 "$csvfile")
    if [[ "$first_line" != "$HEADER" ]]; then
        echo "Adding header to $csvfile"
        # BUG FIX: `sed -i "1i$HEADER"` silently does nothing on an empty
        # file (there is no line 1 to anchor the insert on). Prepending via
        # a temp file works for empty and non-empty files alike.
        tmpfile=$(mktemp)
        printf '%s\n' "$HEADER" | cat - "$csvfile" > "$tmpfile"
        mv "$tmpfile" "$csvfile"
    else
        echo "Header already present in $csvfile, skipping."
    fi
done

View file

@ -27,6 +27,9 @@ def main():
# Group data by chain
chain_groups = df.groupby('chain')
# Prepare list to collect summary data for CSV export
summary_data = []
# For each chain, create a figure with five subplots for boxplots (mean, std, min, max, count)
for chain_name, chain_data in chain_groups:
fig, axs = plt.subplots(1, 5, figsize=(18, 6), constrained_layout=True)
@ -38,6 +41,17 @@ def main():
plot_data = chain_data[['mean', 'std', 'min', 'max', 'count']].copy()
plot_data.columns = ['Mean', 'Std', 'Min', 'Max', 'Count']
# Calculate summary statistics for CSV export
chain_summary = {
'chain': chain_name,
'mean_count': plot_data['Count'].mean(),
'mean_mean': plot_data['Mean'].mean(),
'mean_std': plot_data['Std'].mean(),
'mean_min': plot_data['Min'].mean(),
'mean_max': plot_data['Max'].mean()
}
summary_data.append(chain_summary)
# Make all plots have the same color palette
palette = sns.color_palette("husl", 4)
# Add a distinct color for the 'Count' plot, as it is a different metric
@ -114,7 +128,13 @@ def main():
print(f"\nSummary for chain: {chain_name}")
print(summary[['mean', 'std', 'min', 'max', 'count']])
# Create and save the summary CSV
summary_df = pd.DataFrame(summary_data)
summary_csv_file = args.input.replace('.csv', '_summary.csv')
summary_df.to_csv(summary_csv_file, index=False)
print(f"\nAnalysis complete. Plots saved with base name: {args.input.replace('.csv', '_chain_*_analysis.png')}")
print(f"Summary CSV saved as: {summary_csv_file}")
if __name__ == "__main__":
main()

214
csv2table.py Executable file
View file

@ -0,0 +1,214 @@
#!/usr/bin/env python3
import pandas as pd
import numpy as np
def csv_to_latex_table(csv_file_path, output_file_path=None, decimal_places=2):
    """
    Convert a CSV file to a complete LaTeX document containing a longtable.

    Args:
        csv_file_path (str): Path to the input CSV file; must contain a
            'chain' column.
        output_file_path (str, optional): Path to save the LaTeX output.
        decimal_places (int): Number of decimal places for numeric values.

    Returns:
        str: LaTeX table code (full document, ready to compile).
    """
    # Read the CSV file
    df = pd.read_csv(csv_file_path)

    # Create a more readable version of the chain column
    df['chain'] = df['chain'].str.replace('/input/', '').str.replace('/output/', '')
    df['chain'] = df['chain'].str.replace('/', ' ')

    # Round numeric columns to specified decimal places
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    df[numeric_columns] = df[numeric_columns].round(decimal_places)

    latex_code = []

    # Document setup (optional - can be removed if embedding in existing document)
    latex_code.append("\\documentclass{article}")
    latex_code.append("\\usepackage{booktabs}")
    latex_code.append("\\usepackage{array}")
    latex_code.append("\\usepackage{longtable}")
    latex_code.append("\\begin{document}")
    latex_code.append("")

    # Table setup: left-align the chain column, right-align the numbers.
    num_cols = len(df.columns)
    col_spec = "l" + "r" * (num_cols - 1)
    latex_code.append("\\begin{longtable}{" + col_spec + "}")
    latex_code.append("\\toprule")

    # Human-readable header row ('chain' -> 'Chain', 'mean_count' -> 'Mean Count').
    headers = []
    for col in df.columns:
        if col == 'chain':
            headers.append("Chain")
        else:
            headers.append(col.replace('_', ' ').title())
    latex_code.append(" & ".join(headers) + " \\\\")
    latex_code.append("\\midrule")
    latex_code.append("\\endfirsthead")
    latex_code.append("")

    # Header repeated on continuation pages
    latex_code.append("\\multicolumn{" + str(num_cols) + "}{c}")
    latex_code.append("{\\tablename\\ \\thetable{} -- continued from previous page} \\\\")
    latex_code.append("\\toprule")
    latex_code.append(" & ".join(headers) + " \\\\")
    latex_code.append("\\midrule")
    latex_code.append("\\endhead")
    latex_code.append("")

    # Footer for non-final pages
    latex_code.append("\\midrule")
    latex_code.append("\\multicolumn{" + str(num_cols) + "}{r}{Continued on next page} \\\\")
    latex_code.append("\\endfoot")
    latex_code.append("")

    # Final footer
    latex_code.append("\\bottomrule")
    latex_code.append("\\endlastfoot")
    latex_code.append("")

    # Add data rows
    for _, row in df.iterrows():
        row_data = []
        for i, value in enumerate(row):
            if i == 0:
                # Chain column - use texttt for monospace.
                # BUG FIX: the original called str(value).split(''), which
                # always raises ValueError (empty separator), so no row
                # could ever be emitted. Split on whitespace instead (the
                # '/' separators were replaced with spaces above); the
                # originally intended separator was presumably lost in
                # transit — TODO confirm against the desired output.
                chain_parts = str(value).split(' ', 1)
                if len(chain_parts) == 2:
                    formatted_chain = f"\\texttt{{{chain_parts[0].strip()}}}\\texttt{{{chain_parts[1].strip()}}}"
                else:
                    formatted_chain = f"\\texttt{{{str(value)}}}"
                row_data.append(formatted_chain)
            else:
                row_data.append(str(value))
        latex_code.append(" & ".join(row_data) + " \\\\")

    latex_code.append("\\end{longtable}")
    latex_code.append("")
    latex_code.append("\\end{document}")

    # Join all lines
    latex_output = "\n".join(latex_code)

    # Save to file if path is provided
    if output_file_path:
        with open(output_file_path, 'w', encoding='utf-8') as f:
            f.write(latex_output)
        print(f"LaTeX table saved to {output_file_path}")

    return latex_output
def csv_to_latex_table_simple(csv_file_path, decimal_places=2):
    """
    Convert CSV to a bare LaTeX tabular (no document wrapper, for embedding).

    Args:
        csv_file_path (str): Path to the input CSV file; must contain a
            'chain' column.
        decimal_places (int): Number of decimal places for numeric values.

    Returns:
        str: LaTeX table code only.
    """
    # Read the CSV file
    df = pd.read_csv(csv_file_path)

    # Create a more readable version of the chain column
    df['chain'] = df['chain'].str.replace('/input/', '').str.replace('/output/', '')
    df['chain'] = df['chain'].str.replace('/', ' ')

    # Round numeric columns
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    df[numeric_columns] = df[numeric_columns].round(decimal_places)

    # Build table: left-align the chain column, right-align the numbers.
    latex_code = []
    num_cols = len(df.columns)
    col_spec = "l" + "r" * (num_cols - 1)
    latex_code.append("\\begin{tabular}{" + col_spec + "}")
    latex_code.append("\\toprule")

    # Human-readable headers
    headers = []
    for col in df.columns:
        if col == 'chain':
            headers.append("Chain")
        else:
            headers.append(col.replace('_', ' ').title())
    latex_code.append(" & ".join(headers) + " \\\\")
    latex_code.append("\\midrule")

    # Data rows
    for _, row in df.iterrows():
        row_data = []
        for i, value in enumerate(row):
            if i == 0:
                # Chain column, monospace.
                # BUG FIX: str(value).split('') always raises ValueError
                # (empty separator); split on whitespace instead — the
                # intended separator was presumably lost in transit.
                chain_parts = str(value).split(' ', 1)
                if len(chain_parts) == 2:
                    formatted_chain = f"\\texttt{{{chain_parts[0].strip()}}}\\texttt{{{chain_parts[1].strip()}}}"
                else:
                    formatted_chain = f"\\texttt{{{str(value)}}}"
                row_data.append(formatted_chain)
            else:
                row_data.append(str(value))
        latex_code.append(" & ".join(row_data) + " \\\\")

    latex_code.append("\\bottomrule")
    latex_code.append("\\end{tabular}")
    return "\n".join(latex_code)
if __name__ == "__main__":
    import argparse
    import sys

    # Command-line front end: convert one CSV to either a full LaTeX
    # document or a bare tabular for embedding.
    cli = argparse.ArgumentParser(description='Convert CSV file to LaTeX table')
    cli.add_argument('csv_file', help='Path to the input CSV file')
    cli.add_argument('-o', '--output', help='Output LaTeX file path (optional)')
    cli.add_argument('-d', '--decimals', type=int, default=2,
                     help='Number of decimal places for numeric values (default: 2)')
    cli.add_argument('-s', '--simple', action='store_true',
                     help='Generate simple table only (no document wrapper)')
    args = cli.parse_args()

    try:
        if args.simple:
            # Bare tabular goes to stdout; optionally also to a file.
            table = csv_to_latex_table_simple(args.csv_file, args.decimals)
            print(table)
            if args.output:
                with open(args.output, 'w', encoding='utf-8') as out_file:
                    out_file.write(table)
                print(f"\nSimple LaTeX table saved to {args.output}", file=sys.stderr)
        else:
            # Full document; the helper writes the file itself when asked,
            # so only echo to stdout when no output path was given.
            document = csv_to_latex_table(args.csv_file, args.output, args.decimals)
            if not args.output:
                print(document)
    except FileNotFoundError:
        print(f"Error: CSV file '{args.csv_file}' not found.", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error: {str(e)}", file=sys.stderr)
        sys.exit(1)

29
csvfix.sh Executable file
View file

@ -0,0 +1,29 @@
#!/bin/bash
# Summarise the first column of a CSV file: print the number of distinct
# prefixes and a per-prefix occurrence count, most frequent first.

# Require exactly one argument: the CSV file to inspect.
if [ $# -eq 0 ]; then
    echo "Usage: $0 <csv_filename>"
    echo "Example: $0 data.csv"
    exit 1
fi

CSV_FILE="$1"

# The file must exist before we try to read it.
if [ ! -f "$CSV_FILE" ]; then
    echo "Error: File '$CSV_FILE' not found!"
    exit 1
fi

echo "Processing file: $CSV_FILE"
echo "========================================"

# Number of distinct values in column one (sort -u == sort | uniq).
echo -e "\n=== Total number of unique prefixes ==="
cut -d',' -f1 "$CSV_FILE" | sort -u | wc -l

# Per-prefix counts, descending, aligned into a readable table.
echo "=== Formatted output ==="
echo "Prefix -> Count"
echo "---------------"
cut -d',' -f1 "$CSV_FILE" | sort | uniq -c | sort -nr | awk '{printf "%-20s -> %d\n", $2, $1}'

23
run_batch_analysis_analysis.sh Executable file
View file

@ -0,0 +1,23 @@
#!/usr/bin/env bash
# Run batch_analysis_analysis.py on every results.csv found exactly one
# directory level below the given target directory.
#
# Usage: ./run_batch_analysis_analysis.sh /path/to/target_dir
set -euo pipefail

if [[ $# -ne 1 ]]; then
    echo "Usage: $0 /path/to/target_dir"
    exit 1
fi

TARGET_DIR="$1"

if [[ ! -d "$TARGET_DIR" ]]; then
    echo "Error: '$TARGET_DIR' is not a directory."
    exit 1
fi

# Feed the loop via process substitution (rather than a pipeline) so the
# loop body runs in the current shell, not a subshell.
while IFS= read -r csvfile; do
    echo "Analyzing $csvfile"
    ./batch_analysis_analysis.py -i "$csvfile"
done < <(find "$TARGET_DIR" -mindepth 2 -maxdepth 2 -type f -name results.csv)

31
run_batch_analyze.sh Executable file
View file

@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Run batch_analyze.py on every subdirectory of the trace root, using the
# lowercase prefix of the directory name (everything before the first
# underscore) as the experiment-type filter.
#
# Usage: ./run_batch_analyze.sh /path/to/trace_root
set -euo pipefail

if [[ $# -ne 1 ]]; then
    echo "Usage: $0 /path/to/trace_root"
    exit 1
fi

TRACE_ROOT="$1"

if [[ ! -d "$TRACE_ROOT" ]]; then
    echo "Error: '$TRACE_ROOT' is not a directory."
    exit 1
fi

for dir in "$TRACE_ROOT"/*; do
    if [[ -d "$dir" ]]; then
        dirname=$(basename "$dir")
        # Extract everything before the first underscore as the type.
        # BUG FIX: the original used `type=$(echo ... | grep -oP ...)`;
        # under `set -euo pipefail` a non-matching grep exits non-zero and
        # aborts the entire script, making the "skipping" warning below
        # unreachable. A bash regex test inside `if` sets BASH_REMATCH
        # without tripping errexit, and drops the GNU-grep PCRE dependency.
        if [[ "$dirname" =~ ^([a-z]+)_ ]]; then
            type="${BASH_REMATCH[1]}"
        else
            echo "Warning: Could not extract type from '$dirname', skipping."
            continue
        fi
        echo "Running batch_analyze.py on $dir with filter ${type}*"
        ./batch_analyze.py -d "$dir" -f "${type}*"
    fi
done