# llm-rag-ds-optimizer/scripts/plot_results.py
"""Plot benchmark results and save to PNG, export to CSV."""
import csv
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
def load_results(result_dir: Path = Path("benchmarks/results")) -> dict:
    """Load all benchmark results."""
    results = {}
    if not result_dir.exists():
        return results
    # Load old-style results (flat JSON files)
    for json_file in result_dir.glob("*.json"):
        if "benchmark" in json_file.stem:
            with open(json_file) as f:
                data = json.load(f)
            benchmark_name = data.get("benchmark", json_file.stem.replace("_benchmark", ""))
            results[benchmark_name] = data
    # Load new-style results (corpus/date/results.json)
    for corpus_dir in result_dir.iterdir():
        if corpus_dir.is_dir():
            for date_dir in corpus_dir.iterdir():
                if date_dir.is_dir():
                    results_file = date_dir / "results.json"
                    if results_file.exists():
                        with open(results_file) as f:
                            data_list = json.load(f)
                        if isinstance(data_list, list) and data_list:
                            # Use the first result as representative (no aggregation yet)
                            key = f"{corpus_dir.name}_{date_dir.name}"
                            results[key] = data_list[0]
    return results
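
# The loader above expects two result layouts (inferred from the keys it
# reads below; treat this as an assumption, not a fixed schema):
#   - flat:   benchmarks/results/<name>_benchmark.json, a dict with an
#             optional "benchmark" key and metric fields such as
#             "search_p50_ms" or "attach_p50_ms"
#   - nested: benchmarks/results/<corpus>/<date>/results.json, a JSON list
#             of result dicts; only the first entry is kept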
def export_to_csv(results: dict, output_file: Path = Path("benchmarks/results/benchmark_results.csv")):
    """Export benchmark results to a wide-format CSV."""
    output_file.parent.mkdir(parents=True, exist_ok=True)
    rows = []
    for bench_name, data in results.items():
        # Headline percentiles: take the first metric present for this
        # benchmark type. NOTE: the `or` chain also skips values that are
        # exactly 0.0, since 0.0 is falsy.
        row = {
            "benchmark": bench_name,
            "p50_ms": (data.get("attach_p50_ms") or data.get("search_p50_ms")
                       or data.get("batch_p50_ms") or data.get("build_p50_ms") or 0.0),
            "p95_ms": (data.get("attach_p95_ms") or data.get("search_p95_ms")
                       or data.get("batch_p95_ms") or data.get("build_p95_ms") or 0.0),
            "p99_ms": (data.get("attach_p99_ms") or data.get("search_p99_ms")
                       or data.get("batch_p99_ms") or data.get("build_p99_ms") or 0.0),
            "peak_rss_mb": data.get("peak_rss_mb", 0.0),
            "memory_delta_mb": data.get("memory_delta_mb", 0.0),
        }
        # Add benchmark-specific metrics when present
        if "attach_p50_ms" in data:
            row.update({
                "attach_p50_ms": data.get("attach_p50_ms", 0),
                "attach_p95_ms": data.get("attach_p95_ms", 0),
                "attach_p99_ms": data.get("attach_p99_ms", 0),
                "get_p50_ms": data.get("get_p50_ms", 0),
                "get_p95_ms": data.get("get_p95_ms", 0),
                "get_p99_ms": data.get("get_p99_ms", 0),
            })
        if "search_p50_ms" in data:
            row.update({
                "search_p50_ms": data.get("search_p50_ms", 0),
                "search_p95_ms": data.get("search_p95_ms", 0),
                "search_p99_ms": data.get("search_p99_ms", 0),
            })
        if "build_peak_rss_mb" in data:
            row["build_peak_rss_mb"] = data.get("build_peak_rss_mb", 0.0)
        rows.append(row)
    if rows:
        # Union of keys across rows, sorted for a stable column order;
        # csv.DictWriter fills missing cells with "" by default.
        fieldnames = sorted({key for row in rows for key in row})
        with open(output_file, "w", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows)
        print(f"Results exported to CSV: {output_file}")
def plot_latency_distribution(results: dict, output_dir: Path = Path("benchmarks/figures")):
    """Plot latency percentiles as grouped bars, one group per benchmark."""
    output_dir.mkdir(parents=True, exist_ok=True)
    benchmarks = []
    p50_values = []
    p95_values = []
    p99_values = []
    for name, data in results.items():
        # Try the metric names used by the different benchmark types
        p50 = (data.get("search_p50_ms") or data.get("attach_p50_ms")
               or data.get("batch_p50_ms") or data.get("build_p50_ms") or 0)
        p95 = (data.get("search_p95_ms") or data.get("attach_p95_ms")
               or data.get("batch_p95_ms") or data.get("build_p95_ms") or 0)
        p99 = (data.get("search_p99_ms") or data.get("attach_p99_ms")
               or data.get("batch_p99_ms") or data.get("build_p99_ms") or 0)
        if p50 > 0 or p95 > 0 or p99 > 0:
            benchmarks.append(name)
            p50_values.append(p50)
            p95_values.append(p95)
            p99_values.append(p99)
    if benchmarks:
        fig, ax = plt.subplots(figsize=(12, 7))
        x = range(len(benchmarks))
        width = 0.25
        ax.bar([i - width for i in x], p50_values, width, label="P50", alpha=0.8, color="#2ecc71")
        ax.bar(x, p95_values, width, label="P95", alpha=0.8, color="#3498db")
        ax.bar([i + width for i in x], p99_values, width, label="P99", alpha=0.8, color="#e74c3c")
        ax.set_xlabel("Benchmark", fontsize=12, fontweight="bold")
        ax.set_ylabel("Latency (ms)", fontsize=12, fontweight="bold")
        ax.set_title("Latency Percentiles by Benchmark", fontsize=14, fontweight="bold")
        ax.set_xticks(x)
        ax.set_xticklabels(benchmarks, rotation=45, ha="right")
        ax.legend(fontsize=10)
        ax.grid(True, alpha=0.3, linestyle="--")
        # Add value labels on bars
        for i, (p50, p95, p99) in enumerate(zip(p50_values, p95_values, p99_values)):
            if p50 > 0:
                ax.text(i - width, p50, f"{p50:.2f}", ha="center", va="bottom", fontsize=8)
            if p95 > 0:
                ax.text(i, p95, f"{p95:.2f}", ha="center", va="bottom", fontsize=8)
            if p99 > 0:
                ax.text(i + width, p99, f"{p99:.2f}", ha="center", va="bottom", fontsize=8)
        plt.tight_layout()
        output_file = output_dir / "latency_distribution.png"
        plt.savefig(output_file, dpi=300, bbox_inches="tight")
        print(f"Latency plot saved to {output_file}")
        plt.close()
def plot_comparison_chart(results: dict, output_dir: Path = Path("benchmarks/figures")):
    """Plot a horizontal-bar comparison of all benchmarks by P95 latency."""
    output_dir.mkdir(parents=True, exist_ok=True)
    benchmarks = []
    p95_latencies = []
    for name, data in results.items():
        p95 = (data.get("search_p95_ms") or data.get("attach_p95_ms")
               or data.get("batch_p95_ms") or data.get("build_p95_ms") or 0)
        if p95 > 0:
            benchmarks.append(name)
            p95_latencies.append(p95)
    if benchmarks:
        fig, ax = plt.subplots(figsize=(10, 6))
        # Sample the colormap evenly across [0, 1]; passing integer indices
        # would pick near-identical colors from the low end of the map.
        colors = plt.cm.viridis(np.linspace(0, 1, len(benchmarks)))
        bars = ax.barh(benchmarks, p95_latencies, color=colors, alpha=0.8)
        ax.set_xlabel("P95 Latency (ms)", fontsize=12, fontweight="bold")
        ax.set_title("Benchmark Performance Comparison (P95 Latency)", fontsize=14, fontweight="bold")
        ax.grid(True, alpha=0.3, linestyle="--", axis="x")
        # Add value labels
        for bar, latency in zip(bars, p95_latencies):
            width = bar.get_width()
            ax.text(width, bar.get_y() + bar.get_height() / 2, f"{latency:.2f}ms",
                    ha="left", va="center", fontsize=9, fontweight="bold")
        plt.tight_layout()
        output_file = output_dir / "benchmark_comparison.png"
        plt.savefig(output_file, dpi=300, bbox_inches="tight")
        print(f"Comparison plot saved to {output_file}")
        plt.close()
def plot_memory_usage(results: dict, output_dir: Path = Path("benchmarks/figures")):
    """Plot memory usage (peak RSS and allocation delta) by benchmark."""
    output_dir.mkdir(parents=True, exist_ok=True)
    benchmarks = []
    peak_rss_values = []
    memory_delta_values = []
    for name, data in results.items():
        peak_rss = data.get("peak_rss_mb", 0.0)
        memory_delta = data.get("memory_delta_mb", 0.0)
        if peak_rss > 0:
            benchmarks.append(name)
            peak_rss_values.append(peak_rss)
            memory_delta_values.append(memory_delta)
    if benchmarks:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
        # Plot 1: Peak RSS. As above, sample the colormaps across [0, 1]
        # rather than with integer indices.
        colors1 = plt.cm.plasma(np.linspace(0, 1, len(benchmarks)))
        bars1 = ax1.barh(benchmarks, peak_rss_values, color=colors1, alpha=0.8)
        ax1.set_xlabel("Peak RSS (MB)", fontsize=12, fontweight="bold")
        ax1.set_title("Peak Memory Usage by Benchmark", fontsize=14, fontweight="bold")
        ax1.grid(True, alpha=0.3, linestyle="--", axis="x")
        # Add value labels
        for bar, rss in zip(bars1, peak_rss_values):
            width = bar.get_width()
            ax1.text(width, bar.get_y() + bar.get_height() / 2, f"{rss:.2f}MB",
                     ha="left", va="center", fontsize=9, fontweight="bold")
        # Plot 2: Memory Delta
        colors2 = plt.cm.coolwarm(np.linspace(0, 1, len(benchmarks)))
        bars2 = ax2.barh(benchmarks, memory_delta_values, color=colors2, alpha=0.8)
        ax2.set_xlabel("Memory Delta (MB)", fontsize=12, fontweight="bold")
        ax2.set_title("Memory Allocation Delta by Benchmark", fontsize=14, fontweight="bold")
        ax2.grid(True, alpha=0.3, linestyle="--", axis="x")
        # Add value labels
        for bar, delta in zip(bars2, memory_delta_values):
            width = bar.get_width()
            ax2.text(width, bar.get_y() + bar.get_height() / 2, f"{delta:.2f}MB",
                     ha="left", va="center", fontsize=9, fontweight="bold")
        plt.tight_layout()
        output_file = output_dir / "memory_usage.png"
        plt.savefig(output_file, dpi=300, bbox_inches="tight")
        print(f"Memory usage plot saved to {output_file}")
        plt.close()
if __name__ == "__main__":
    results = load_results()
    if results:
        export_to_csv(results)
        plot_latency_distribution(results)
        plot_comparison_chart(results)
        plot_memory_usage(results)
        print(f"\nProcessed {len(results)} benchmark results")
    else:
        print("No benchmark results found. Run benchmarks first.")