# llm-rag-ds-optimizer/scripts/plot_results.py
"""Plot benchmark results and save to PNG, export to CSV."""
import csv
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
def load_results(result_dir: Path = Path("benchmarks/results")) -> dict:
    """Load all benchmark results."""
    results = {}
    if not result_dir.exists():
        return results
    # Load old-style results (flat JSON files)
    for json_file in result_dir.glob("*.json"):
        if "benchmark" in json_file.stem:
            with open(json_file) as f:
                data = json.load(f)
            benchmark_name = data.get("benchmark", json_file.stem.replace("_benchmark", ""))
            results[benchmark_name] = data
    # Load new-style results (corpus/date/results.json)
    for corpus_dir in result_dir.iterdir():
        if corpus_dir.is_dir():
            for date_dir in corpus_dir.iterdir():
                if date_dir.is_dir():
                    results_file = date_dir / "results.json"
                    if results_file.exists():
                        with open(results_file) as f:
                            data_list = json.load(f)
                        if isinstance(data_list, list) and data_list:
                            # Use the first result as representative (no aggregation yet)
                            key = f"{corpus_dir.name}_{date_dir.name}"
                            results[key] = data_list[0]
    return results
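
# The loader above expects two result layouts (inferred from the keys it
# reads below; treat this as an assumption, not a fixed schema):
#   - flat:   benchmarks/results/<name>_benchmark.json, a dict with an
#             optional "benchmark" key and metric fields such as
#             "search_p50_ms" or "attach_p50_ms"
#   - nested: benchmarks/results/<corpus>/<date>/results.json, a JSON list
#             of result dicts; only the first entry is kept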
def export_to_csv(results: dict, output_file: Path = Path("benchmarks/results/benchmark_results.csv")):
    """Export benchmark results to a wide-format CSV."""
    output_file.parent.mkdir(parents=True, exist_ok=True)
    rows = []
    for bench_name, data in results.items():
        # Headline percentiles: take the first metric present for this
        # benchmark type. NOTE: the `or` chain also skips values that are
        # exactly 0.0, since 0.0 is falsy.
        row = {
            "benchmark": bench_name,
            "p50_ms": (data.get("attach_p50_ms") or data.get("search_p50_ms")
                       or data.get("batch_p50_ms") or data.get("build_p50_ms") or 0.0),
            "p95_ms": (data.get("attach_p95_ms") or data.get("search_p95_ms")
                       or data.get("batch_p95_ms") or data.get("build_p95_ms") or 0.0),
            "p99_ms": (data.get("attach_p99_ms") or data.get("search_p99_ms")
                       or data.get("batch_p99_ms") or data.get("build_p99_ms") or 0.0),
            "peak_rss_mb": data.get("peak_rss_mb", 0.0),
            "memory_delta_mb": data.get("memory_delta_mb", 0.0),
        }
        # Add benchmark-specific metrics when present
        if "attach_p50_ms" in data:
            row.update({
                "attach_p50_ms": data.get("attach_p50_ms", 0),
                "attach_p95_ms": data.get("attach_p95_ms", 0),
                "attach_p99_ms": data.get("attach_p99_ms", 0),
                "get_p50_ms": data.get("get_p50_ms", 0),
                "get_p95_ms": data.get("get_p95_ms", 0),
                "get_p99_ms": data.get("get_p99_ms", 0),
            })
        if "search_p50_ms" in data:
            row.update({
                "search_p50_ms": data.get("search_p50_ms", 0),
                "search_p95_ms": data.get("search_p95_ms", 0),
                "search_p99_ms": data.get("search_p99_ms", 0),
            })
        if "build_peak_rss_mb" in data:
            row["build_peak_rss_mb"] = data.get("build_peak_rss_mb", 0.0)
        rows.append(row)
    if rows:
        # Union of keys across rows, sorted for a stable column order;
        # csv.DictWriter fills missing cells with "" by default.
        fieldnames = sorted({key for row in rows for key in row})
        with open(output_file, "w", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows)
        print(f"Results exported to CSV: {output_file}")
def plot_latency_distribution(results: dict, output_dir: Path = Path("benchmarks/figures")):
    """Plot latency percentiles as grouped bars, one group per benchmark."""
    output_dir.mkdir(parents=True, exist_ok=True)
    benchmarks = []
    p50_values = []
    p95_values = []
    p99_values = []
    for name, data in results.items():
        # Try the metric names used by the different benchmark types
        p50 = (data.get("search_p50_ms") or data.get("attach_p50_ms")
               or data.get("batch_p50_ms") or data.get("build_p50_ms") or 0)
        p95 = (data.get("search_p95_ms") or data.get("attach_p95_ms")
               or data.get("batch_p95_ms") or data.get("build_p95_ms") or 0)
        p99 = (data.get("search_p99_ms") or data.get("attach_p99_ms")
               or data.get("batch_p99_ms") or data.get("build_p99_ms") or 0)
        if p50 > 0 or p95 > 0 or p99 > 0:
            benchmarks.append(name)
            p50_values.append(p50)
            p95_values.append(p95)
            p99_values.append(p99)
    if benchmarks:
        fig, ax = plt.subplots(figsize=(12, 7))
        x = range(len(benchmarks))
        width = 0.25
        ax.bar([i - width for i in x], p50_values, width, label="P50", alpha=0.8, color="#2ecc71")
        ax.bar(x, p95_values, width, label="P95", alpha=0.8, color="#3498db")
        ax.bar([i + width for i in x], p99_values, width, label="P99", alpha=0.8, color="#e74c3c")
        ax.set_xlabel("Benchmark", fontsize=12, fontweight="bold")
        ax.set_ylabel("Latency (ms)", fontsize=12, fontweight="bold")
        ax.set_title("Latency Percentiles by Benchmark", fontsize=14, fontweight="bold")
        ax.set_xticks(x)
        ax.set_xticklabels(benchmarks, rotation=45, ha="right")
        ax.legend(fontsize=10)
        ax.grid(True, alpha=0.3, linestyle="--")
        # Add value labels on bars
        for i, (p50, p95, p99) in enumerate(zip(p50_values, p95_values, p99_values)):
            if p50 > 0:
                ax.text(i - width, p50, f"{p50:.2f}", ha="center", va="bottom", fontsize=8)
            if p95 > 0:
                ax.text(i, p95, f"{p95:.2f}", ha="center", va="bottom", fontsize=8)
            if p99 > 0:
                ax.text(i + width, p99, f"{p99:.2f}", ha="center", va="bottom", fontsize=8)
        plt.tight_layout()
        output_file = output_dir / "latency_distribution.png"
        plt.savefig(output_file, dpi=300, bbox_inches="tight")
        print(f"Latency plot saved to {output_file}")
        plt.close()
def plot_comparison_chart(results: dict, output_dir: Path = Path("benchmarks/figures")):
    """Plot a horizontal-bar comparison of all benchmarks by P95 latency."""
    output_dir.mkdir(parents=True, exist_ok=True)
    benchmarks = []
    p95_latencies = []
    for name, data in results.items():
        p95 = (data.get("search_p95_ms") or data.get("attach_p95_ms")
               or data.get("batch_p95_ms") or data.get("build_p95_ms") or 0)
        if p95 > 0:
            benchmarks.append(name)
            p95_latencies.append(p95)
    if benchmarks:
        fig, ax = plt.subplots(figsize=(10, 6))
        # Sample the colormap evenly across [0, 1]; passing integer indices
        # would pick near-identical colors from the low end of the map.
        colors = plt.cm.viridis(np.linspace(0, 1, len(benchmarks)))
        bars = ax.barh(benchmarks, p95_latencies, color=colors, alpha=0.8)
        ax.set_xlabel("P95 Latency (ms)", fontsize=12, fontweight="bold")
        ax.set_title("Benchmark Performance Comparison (P95 Latency)", fontsize=14, fontweight="bold")
        ax.grid(True, alpha=0.3, linestyle="--", axis="x")
        # Add value labels
        for bar, latency in zip(bars, p95_latencies):
            width = bar.get_width()
            ax.text(width, bar.get_y() + bar.get_height() / 2, f"{latency:.2f}ms",
                    ha="left", va="center", fontsize=9, fontweight="bold")
        plt.tight_layout()
        output_file = output_dir / "benchmark_comparison.png"
        plt.savefig(output_file, dpi=300, bbox_inches="tight")
        print(f"Comparison plot saved to {output_file}")
        plt.close()
def plot_memory_usage(results: dict, output_dir: Path = Path("benchmarks/figures")):
    """Plot memory usage (peak RSS and allocation delta) by benchmark."""
    output_dir.mkdir(parents=True, exist_ok=True)
    benchmarks = []
    peak_rss_values = []
    memory_delta_values = []
    for name, data in results.items():
        peak_rss = data.get("peak_rss_mb", 0.0)
        memory_delta = data.get("memory_delta_mb", 0.0)
        if peak_rss > 0:
            benchmarks.append(name)
            peak_rss_values.append(peak_rss)
            memory_delta_values.append(memory_delta)
    if benchmarks:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
        # Plot 1: Peak RSS. As above, sample the colormaps across [0, 1]
        # rather than with integer indices.
        colors1 = plt.cm.plasma(np.linspace(0, 1, len(benchmarks)))
        bars1 = ax1.barh(benchmarks, peak_rss_values, color=colors1, alpha=0.8)
        ax1.set_xlabel("Peak RSS (MB)", fontsize=12, fontweight="bold")
        ax1.set_title("Peak Memory Usage by Benchmark", fontsize=14, fontweight="bold")
        ax1.grid(True, alpha=0.3, linestyle="--", axis="x")
        # Add value labels
        for bar, rss in zip(bars1, peak_rss_values):
            width = bar.get_width()
            ax1.text(width, bar.get_y() + bar.get_height() / 2, f"{rss:.2f}MB",
                     ha="left", va="center", fontsize=9, fontweight="bold")
        # Plot 2: Memory Delta
        colors2 = plt.cm.coolwarm(np.linspace(0, 1, len(benchmarks)))
        bars2 = ax2.barh(benchmarks, memory_delta_values, color=colors2, alpha=0.8)
        ax2.set_xlabel("Memory Delta (MB)", fontsize=12, fontweight="bold")
        ax2.set_title("Memory Allocation Delta by Benchmark", fontsize=14, fontweight="bold")
        ax2.grid(True, alpha=0.3, linestyle="--", axis="x")
        # Add value labels
        for bar, delta in zip(bars2, memory_delta_values):
            width = bar.get_width()
            ax2.text(width, bar.get_y() + bar.get_height() / 2, f"{delta:.2f}MB",
                     ha="left", va="center", fontsize=9, fontweight="bold")
        plt.tight_layout()
        output_file = output_dir / "memory_usage.png"
        plt.savefig(output_file, dpi=300, bbox_inches="tight")
        print(f"Memory usage plot saved to {output_file}")
        plt.close()
if __name__ == "__main__":
    results = load_results()
    if results:
        export_to_csv(results)
        plot_latency_distribution(results)
        plot_comparison_chart(results)
        plot_memory_usage(results)
        print(f"\nProcessed {len(results)} benchmark results")
    else:
        print("No benchmark results found. Run benchmarks first.")