Initial commit: LLM-DS optimizer framework with data files excluded

This commit is contained in:
Carlos Gutierrez
2025-11-06 22:20:11 -05:00
commit f83fe475df
52 changed files with 10666 additions and 0 deletions

196
scripts/analyze_variance.py Normal file
View File

@@ -0,0 +1,196 @@
"""Analyze variance in benchmark results and identify flaky benchmarks."""
import argparse
import json
from pathlib import Path
from typing import Any
import numpy as np
try:
from scipy import stats
HAS_SCIPY = True
except ImportError:
HAS_SCIPY = False
def load_benchmark_results(results_file: Path) -> list[dict]:
    """
    Load benchmark results from a JSON file.

    Args:
        results_file: Path to the JSON file to read

    Returns:
        The decoded JSON document. It is annotated as a list of result
        dictionaries, but the structure is not validated here.
    """
    # JSON text is UTF-8 by specification; decode it explicitly instead of
    # relying on the platform's locale encoding (e.g. cp1252 on Windows).
    with open(results_file, encoding="utf-8") as f:
        return json.load(f)
def identify_flaky_configurations(
results: list[dict],
cv_threshold: float = 20.0,
metrics: list[str] | None = None,
) -> list[dict[str, Any]]:
"""
Identify flaky benchmark configurations based on coefficient of variation.
Args:
results: List of aggregated result dictionaries
cv_threshold: CV threshold (%) above which a benchmark is considered flaky
metrics: List of metrics to check (default: critical metrics)
Returns:
List of flaky configuration summaries
"""
if metrics is None:
metrics = ["search_p50_ms", "search_p95_ms", "qps"]
flaky_configs = []
for result in results:
flaky_metrics = []
for metric in metrics:
cv_key = f"{metric}_cv"
if cv_key in result:
cv = result[cv_key]
if cv > cv_threshold:
mean_val = result.get(f"{metric}_mean", 0)
std_val = result.get(f"{metric}_std", 0)
flaky_metrics.append({
"metric": metric,
"mean": mean_val,
"std": std_val,
"cv": cv,
})
if flaky_metrics:
flaky_configs.append({
"corpus": result.get("corpus"),
"size": result.get("size"),
"ef_search": result.get("ef_search"),
"M": result.get("M"),
"repetitions": result.get("repetitions"),
"flaky_metrics": flaky_metrics,
})
return flaky_configs
def generate_variance_report(
    aggregated_file: Path,
    output_file: Path | None = None,
    cv_threshold: float = 20.0,
) -> dict[str, Any]:
    """
    Generate a variance analysis report.

    Args:
        aggregated_file: Path to aggregated results JSON
        output_file: Optional output file for report
        cv_threshold: CV threshold for flaky detection

    Returns:
        Report dictionary with a ``summary`` section (configuration counts,
        flaky percentage, average and maximum CV), the list of
        ``flaky_configurations`` and per-corpus counts under ``by_corpus``.
        When the results file holds no results, ``{"error": "No results found"}``
        is returned and nothing is written.
    """
    results = load_benchmark_results(aggregated_file)
    if not results:
        return {"error": "No results found"}

    # Overall statistics use every numeric "*_cv" value found in any result.
    all_cvs = [
        value
        for result in results
        for key, value in result.items()
        if key.endswith("_cv") and isinstance(value, (int, float))
    ]

    # Classify each result exactly once and feed both the overall flaky list
    # and the per-corpus tally from that single answer. The helper returns an
    # empty list for a non-flaky result, so it must be tested for emptiness
    # rather than indexed with [0].
    flaky_configs = []
    by_corpus = {}
    for result in results:
        flaky = identify_flaky_configurations([result], cv_threshold)
        flaky_configs.extend(flaky)

        tally = by_corpus.setdefault(
            result.get("corpus", "unknown"),
            {"count": 0, "flaky_count": 0},
        )
        tally["count"] += 1
        if flaky:
            tally["flaky_count"] += 1

    report = {
        "summary": {
            "total_configurations": len(results),
            "flaky_configurations": len(flaky_configs),
            "flaky_percentage": len(flaky_configs) / len(results) * 100,
            "average_cv": float(np.mean(all_cvs)) if all_cvs else 0.0,
            "max_cv": float(np.max(all_cvs)) if all_cvs else 0.0,
        },
        "flaky_configurations": flaky_configs,
        "by_corpus": by_corpus,
    }

    if output_file:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2)
        print(f"Variance report saved to {output_file}")

    return report
def main():
    """
    Command-line entry point.

    Parses ``--results``, ``--output`` and ``--cv-threshold``, generates the
    variance report and prints a human-readable summary that lists at most
    the first ten flaky configurations.
    """
    parser = argparse.ArgumentParser(description="Analyze variance in benchmark results")
    parser.add_argument(
        "--results",
        type=Path,
        required=True,
        help="Path to aggregated results JSON file"
    )
    parser.add_argument(
        "--output",
        type=Path,
        help="Output file for variance report"
    )
    parser.add_argument(
        "--cv-threshold",
        type=float,
        default=20.0,
        # argparse %-formats help strings, so a literal percent sign has to
        # be written as "%%"; a bare "%" makes --help raise ValueError.
        help="Coefficient of variation threshold (%%) for flaky detection (default: 20.0)"
    )
    args = parser.parse_args()

    if not args.results.exists():
        print(f"Error: Results file not found: {args.results}")
        return

    report = generate_variance_report(
        aggregated_file=args.results,
        output_file=args.output,
        cv_threshold=args.cv_threshold,
    )

    # An empty results file yields an error report; surface it instead of
    # printing an all-zero summary that claims nothing is flaky.
    if "error" in report:
        print(f"Error: {report['error']}")
        return

    # Print summary
    print("\n" + "="*70)
    print("Variance Analysis Report")
    print("="*70)
    summary = report.get("summary", {})
    print(f"Total configurations: {summary.get('total_configurations', 0)}")
    print(f"Flaky configurations: {summary.get('flaky_configurations', 0)} ({summary.get('flaky_percentage', 0):.1f}%)")
    print(f"Average CV: {summary.get('average_cv', 0):.2f}%")
    print(f"Max CV: {summary.get('max_cv', 0):.2f}%")

    flaky = report.get("flaky_configurations", [])
    if flaky:
        print(f"\n⚠️ Flaky Configurations ({len(flaky)}):")
        for config in flaky[:10]:  # Show first 10
            print(f" - {config.get('corpus')} (size={config.get('size')}, ef={config.get('ef_search')}, M={config.get('M')}):")
            for metric in config.get("flaky_metrics", []):
                print(f"{metric['metric']}: CV={metric['cv']:.1f}% (mean={metric['mean']:.2f}±{metric['std']:.2f})")
        if len(flaky) > 10:
            print(f" ... and {len(flaky) - 10} more")
    else:
        print("\n✅ No flaky configurations detected!")
    print("="*70)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()