Initial commit: LLM-DS optimizer framework with data files excluded
This commit is contained in:
196
scripts/analyze_variance.py
Normal file
196
scripts/analyze_variance.py
Normal file
@@ -0,0 +1,196 @@
|
||||
"""Analyze variance in benchmark results and identify flaky benchmarks."""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
try:
|
||||
from scipy import stats
|
||||
HAS_SCIPY = True
|
||||
except ImportError:
|
||||
HAS_SCIPY = False
|
||||
|
||||
|
||||
def load_benchmark_results(results_file: Path) -> list[dict]:
    """Read and deserialize benchmark results from a JSON file.

    Args:
        results_file: Path to the JSON results file.

    Returns:
        The parsed JSON payload (a list of result dictionaries).
    """
    return json.loads(results_file.read_text())
|
||||
|
||||
|
||||
def identify_flaky_configurations(
|
||||
results: list[dict],
|
||||
cv_threshold: float = 20.0,
|
||||
metrics: list[str] | None = None,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""
|
||||
Identify flaky benchmark configurations based on coefficient of variation.
|
||||
|
||||
Args:
|
||||
results: List of aggregated result dictionaries
|
||||
cv_threshold: CV threshold (%) above which a benchmark is considered flaky
|
||||
metrics: List of metrics to check (default: critical metrics)
|
||||
|
||||
Returns:
|
||||
List of flaky configuration summaries
|
||||
"""
|
||||
if metrics is None:
|
||||
metrics = ["search_p50_ms", "search_p95_ms", "qps"]
|
||||
|
||||
flaky_configs = []
|
||||
|
||||
for result in results:
|
||||
flaky_metrics = []
|
||||
for metric in metrics:
|
||||
cv_key = f"{metric}_cv"
|
||||
if cv_key in result:
|
||||
cv = result[cv_key]
|
||||
if cv > cv_threshold:
|
||||
mean_val = result.get(f"{metric}_mean", 0)
|
||||
std_val = result.get(f"{metric}_std", 0)
|
||||
flaky_metrics.append({
|
||||
"metric": metric,
|
||||
"mean": mean_val,
|
||||
"std": std_val,
|
||||
"cv": cv,
|
||||
})
|
||||
|
||||
if flaky_metrics:
|
||||
flaky_configs.append({
|
||||
"corpus": result.get("corpus"),
|
||||
"size": result.get("size"),
|
||||
"ef_search": result.get("ef_search"),
|
||||
"M": result.get("M"),
|
||||
"repetitions": result.get("repetitions"),
|
||||
"flaky_metrics": flaky_metrics,
|
||||
})
|
||||
|
||||
return flaky_configs
|
||||
|
||||
|
||||
def generate_variance_report(
    aggregated_file: Path,
    output_file: Path | None = None,
    cv_threshold: float = 20.0,
) -> dict[str, Any]:
    """
    Generate a variance analysis report.

    Args:
        aggregated_file: Path to aggregated results JSON
        output_file: Optional output file for report
        cv_threshold: CV threshold for flaky detection

    Returns:
        Report dictionary with overall summary statistics, the list of flaky
        configurations, and per-corpus counts (or {"error": ...} when the
        input file contains no results).
    """
    results = load_benchmark_results(aggregated_file)

    if not results:
        return {"error": "No results found"}

    # Collect every numeric "*_cv" value across all results for overall stats.
    all_cvs = [
        value
        for result in results
        for key, value in result.items()
        if key.endswith("_cv") and isinstance(value, (int, float))
    ]

    # Identify flaky configurations
    flaky_configs = identify_flaky_configurations(results, cv_threshold)

    # Group results by corpus name.
    by_corpus: dict[str, list[dict]] = {}
    for result in results:
        by_corpus.setdefault(result.get("corpus", "unknown"), []).append(result)

    report = {
        "summary": {
            "total_configurations": len(results),
            "flaky_configurations": len(flaky_configs),
            # `results` is guaranteed non-empty here (early return above).
            "flaky_percentage": len(flaky_configs) / len(results) * 100,
            "average_cv": float(np.mean(all_cvs)) if all_cvs else 0.0,
            "max_cv": float(np.max(all_cvs)) if all_cvs else 0.0,
        },
        "flaky_configurations": flaky_configs,
        "by_corpus": {
            corpus: {
                "count": len(configs),
                # BUG FIX: the previous code indexed [0] into
                # identify_flaky_configurations([c], ...), which raised
                # IndexError for any non-flaky config (the helper returns an
                # empty list). A config is flaky iff the helper returns a
                # non-empty list for it.
                "flaky_count": sum(
                    1
                    for c in configs
                    if identify_flaky_configurations([c], cv_threshold)
                ),
            }
            for corpus, configs in by_corpus.items()
        },
    }

    if output_file:
        with open(output_file, "w") as f:
            json.dump(report, f, indent=2)
        print(f"Variance report saved to {output_file}")

    return report
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, run the analysis, and print a summary."""
    parser = argparse.ArgumentParser(description="Analyze variance in benchmark results")
    parser.add_argument(
        "--results", type=Path, required=True,
        help="Path to aggregated results JSON file",
    )
    parser.add_argument(
        "--output", type=Path,
        help="Output file for variance report",
    )
    parser.add_argument(
        "--cv-threshold", type=float, default=20.0,
        help="Coefficient of variation threshold (%) for flaky detection (default: 20.0)",
    )
    args = parser.parse_args()

    # Bail out early with a message when the input file is missing.
    if not args.results.exists():
        print(f"Error: Results file not found: {args.results}")
        return

    report = generate_variance_report(
        aggregated_file=args.results,
        output_file=args.output,
        cv_threshold=args.cv_threshold,
    )

    # Render a human-readable summary to stdout.
    divider = "=" * 70
    print("\n" + divider)
    print("Variance Analysis Report")
    print(divider)

    summary = report.get("summary", {})
    print(f"Total configurations: {summary.get('total_configurations', 0)}")
    print(f"Flaky configurations: {summary.get('flaky_configurations', 0)} ({summary.get('flaky_percentage', 0):.1f}%)")
    print(f"Average CV: {summary.get('average_cv', 0):.2f}%")
    print(f"Max CV: {summary.get('max_cv', 0):.2f}%")

    flaky = report.get("flaky_configurations", [])
    if not flaky:
        print("\n✅ No flaky configurations detected!")
    else:
        print(f"\n⚠️ Flaky Configurations ({len(flaky)}):")
        for config in flaky[:10]:  # cap the listing at the first 10 entries
            print(f" - {config.get('corpus')} (size={config.get('size')}, ef={config.get('ef_search')}, M={config.get('M')}):")
            for metric in config.get("flaky_metrics", []):
                print(f" • {metric['metric']}: CV={metric['cv']:.1f}% (mean={metric['mean']:.2f}±{metric['std']:.2f})")
        if len(flaky) > 10:
            print(f" ... and {len(flaky) - 10} more")

    print(divider)
|
||||
|
||||
|
||||
# Standard script entry guard: run the CLI only when executed directly.
if __name__ == "__main__":
    main()
|
||||
|
||||
Reference in New Issue
Block a user