"""Analyze variance in benchmark results and identify flaky benchmarks."""

import argparse
import json
from pathlib import Path
from typing import Any

import numpy as np

# SciPy is optional here; HAS_SCIPY simply records whether it could be imported.
try:
    from scipy import stats
    HAS_SCIPY = True
except ImportError:
    HAS_SCIPY = False


def load_benchmark_results(results_file: Path) -> list[dict]:
    """Load benchmark results from JSON file."""
    with open(results_file) as f:
        return json.load(f)


def identify_flaky_configurations(
    results: list[dict],
    cv_threshold: float = 20.0,
    metrics: list[str] | None = None,
) -> list[dict[str, Any]]:
    """
    Identify flaky benchmark configurations based on coefficient of variation.

    Args:
        results: List of aggregated result dictionaries
        cv_threshold: CV threshold (%) above which a benchmark is considered flaky
        metrics: List of metrics to check (default: critical metrics)

    Returns:
        List of flaky configuration summaries
    """
    if metrics is None:
        metrics = ["search_p50_ms", "search_p95_ms", "qps"]

    flaky_configs = []

    for result in results:
        flaky_metrics = []
        for metric in metrics:
            # The aggregator is expected to provide "<metric>_cv" as a percentage
            # (100 * std / mean); any value above the threshold marks the metric as flaky.
            cv_key = f"{metric}_cv"
            if cv_key in result:
                cv = result[cv_key]
                if cv > cv_threshold:
                    mean_val = result.get(f"{metric}_mean", 0)
                    std_val = result.get(f"{metric}_std", 0)
                    flaky_metrics.append({
                        "metric": metric,
                        "mean": mean_val,
                        "std": std_val,
                        "cv": cv,
                    })

        if flaky_metrics:
            flaky_configs.append({
                "corpus": result.get("corpus"),
                "size": result.get("size"),
                "ef_search": result.get("ef_search"),
                "M": result.get("M"),
                "repetitions": result.get("repetitions"),
                "flaky_metrics": flaky_metrics,
            })

    return flaky_configs


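# For reference, a minimal sketch of the aggregated record shape read above
# (field values are illustrative, not real measurements):
#
#   {"corpus": "wiki", "size": 100000, "ef_search": 64, "M": 16, "repetitions": 5,
#    "search_p50_ms_mean": 4.2, "search_p50_ms_std": 1.3, "search_p50_ms_cv": 31.0, ...}
#
# With the default 20% threshold, that record would be flagged as flaky on
# search_p50_ms because its CV (31.0%) exceeds the threshold.

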
def generate_variance_report(
    aggregated_file: Path,
    output_file: Path | None = None,
    cv_threshold: float = 20.0,
) -> dict[str, Any]:
    """
    Generate a variance analysis report.

    Args:
        aggregated_file: Path to aggregated results JSON
        output_file: Optional output file for report
        cv_threshold: CV threshold for flaky detection

    Returns:
        Report dictionary
    """
    results = load_benchmark_results(aggregated_file)

    if not results:
        return {"error": "No results found"}

    # Calculate overall statistics
    all_cvs = []
    for result in results:
        for key in result.keys():
            if key.endswith("_cv") and isinstance(result[key], (int, float)):
                all_cvs.append(result[key])

    # Identify flaky configurations
    flaky_configs = identify_flaky_configurations(results, cv_threshold)

    # Group by corpus
    by_corpus = {}
    for result in results:
        corpus = result.get("corpus", "unknown")
        if corpus not in by_corpus:
            by_corpus[corpus] = []
        by_corpus[corpus].append(result)

    report = {
        "summary": {
            "total_configurations": len(results),
            "flaky_configurations": len(flaky_configs),
            "flaky_percentage": (len(flaky_configs) / len(results) * 100) if results else 0,
            "average_cv": float(np.mean(all_cvs)) if all_cvs else 0.0,
            "max_cv": float(np.max(all_cvs)) if all_cvs else 0.0,
        },
        "flaky_configurations": flaky_configs,
        "by_corpus": {
            corpus: {
                "count": len(configs),
                # identify_flaky_configurations only returns configurations with at
                # least one metric above the CV threshold, so its length is the
                # per-corpus flaky count.
                "flaky_count": len(identify_flaky_configurations(configs, cv_threshold)),
            }
            for corpus, configs in by_corpus.items()
        },
    }

    if output_file:
        with open(output_file, "w") as f:
            json.dump(report, f, indent=2)
        print(f"Variance report saved to {output_file}")

    return report


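# generate_variance_report can also be used programmatically, e.g. (path illustrative):
#
#   report = generate_variance_report(Path("results/aggregated.json"), cv_threshold=15.0)
#   print(report["summary"]["flaky_percentage"])

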
def main():
    parser = argparse.ArgumentParser(description="Analyze variance in benchmark results")
    parser.add_argument(
        "--results",
        type=Path,
        required=True,
        help="Path to aggregated results JSON file"
    )
    parser.add_argument(
        "--output",
        type=Path,
        help="Output file for variance report"
    )
    parser.add_argument(
        "--cv-threshold",
        type=float,
        default=20.0,
        # "%%" is required because argparse applies %-formatting to help strings.
        help="Coefficient of variation threshold (%%) for flaky detection (default: 20.0)"
    )

    args = parser.parse_args()

    if not args.results.exists():
        print(f"Error: Results file not found: {args.results}")
        return

    report = generate_variance_report(
        aggregated_file=args.results,
        output_file=args.output,
        cv_threshold=args.cv_threshold,
    )

    # Print summary
    print("\n" + "=" * 70)
    print("Variance Analysis Report")
    print("=" * 70)
    summary = report.get("summary", {})
    print(f"Total configurations: {summary.get('total_configurations', 0)}")
    print(f"Flaky configurations: {summary.get('flaky_configurations', 0)} ({summary.get('flaky_percentage', 0):.1f}%)")
    print(f"Average CV: {summary.get('average_cv', 0):.2f}%")
    print(f"Max CV: {summary.get('max_cv', 0):.2f}%")

    flaky = report.get("flaky_configurations", [])
    if flaky:
        print(f"\n⚠️ Flaky Configurations ({len(flaky)}):")
        for config in flaky[:10]:  # Show first 10
            print(f"  - {config.get('corpus')} (size={config.get('size')}, ef={config.get('ef_search')}, M={config.get('M')}):")
            for metric in config.get("flaky_metrics", []):
                print(f"    • {metric['metric']}: CV={metric['cv']:.1f}% (mean={metric['mean']:.2f}±{metric['std']:.2f})")
        if len(flaky) > 10:
            print(f"  ... and {len(flaky) - 10} more")
    else:
        print("\n✅ No flaky configurations detected!")

    print("=" * 70)


if __name__ == "__main__":
    main()