Initial commit: LLM-DS optimizer framework with data files excluded
This commit is contained in:
281
scripts/run_multi_dataset_benchmarks.py
Normal file
281
scripts/run_multi_dataset_benchmarks.py
Normal file
@@ -0,0 +1,281 @@
|
||||
"""Run benchmarks across multiple datasets for comparison."""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
|
||||
def prepare_dataset(
|
||||
source: str,
|
||||
corpus_name: str,
|
||||
output_dir: Path,
|
||||
limit: int | None = None,
|
||||
download: bool = True,
|
||||
) -> Path | None:
|
||||
"""Prepare a dataset: download, prepare embeddings, ready for benchmarking."""
|
||||
corpus_dir = output_dir / "raw" / corpus_name
|
||||
embeddings_dir = output_dir / "embeddings"
|
||||
corpus_file = None
|
||||
|
||||
# Find existing corpus file (check multiple possible names)
|
||||
possible_files = ["corpus.jsonl", "reviews.jsonl", "business_reviews.jsonl", "pages.jsonl"]
|
||||
for filename in possible_files:
|
||||
if (corpus_dir / filename).exists():
|
||||
corpus_file = corpus_dir / filename
|
||||
break
|
||||
|
||||
# Also check beir subdirectory for fiqa
|
||||
if corpus_file is None and corpus_name == "fiqa":
|
||||
beir_dir = output_dir / "raw" / "beir" / corpus_name
|
||||
if (beir_dir / "corpus.jsonl").exists():
|
||||
corpus_file = beir_dir / "corpus.jsonl"
|
||||
|
||||
# Download if needed and not exists
|
||||
if download and corpus_file is None:
|
||||
print(f"\n📥 Downloading {corpus_name}...")
|
||||
try:
|
||||
if source.startswith("beir:"):
|
||||
cmd = [
|
||||
sys.executable,
|
||||
"scripts/download_corpus.py",
|
||||
"--source", source,
|
||||
"--output", str(corpus_dir),
|
||||
]
|
||||
else:
|
||||
cmd = [
|
||||
sys.executable,
|
||||
"scripts/download_corpus.py",
|
||||
"--source", source,
|
||||
"--output", str(corpus_dir),
|
||||
]
|
||||
if limit:
|
||||
cmd.extend(["--limit", str(limit)])
|
||||
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
if result.returncode != 0:
|
||||
print(f"⚠️ Download failed: {result.stderr}")
|
||||
return None
|
||||
|
||||
# Find corpus file after download
|
||||
if (corpus_dir / "corpus.jsonl").exists():
|
||||
corpus_file = corpus_dir / "corpus.jsonl"
|
||||
elif corpus_name == "amazon23" and (corpus_dir / "reviews.jsonl").exists():
|
||||
corpus_file = corpus_dir / "reviews.jsonl"
|
||||
except Exception as e:
|
||||
print(f"⚠️ Error downloading {corpus_name}: {e}")
|
||||
return None
|
||||
|
||||
if corpus_file is None or not corpus_file.exists():
|
||||
print(f"⚠️ Corpus file not found for {corpus_name}")
|
||||
return None
|
||||
|
||||
# Check embeddings
|
||||
emb_file = embeddings_dir / f"{corpus_name}.npy"
|
||||
if not emb_file.exists():
|
||||
print(f"\n🔢 Preparing embeddings for {corpus_name}...")
|
||||
embeddings_dir.mkdir(parents=True, exist_ok=True)
|
||||
cmd = [
|
||||
sys.executable,
|
||||
"scripts/prepare_embeddings.py",
|
||||
"--input", str(corpus_file),
|
||||
"--output", str(emb_file),
|
||||
"--dim", "384",
|
||||
"--seed", "42",
|
||||
]
|
||||
if limit:
|
||||
cmd.extend(["--limit", str(limit)])
|
||||
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
if result.returncode != 0:
|
||||
print(f"⚠️ Embedding preparation failed: {result.stderr}")
|
||||
return None
|
||||
|
||||
return corpus_file
|
||||
|
||||
|
||||
def run_benchmarks_for_dataset(
|
||||
corpus_name: str,
|
||||
corpus_file: Path,
|
||||
emb_file: Path,
|
||||
sizes: list[str],
|
||||
ef_values: list[int],
|
||||
M_values: list[int],
|
||||
num_queries: int = 50, # Reduced for faster multi-dataset runs
|
||||
output_dir: Path = Path("benchmarks/results"),
|
||||
) -> Path | None:
|
||||
"""Run benchmarks for a single dataset."""
|
||||
print(f"\n🚀 Running benchmarks for {corpus_name}...")
|
||||
|
||||
cmd = [
|
||||
sys.executable,
|
||||
"scripts/run_benchmarks.py",
|
||||
"--corpus", corpus_name,
|
||||
"--corpus-file", str(corpus_file),
|
||||
"--emb-file", str(emb_file),
|
||||
"--sizes", *sizes,
|
||||
"--ef", *[str(e) for e in ef_values],
|
||||
"--M", *[str(m) for m in M_values],
|
||||
"--num-queries", str(num_queries),
|
||||
"--output-dir", str(output_dir),
|
||||
]
|
||||
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
if result.returncode != 0:
|
||||
print(f"⚠️ Benchmark failed for {corpus_name}: {result.stderr}")
|
||||
return None
|
||||
|
||||
# Find the results directory
|
||||
results_dir = output_dir / corpus_name
|
||||
if results_dir.exists():
|
||||
timestamp_dirs = sorted([d for d in results_dir.iterdir() if d.is_dir()], key=lambda x: x.name)
|
||||
if timestamp_dirs:
|
||||
return timestamp_dirs[-1] / "results.json"
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def main():
    """CLI entry point: prepare each requested dataset, benchmark it, and
    write a combined JSON of all per-dataset results."""
    parser = argparse.ArgumentParser(description="Run benchmarks across multiple datasets")
    parser.add_argument("--datasets", nargs="+", default=["fiqa", "amazon23", "msmarco"], help="Datasets to benchmark")
    parser.add_argument("--sizes", nargs="+", default=["10k", "25k", "50k"], help="Corpus sizes (e.g., 10k 25k 50k)")
    parser.add_argument("--ef", nargs="+", type=int, default=[50, 100], help="HNSW efSearch values")
    parser.add_argument("--M", nargs="+", type=int, default=[8, 16], help="HNSW M values")
    parser.add_argument("--num-queries", type=int, default=50, help="Number of queries per benchmark")
    parser.add_argument("--skip-download", action="store_true", help="Skip downloading datasets (use existing)")
    parser.add_argument("--limit", type=int, help="Limit documents per dataset (for large datasets)")
    parser.add_argument("--output-dir", type=Path, default=Path("benchmarks/results"), help="Output directory")
    args = parser.parse_args()

    # Dataset sources mapping: dataset name -> download source spec.
    dataset_sources = {
        "fiqa": "beir:fiqa",
        "amazon23": "amazon23",
        "msmarco": "msmarco",
    }

    data_dir = Path("data")
    embeddings_dir = data_dir / "embeddings"
    embeddings_dir.mkdir(parents=True, exist_ok=True)

    results = {}

    banner = "=" * 70
    print(banner)
    print("Multi-Dataset Benchmark Runner")
    print(banner)
    print(f"Datasets: {', '.join(args.datasets)}")
    print(f"Sizes: {', '.join(args.sizes)}")
    print(f"efSearch: {', '.join(map(str, args.ef))}")
    print(f"M: {', '.join(map(str, args.M))}")
    print(banner)

    for corpus_name in args.datasets:
        source = dataset_sources.get(corpus_name)
        if source is None:
            print(f"⚠️ Unknown dataset: {corpus_name}, skipping")
            continue

        # Only the large datasets honor --limit; fiqa is always downloaded whole.
        limit = args.limit if corpus_name in ("amazon23", "msmarco") else None

        # Prepare dataset (download + embeddings as needed).
        corpus_file = prepare_dataset(
            source=source,
            corpus_name=corpus_name,
            output_dir=data_dir,
            limit=limit,
            download=not args.skip_download,
        )
        if corpus_file is None:
            print(f"⚠️ Skipping {corpus_name} - preparation failed")
            continue

        # Embeddings must exist before benchmarking can start.
        emb_file = embeddings_dir / f"{corpus_name}.npy"
        if not emb_file.exists():
            print(f"⚠️ Embeddings not found for {corpus_name}, skipping")
            continue

        # Run benchmarks for this dataset.
        results_file = run_benchmarks_for_dataset(
            corpus_name=corpus_name,
            corpus_file=corpus_file,
            emb_file=emb_file,
            sizes=args.sizes,
            ef_values=args.ef,
            M_values=args.M,
            num_queries=args.num_queries,
            output_dir=args.output_dir,
        )

        if results_file and results_file.exists():
            with open(results_file) as f:
                results[corpus_name] = json.load(f)
            print(f"✓ {corpus_name} benchmarks completed")
        else:
            print(f"⚠️ {corpus_name} benchmarks incomplete")

    # Save combined results for all datasets that completed.
    if results:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        combined_file = args.output_dir / f"multi_dataset_{timestamp}.json"
        combined_file.parent.mkdir(parents=True, exist_ok=True)
        with open(combined_file, "w") as f:
            json.dump(results, f, indent=2)
        print(f"\n✓ Combined results saved to {combined_file}")

    print("\n" + banner)
    print("Multi-dataset benchmarks completed!")
    print(banner)
|
||||
# Script entry point; guarded so the module can be imported without running.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user