Initial commit: LLM-DS optimizer framework with data files excluded

Carlos Gutierrez
2025-11-06 22:20:11 -05:00
commit f83fe475df
52 changed files with 10666 additions and 0 deletions

@@ -0,0 +1,281 @@
"""Run benchmarks across multiple datasets for comparison."""
import argparse
import json
import subprocess
import sys
from pathlib import Path
from datetime import datetime
import numpy as np
sys.path.insert(0, str(Path(__file__).parent.parent))


def prepare_dataset(
    source: str,
    corpus_name: str,
    output_dir: Path,
    limit: int | None = None,
    download: bool = True,
) -> Path | None:
    """Prepare a dataset: download it and generate embeddings so it is ready for benchmarking."""
    corpus_dir = output_dir / "raw" / corpus_name
    embeddings_dir = output_dir / "embeddings"
    corpus_file = None

    # Find an existing corpus file (the downloaders use several possible names).
    possible_files = ["corpus.jsonl", "reviews.jsonl", "business_reviews.jsonl", "pages.jsonl"]
    for filename in possible_files:
        if (corpus_dir / filename).exists():
            corpus_file = corpus_dir / filename
            break

    # Also check the beir subdirectory for fiqa.
    if corpus_file is None and corpus_name == "fiqa":
        beir_dir = output_dir / "raw" / "beir" / corpus_name
        if (beir_dir / "corpus.jsonl").exists():
            corpus_file = beir_dir / "corpus.jsonl"

    # Download if requested and no corpus file exists yet.
    if download and corpus_file is None:
        print(f"\n📥 Downloading {corpus_name}...")
        try:
            # The same downloader command handles both beir: and plain sources.
            cmd = [
                sys.executable,
                "scripts/download_corpus.py",
                "--source", source,
                "--output", str(corpus_dir),
            ]
            if limit:
                cmd.extend(["--limit", str(limit)])
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode != 0:
                print(f"⚠️ Download failed: {result.stderr}")
                return None
            # Find the corpus file after download.
            if (corpus_dir / "corpus.jsonl").exists():
                corpus_file = corpus_dir / "corpus.jsonl"
            elif corpus_name == "amazon23" and (corpus_dir / "reviews.jsonl").exists():
                corpus_file = corpus_dir / "reviews.jsonl"
        except Exception as e:
            print(f"⚠️ Error downloading {corpus_name}: {e}")
            return None

    if corpus_file is None or not corpus_file.exists():
        print(f"⚠️ Corpus file not found for {corpus_name}")
        return None

    # Generate embeddings if they do not already exist.
    emb_file = embeddings_dir / f"{corpus_name}.npy"
    if not emb_file.exists():
        print(f"\n🔢 Preparing embeddings for {corpus_name}...")
        embeddings_dir.mkdir(parents=True, exist_ok=True)
        cmd = [
            sys.executable,
            "scripts/prepare_embeddings.py",
            "--input", str(corpus_file),
            "--output", str(emb_file),
            "--dim", "384",
            "--seed", "42",
        ]
        if limit:
            cmd.extend(["--limit", str(limit)])
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            print(f"⚠️ Embedding preparation failed: {result.stderr}")
            return None

    return corpus_file
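

# scripts/prepare_embeddings.py is not part of this file, so the helper below is
# only an illustrative sketch of the contract prepare_dataset() relies on: read a
# JSONL corpus (one document per line) and emit a seeded (num_docs, dim) float32
# matrix as .npy, so runs with --seed 42 are reproducible. The helper name and the
# random unit vectors are assumptions, not the real script; a real encoder would
# replace the placeholder vectors.
def _sketch_placeholder_embeddings(corpus_path: Path, dim: int = 384, seed: int = 42) -> np.ndarray:
    rng = np.random.default_rng(seed)
    with open(corpus_path) as f:
        num_docs = sum(1 for _ in f)  # one JSON document per line
    emb = rng.standard_normal((num_docs, dim)).astype(np.float32)
    return emb / np.linalg.norm(emb, axis=1, keepdims=True)  # unit-normalize rows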


def run_benchmarks_for_dataset(
    corpus_name: str,
    corpus_file: Path,
    emb_file: Path,
    sizes: list[str],
    ef_values: list[int],
    M_values: list[int],
    num_queries: int = 50,  # reduced default for faster multi-dataset runs
    output_dir: Path = Path("benchmarks/results"),
) -> Path | None:
    """Run benchmarks for a single dataset and return the path to its results.json."""
    print(f"\n🚀 Running benchmarks for {corpus_name}...")
    cmd = [
        sys.executable,
        "scripts/run_benchmarks.py",
        "--corpus", corpus_name,
        "--corpus-file", str(corpus_file),
        "--emb-file", str(emb_file),
        "--sizes", *sizes,
        "--ef", *[str(e) for e in ef_values],
        "--M", *[str(m) for m in M_values],
        "--num-queries", str(num_queries),
        "--output-dir", str(output_dir),
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        print(f"⚠️ Benchmark failed for {corpus_name}: {result.stderr}")
        return None

    # Pick the newest timestamped results directory; names like 20251106_222011
    # sort lexically in chronological order.
    results_dir = output_dir / corpus_name
    if results_dir.exists():
        timestamp_dirs = sorted([d for d in results_dir.iterdir() if d.is_dir()], key=lambda x: x.name)
        if timestamp_dirs:
            return timestamp_dirs[-1] / "results.json"
    return None
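

# The lexical sort above assumes zero-padded timestamp directory names. If the
# naming scheme ever changes, selecting by modification time is a safer variant;
# this sketch is illustrative and is not called by main():
def _sketch_latest_results(results_dir: Path) -> Path | None:
    run_dirs = [d for d in results_dir.iterdir() if d.is_dir()]
    if not run_dirs:
        return None
    newest = max(run_dirs, key=lambda d: d.stat().st_mtime)  # newest by mtime
    return newest / "results.json"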


def main():
    parser = argparse.ArgumentParser(description="Run benchmarks across multiple datasets")
    parser.add_argument(
        "--datasets",
        nargs="+",
        default=["fiqa", "amazon23", "msmarco"],
        help="Datasets to benchmark",
    )
    parser.add_argument(
        "--sizes",
        nargs="+",
        default=["10k", "25k", "50k"],
        help="Corpus sizes (e.g., 10k 25k 50k)",
    )
    parser.add_argument(
        "--ef",
        nargs="+",
        type=int,
        default=[50, 100],
        help="HNSW efSearch values",
    )
    parser.add_argument(
        "--M",
        nargs="+",
        type=int,
        default=[8, 16],
        help="HNSW M values",
    )
    parser.add_argument(
        "--num-queries",
        type=int,
        default=50,
        help="Number of queries per benchmark",
    )
    parser.add_argument(
        "--skip-download",
        action="store_true",
        help="Skip downloading datasets (use existing)",
    )
    parser.add_argument(
        "--limit",
        type=int,
        help="Limit documents per dataset (for large datasets)",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("benchmarks/results"),
        help="Output directory",
    )
    args = parser.parse_args()

    # Map dataset names to the --source values understood by download_corpus.py.
    dataset_sources = {
        "fiqa": "beir:fiqa",
        "amazon23": "amazon23",
        "msmarco": "msmarco",
    }
    data_dir = Path("data")
    embeddings_dir = data_dir / "embeddings"
    embeddings_dir.mkdir(parents=True, exist_ok=True)
    results = {}

    print("=" * 70)
    print("Multi-Dataset Benchmark Runner")
    print("=" * 70)
    print(f"Datasets: {', '.join(args.datasets)}")
    print(f"Sizes: {', '.join(args.sizes)}")
    print(f"efSearch: {', '.join(map(str, args.ef))}")
    print(f"M: {', '.join(map(str, args.M))}")
    print("=" * 70)

    for corpus_name in args.datasets:
        if corpus_name not in dataset_sources:
            print(f"⚠️ Unknown dataset: {corpus_name}, skipping")
            continue
        source = dataset_sources[corpus_name]
        # Only the large corpora need a document cap.
        limit = args.limit if corpus_name in ["amazon23", "msmarco"] else None

        # Prepare dataset
        corpus_file = prepare_dataset(
            source=source,
            corpus_name=corpus_name,
            output_dir=data_dir,
            limit=limit,
            download=not args.skip_download,
        )
        if corpus_file is None:
            print(f"⚠️ Skipping {corpus_name} - preparation failed")
            continue

        # Check embeddings
        emb_file = embeddings_dir / f"{corpus_name}.npy"
        if not emb_file.exists():
            print(f"⚠️ Embeddings not found for {corpus_name}, skipping")
            continue

        # Run benchmarks
        results_file = run_benchmarks_for_dataset(
            corpus_name=corpus_name,
            corpus_file=corpus_file,
            emb_file=emb_file,
            sizes=args.sizes,
            ef_values=args.ef,
            M_values=args.M,
            num_queries=args.num_queries,
            output_dir=args.output_dir,
        )
        if results_file and results_file.exists():
            with open(results_file) as f:
                results[corpus_name] = json.load(f)
            print(f"✓ {corpus_name} benchmarks completed")
        else:
            print(f"⚠️ {corpus_name} benchmarks incomplete")

    # Save combined results
    if results:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        combined_file = args.output_dir / f"multi_dataset_{timestamp}.json"
        combined_file.parent.mkdir(parents=True, exist_ok=True)
        with open(combined_file, "w") as f:
            json.dump(results, f, indent=2)
        print(f"\n✓ Combined results saved to {combined_file}")

    print("\n" + "=" * 70)
    print("Multi-dataset benchmarks completed!")
    print("=" * 70)


if __name__ == "__main__":
    main()
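

# Example invocation (this file's path is not shown in the diff, so the script
# name below is an assumption for illustration):
#   python scripts/run_multi_dataset_benchmarks.py \
#       --datasets fiqa msmarco --sizes 10k 25k \
#       --ef 50 100 --M 8 16 --num-queries 50 --limit 100000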