Initial commit: LLM-DS optimizer framework with data files excluded
This commit is contained in:
2
scripts/__init__.py
Normal file
2
scripts/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
# Empty file to make scripts a package
|
||||
|
||||
196
scripts/analyze_variance.py
Normal file
196
scripts/analyze_variance.py
Normal file
@@ -0,0 +1,196 @@
|
||||
"""Analyze variance in benchmark results and identify flaky benchmarks."""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
try:
|
||||
from scipy import stats
|
||||
HAS_SCIPY = True
|
||||
except ImportError:
|
||||
HAS_SCIPY = False
|
||||
|
||||
|
||||
def load_benchmark_results(results_file: Path) -> list[dict]:
    """Read and deserialize benchmark results from a JSON file.

    Args:
        results_file: Path to the aggregated-results JSON file.

    Returns:
        The parsed payload (a list of result dictionaries).
    """
    return json.loads(results_file.read_text())
|
||||
|
||||
|
||||
def identify_flaky_configurations(
|
||||
results: list[dict],
|
||||
cv_threshold: float = 20.0,
|
||||
metrics: list[str] | None = None,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""
|
||||
Identify flaky benchmark configurations based on coefficient of variation.
|
||||
|
||||
Args:
|
||||
results: List of aggregated result dictionaries
|
||||
cv_threshold: CV threshold (%) above which a benchmark is considered flaky
|
||||
metrics: List of metrics to check (default: critical metrics)
|
||||
|
||||
Returns:
|
||||
List of flaky configuration summaries
|
||||
"""
|
||||
if metrics is None:
|
||||
metrics = ["search_p50_ms", "search_p95_ms", "qps"]
|
||||
|
||||
flaky_configs = []
|
||||
|
||||
for result in results:
|
||||
flaky_metrics = []
|
||||
for metric in metrics:
|
||||
cv_key = f"{metric}_cv"
|
||||
if cv_key in result:
|
||||
cv = result[cv_key]
|
||||
if cv > cv_threshold:
|
||||
mean_val = result.get(f"{metric}_mean", 0)
|
||||
std_val = result.get(f"{metric}_std", 0)
|
||||
flaky_metrics.append({
|
||||
"metric": metric,
|
||||
"mean": mean_val,
|
||||
"std": std_val,
|
||||
"cv": cv,
|
||||
})
|
||||
|
||||
if flaky_metrics:
|
||||
flaky_configs.append({
|
||||
"corpus": result.get("corpus"),
|
||||
"size": result.get("size"),
|
||||
"ef_search": result.get("ef_search"),
|
||||
"M": result.get("M"),
|
||||
"repetitions": result.get("repetitions"),
|
||||
"flaky_metrics": flaky_metrics,
|
||||
})
|
||||
|
||||
return flaky_configs
|
||||
|
||||
|
||||
def generate_variance_report(
    aggregated_file: Path,
    output_file: Path | None = None,
    cv_threshold: float = 20.0,
) -> dict[str, Any]:
    """
    Generate a variance analysis report.

    Args:
        aggregated_file: Path to aggregated results JSON
        output_file: Optional output file for report
        cv_threshold: CV threshold for flaky detection

    Returns:
        Report dictionary, or {"error": ...} when the file holds no results.
    """
    results = load_benchmark_results(aggregated_file)

    if not results:
        return {"error": "No results found"}

    # Collect every "*_cv" value across all configurations for overall stats.
    all_cvs = [
        value
        for result in results
        for key, value in result.items()
        if key.endswith("_cv") and isinstance(value, (int, float))
    ]

    # Identify flaky configurations
    flaky_configs = identify_flaky_configurations(results, cv_threshold)

    # Group results by corpus for the per-corpus breakdown.
    by_corpus: dict[str, list[dict]] = {}
    for result in results:
        by_corpus.setdefault(result.get("corpus", "unknown"), []).append(result)

    report = {
        "summary": {
            "total_configurations": len(results),
            "flaky_configurations": len(flaky_configs),
            "flaky_percentage": (len(flaky_configs) / len(results) * 100) if results else 0,
            "average_cv": float(np.mean(all_cvs)) if all_cvs else 0.0,
            "max_cv": float(np.max(all_cvs)) if all_cvs else 0.0,
        },
        "flaky_configurations": flaky_configs,
        "by_corpus": {
            corpus: {
                "count": len(configs),
                # BUG FIX: the previous code indexed [0] into the list returned
                # by identify_flaky_configurations([c], ...), which raises
                # IndexError for any non-flaky config (the helper returns []).
                # A config counts as flaky iff the helper returns a non-empty
                # list for it.
                "flaky_count": sum(
                    1 for c in configs if identify_flaky_configurations([c], cv_threshold)
                ),
            }
            for corpus, configs in by_corpus.items()
        },
    }

    if output_file:
        with open(output_file, "w") as f:
            json.dump(report, f, indent=2)
        print(f"Variance report saved to {output_file}")

    return report
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, build the report, print a summary."""
    parser = argparse.ArgumentParser(description="Analyze variance in benchmark results")
    parser.add_argument(
        "--results",
        type=Path,
        required=True,
        help="Path to aggregated results JSON file"
    )
    parser.add_argument(
        "--output",
        type=Path,
        help="Output file for variance report"
    )
    parser.add_argument(
        "--cv-threshold",
        type=float,
        default=20.0,
        # BUG FIX: argparse %-formats help strings, so a bare "%" (as in the
        # previous "(%)") makes --help crash with ValueError. It must be
        # escaped as "%%", which still renders as "(%)".
        help="Coefficient of variation threshold (%%) for flaky detection (default: 20.0)"
    )

    args = parser.parse_args()

    if not args.results.exists():
        print(f"Error: Results file not found: {args.results}")
        return

    report = generate_variance_report(
        aggregated_file=args.results,
        output_file=args.output,
        cv_threshold=args.cv_threshold,
    )

    # Print summary
    rule = "=" * 70
    print("\n" + rule)
    print("Variance Analysis Report")
    print(rule)
    summary = report.get("summary", {})
    print(f"Total configurations: {summary.get('total_configurations', 0)}")
    print(f"Flaky configurations: {summary.get('flaky_configurations', 0)} ({summary.get('flaky_percentage', 0):.1f}%)")
    print(f"Average CV: {summary.get('average_cv', 0):.2f}%")
    print(f"Max CV: {summary.get('max_cv', 0):.2f}%")

    flaky = report.get("flaky_configurations", [])
    if flaky:
        print(f"\n⚠️ Flaky Configurations ({len(flaky)}):")
        for config in flaky[:10]:  # Show first 10
            print(f"  - {config.get('corpus')} (size={config.get('size')}, ef={config.get('ef_search')}, M={config.get('M')}):")
            for metric in config.get("flaky_metrics", []):
                print(f"    • {metric['metric']}: CV={metric['cv']:.1f}% (mean={metric['mean']:.2f}±{metric['std']:.2f})")
        if len(flaky) > 10:
            print(f"  ... and {len(flaky) - 10} more")
    else:
        print("\n✅ No flaky configurations detected!")

    print(rule)


if __name__ == "__main__":
    main()
|
||||
|
||||
166
scripts/build_indices.py
Normal file
166
scripts/build_indices.py
Normal file
@@ -0,0 +1,166 @@
|
||||
"""Build indices (BM25 + HNSW) for a corpus."""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from llmds.hnsw import HNSW
|
||||
from llmds.inverted_index import InvertedIndex
|
||||
from llmds.tokenizer import Tokenizer
|
||||
|
||||
|
||||
def build_indices(
    corpus_file: Path,
    emb_file: Path | None,
    index_dir: Path,
    bm25: bool = True,
    hnsw: bool = True,
    ef_construction: int = 200,
    M: int = 16,
    embedding_dim: int = 384,
) -> dict:
    """
    Build inverted index and/or HNSW for a corpus.

    Args:
        corpus_file: Path to corpus JSONL file
        emb_file: Optional path to embeddings .npy file
        index_dir: Directory to save indices
        bm25: Whether to build BM25 inverted index
        hnsw: Whether to build HNSW index
        ef_construction: HNSW efConstruction parameter
        M: HNSW M parameter
        embedding_dim: Embedding dimension

    Returns:
        Dictionary with build statistics
    """
    index_dir.mkdir(parents=True, exist_ok=True)

    tokenizer = Tokenizer()
    stats = {}

    # Load embeddings if available
    embeddings = None
    if emb_file and emb_file.exists():
        print(f"Loading embeddings from {emb_file}...")
        embeddings = np.load(emb_file)
        print(f"Loaded {len(embeddings)} embeddings")

    # Build BM25 index
    if bm25:
        print("Building BM25 inverted index...")
        start_time = time.time()

        index = InvertedIndex(tokenizer=tokenizer)
        doc_count = 0

        with open(corpus_file, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    doc = json.loads(line)
                    # Use the numeric suffix of the document id when present
                    # (e.g. "doc_42" -> 42); otherwise fall back to the
                    # running document counter.
                    suffix = doc["id"].split("_")[-1]
                    doc_id = int(suffix) if suffix.isdigit() else doc_count
                    index.add_document(doc_id=doc_id, text=doc["text"])
                    doc_count += 1

                    if doc_count % 10000 == 0:
                        print(f"Indexed {doc_count} documents...")

        # Save index metadata
        index_stats = index.stats()
        stats["bm25"] = {
            "build_time_sec": time.time() - start_time,
            "total_documents": index_stats["total_documents"],
            "total_terms": index_stats["total_terms"],
        }

        print(f"✓ BM25 index built: {stats['bm25']['total_documents']} documents, {stats['bm25']['build_time_sec']:.2f}s")

    # Build HNSW index
    if hnsw:
        if embeddings is None:
            print("Warning: No embeddings provided. Generating deterministic embeddings...")
            # Generate unit-norm pseudo-random vectors on the fly; the fixed
            # seed keeps repeated builds reproducible.
            vectors = []
            rng = np.random.RandomState(42)
            with open(corpus_file, "r", encoding="utf-8") as f:
                for line in f:
                    if line.strip():
                        json.loads(line)  # still validate each JSONL record
                        emb = rng.randn(embedding_dim).astype(np.float32)
                        vectors.append(emb / np.linalg.norm(emb))
            embeddings = np.stack(vectors)

        print(f"Building HNSW index (M={M}, efConstruction={ef_construction})...")
        start_time = time.time()

        # BUG FIX: the previous version rebound the boolean `hnsw` parameter
        # to the index instance, shadowing the flag; use a distinct name.
        hnsw_index = HNSW(
            dim=embedding_dim,
            M=M,
            ef_construction=ef_construction,
            ef_search=50,
            seed=42,  # Fixed seed for reproducible HNSW structure
        )

        for i, emb in enumerate(embeddings):
            hnsw_index.add(emb, i)
            if (i + 1) % 10000 == 0:
                print(f"Added {i + 1} vectors...")

        hnsw_stats = hnsw_index.stats()
        stats["hnsw"] = {
            "build_time_sec": time.time() - start_time,
            "num_vectors": hnsw_stats["num_vectors"],
            "num_layers": hnsw_stats["num_layers"],
        }

        print(f"✓ HNSW index built: {stats['hnsw']['num_vectors']} vectors, {stats['hnsw']['build_time_sec']:.2f}s")

    # Save statistics
    stats_file = index_dir / "build_stats.json"
    with open(stats_file, "w") as f:
        json.dump(stats, f, indent=2)

    print(f"✓ Indices built and saved to {index_dir}")
    return stats
|
||||
|
||||
|
||||
def main():
    """Parse CLI arguments and build the requested indices."""
    parser = argparse.ArgumentParser(description="Build indices for corpus")
    parser.add_argument("--corpus", type=Path, required=True, help="Corpus JSONL file")
    parser.add_argument("--emb", type=Path, help="Embeddings .npy file")
    parser.add_argument("--index-dir", type=Path, required=True, help="Index output directory")
    parser.add_argument("--bm25", action="store_true", help="Build BM25 index")
    parser.add_argument("--hnsw", action="store_true", help="Build HNSW index")
    parser.add_argument("--ef", type=int, default=200, help="HNSW efConstruction")
    parser.add_argument("--M", type=int, default=16, help="HNSW M parameter")
    parser.add_argument("--dim", type=int, default=384, help="Embedding dimension")

    args = parser.parse_args()

    # At least one index type must be requested.
    if not (args.bm25 or args.hnsw):
        print("Error: Must specify --bm25 and/or --hnsw")
        sys.exit(1)

    build_indices(
        corpus_file=args.corpus,
        emb_file=args.emb,
        index_dir=args.index_dir,
        bm25=args.bm25,
        hnsw=args.hnsw,
        ef_construction=args.ef,
        M=args.M,
        embedding_dim=args.dim,
    )


if __name__ == "__main__":
    main()
|
||||
73
scripts/download_corpus.py
Normal file
73
scripts/download_corpus.py
Normal file
@@ -0,0 +1,73 @@
|
||||
"""Download and prepare datasets."""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent directory to path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from llmds.data_sources.msmarco import download_msmarco
|
||||
from llmds.data_sources.beir_loader import download_beir
|
||||
from llmds.data_sources.amazon_reviews import download_amazon_reviews
|
||||
from llmds.data_sources.yelp import download_yelp
|
||||
from llmds.data_sources.wikipedia import download_wikipedia
|
||||
from llmds.data_sources.commoncrawl import download_commoncrawl
|
||||
|
||||
|
||||
def main():
    """Parse CLI arguments and dispatch to the matching dataset downloader."""
    parser = argparse.ArgumentParser(description="Download datasets")
    parser.add_argument(
        "--source",
        required=True,
        help="Dataset source: msmarco, beir:task (e.g., beir:fiqa), amazon23, yelp, wikipedia, commoncrawl"
    )
    parser.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output directory for corpus"
    )
    parser.add_argument(
        "--limit",
        type=int,
        help="Limit number of documents"
    )
    parser.add_argument(
        "--cc-month",
        type=str,
        help="Common Crawl month (e.g., 'CC-MAIN-2025-14')"
    )

    args = parser.parse_args()

    # A source may carry an optional task suffix, e.g. "beir:fiqa".
    source_base, _, task = args.source.partition(":")

    # Per-source download actions; thunks defer the call until dispatch.
    dispatch = {
        "msmarco": lambda: download_msmarco(args.output),
        "amazon23": lambda: download_amazon_reviews(args.output, limit=args.limit),
        "yelp": lambda: download_yelp(args.output),
        "wikipedia": lambda: download_wikipedia(args.output),
        "commoncrawl": lambda: download_commoncrawl(args.output, cc_month=args.cc_month, limit=args.limit),
    }

    if source_base == "beir":
        # BEIR is the only source that requires a task suffix.
        if not task:
            print("Error: BEIR requires task name (e.g., 'beir:fiqa', 'beir:scidocs')")
            sys.exit(1)
        download_beir(task, args.output)
    elif source_base in dispatch:
        dispatch[source_base]()
    else:
        print(f"Error: Unknown source '{source_base}'. Use: msmarco, beir:task, amazon23, yelp, wikipedia, commoncrawl")
        sys.exit(1)

    print(f"✓ Dataset downloaded to {args.output}")


if __name__ == "__main__":
    main()
|
||||
|
||||
137
scripts/env_hash.py
Normal file
137
scripts/env_hash.py
Normal file
@@ -0,0 +1,137 @@
|
||||
"""Generate environment hash for reproducibility tracking."""
|
||||
|
||||
import platform
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def get_blas_info():
    """Return a human-readable description of NumPy's BLAS/LAPACK build.

    BUG FIX: ``np.show_config()`` prints to stdout and returns ``None`` by
    default, so the previous ``str(np.show_config())`` always produced the
    literal string "None". Capture the printed output instead.

    Returns:
        Multi-line string with the BLAS configuration, or a placeholder when
        the information is unavailable.
    """
    import contextlib
    import io

    try:
        buffer = io.StringIO()
        with contextlib.redirect_stdout(buffer):
            result = np.show_config()
        captured = buffer.getvalue().strip()
        # Some NumPy versions can return the config object instead of printing.
        return captured if captured else str(result)
    except Exception:
        try:
            # Fallback: the raw numpy build-config module.
            return str(np.__config__)
        except Exception:
            return "BLAS info unavailable"
|
||||
|
||||
|
||||
def get_numpy_config():
    """Return NumPy version and build configuration.

    BUG FIX: ``np.show_config()`` prints to stdout and returns ``None`` by
    default, so ``str(np.show_config())`` yielded "None"; capture stdout.

    Returns:
        Dict with keys ``version`` and ``config``; ``config`` is the captured
        configuration text, or "unavailable" on failure.
    """
    import contextlib
    import io

    try:
        buffer = io.StringIO()
        with contextlib.redirect_stdout(buffer):
            result = np.show_config()
        config_text = buffer.getvalue().strip() or str(result)
        return {
            "version": np.__version__,
            "config": config_text,
        }
    except Exception:
        return {"version": np.__version__, "config": "unavailable"}
|
||||
|
||||
|
||||
def generate_env_hash(output_path: Path = Path("audit/env_hash.txt")):
    """
    Generate environment hash file with system and library information.

    Args:
        output_path: Path to output file (default: audit/env_hash.txt)

    Returns:
        The path the report was written to.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)

    lines = []
    lines.append("=" * 80)
    lines.append("Environment Hash")
    lines.append("=" * 80)
    lines.append("")

    # Python information
    lines.append("Python:")
    lines.append(f"  Version: {sys.version}")
    lines.append(f"  Executable: {sys.executable}")
    lines.append(f"  Platform: {platform.platform()}")
    lines.append("")

    # OS information
    lines.append("Operating System:")
    lines.append(f"  System: {platform.system()}")
    lines.append(f"  Release: {platform.release()}")
    lines.append(f"  Version: {platform.version()}")
    lines.append(f"  Architecture: {platform.machine()}")
    lines.append(f"  Processor: {platform.processor()}")
    lines.append("")

    # CPU information (psutil is optional; degrade gracefully without it)
    try:
        import psutil
        lines.append("CPU:")
        lines.append(f"  Physical cores: {psutil.cpu_count(logical=False)}")
        lines.append(f"  Logical cores: {psutil.cpu_count(logical=True)}")
        lines.append(f"  Frequency: {psutil.cpu_freq()}")
        lines.append("")
    except ImportError:
        lines.append("CPU:")
        lines.append(f"  Count: {platform.processor()}")
        lines.append("")

    # NumPy configuration
    lines.append("NumPy Configuration:")
    np_config = get_numpy_config()
    lines.append(f"  Version: {np_config['version']}")
    lines.append("  Config:")
    for line in np_config.get("config", "").split("\n"):
        if line.strip():
            lines.append(f"    {line}")
    lines.append("")

    # BLAS information
    lines.append("BLAS Information:")
    blas_info = get_blas_info()
    for line in blas_info.split("\n"):
        if line.strip():
            lines.append(f"  {line}")
    lines.append("")

    # Key package versions. Uses stdlib importlib.metadata instead of the
    # deprecated pkg_resources API (removed from recent setuptools releases).
    try:
        from importlib import metadata as importlib_metadata
        lines.append("Key Packages:")
        for pkg_name in ("numpy", "scipy", "hypothesis", "pytest"):
            try:
                lines.append(f"  {pkg_name}: {importlib_metadata.version(pkg_name)}")
            except Exception:
                # Package not installed — skip it, as before.
                pass
        lines.append("")
    except ImportError:
        pass

    lines.append("=" * 80)

    # Write to file
    content = "\n".join(lines)
    with open(output_path, "w") as f:
        f.write(content)

    print(f"Environment hash written to: {output_path}")
    return output_path
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # Thin CLI wrapper around generate_env_hash.
    parser = argparse.ArgumentParser(description="Generate environment hash")
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("audit/env_hash.txt"),
        help="Output file path (default: audit/env_hash.txt)",
    )
    generate_env_hash(parser.parse_args().output)
|
||||
|
||||
235
scripts/generate_architecture_diagram.py
Normal file
235
scripts/generate_architecture_diagram.py
Normal file
@@ -0,0 +1,235 @@
|
||||
"""Generate architecture diagram for the LLM Data Structures Optimizer.
|
||||
|
||||
This script creates a visual architecture diagram showing the relationships
|
||||
between major components in the system.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import matplotlib.patches as mpatches
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
|
||||
def generate_architecture_diagram(output_path: Path = Path("audit/ARCH_DIAGRAM.png")):
    """
    Generate architecture diagram showing system components and relationships.

    The drawing is decomposed into small local helpers (box/component/section/
    arrow) to remove the long runs of near-identical Rectangle/text/arrow
    calls; the emitted patches, texts, and arrows are unchanged.

    Args:
        output_path: Path to save the diagram (default: audit/ARCH_DIAGRAM.png)

    Returns:
        The path the PNG was written to.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)

    fig, ax = plt.subplots(figsize=(16, 12))
    ax.set_xlim(0, 10)
    ax.set_ylim(0, 10)
    ax.axis("off")

    def box(x, y, w, h, facecolor="white", edgecolor="black", linewidth=1):
        """Draw one filled rectangle."""
        ax.add_patch(mpatches.Rectangle((x, y), w, h,
                                        facecolor=facecolor,
                                        edgecolor=edgecolor, linewidth=linewidth))

    def component(x, y, label, fontsize=10, weight=None, linewidth=1, w=1.2, h=0.4):
        """Draw a small white component box with its label centered inside."""
        box(x, y, w, h, linewidth=linewidth)
        extra = {"weight": weight} if weight else {}
        ax.text(x + w / 2, y + h / 2, label, ha="center", va="center",
                fontsize=fontsize, **extra)

    def section(x, y, w, h, color, label):
        """Draw a colored section container with a bold heading near the top."""
        box(x, y, w, h, facecolor=color, linewidth=2)
        ax.text(x + w / 2, y + h - 0.3, label,
                ha="center", va="center", fontsize=14, weight="bold")

    def arrow(x, y, dx, dy, color="black", head_width=0.05, head_length=0.05, **kwargs):
        """Draw an arrow with matching face/edge color."""
        ax.arrow(x, y, dx, dy, head_width=head_width, head_length=head_length,
                 fc=color, ec=color, **kwargs)

    # Define colors
    colors = {
        "kv_cache": "#E8F4F8",
        "scheduler": "#FFF4E6",
        "retrieval": "#F0F8E8",
        "data_structure": "#F5E6F8",
    }

    # Title
    ax.text(5, 9.5, "LLM Data Structures Optimizer Architecture",
            ha="center", va="top", fontsize=20, weight="bold")

    # ===== KV Cache System =====
    kv_y = 7.5
    section(0.2, kv_y, 3.0, 1.5, colors["kv_cache"], "KV Cache System")
    component(0.4, kv_y + 0.7, "KVCache")
    component(1.8, kv_y + 0.7, "PagedAllocator")
    component(0.4, kv_y - 0.2, "TokenLRU")
    # Connections within KV Cache
    arrow(1.6, kv_y + 0.9, 0.2, 0)
    arrow(1.0, kv_y + 0.5, 0, 0.2)

    # ===== Scheduler & Batching =====
    scheduler_y = 5.5
    section(0.2, scheduler_y, 3.0, 1.5, colors["scheduler"], "Scheduler & Batching")
    component(0.4, scheduler_y + 0.7, "Scheduler")
    component(1.8, scheduler_y + 0.7, "IndexedHeap")
    component(1.1, scheduler_y - 0.2, "AdmissionController")
    # Connections within Scheduler
    arrow(1.6, scheduler_y + 0.9, 0.2, 0)
    arrow(1.7, scheduler_y + 0.5, 0, 0.2)

    # ===== Retrieval Pipeline =====
    retrieval_y = 3.5
    section(0.2, retrieval_y, 3.0, 1.5, colors["retrieval"], "Retrieval Pipeline")
    component(1.1, retrieval_y + 0.7, "RetrievalPipeline",
              fontsize=11, weight="bold", linewidth=2)
    component(0.4, retrieval_y - 0.2, "HNSW")
    component(1.8, retrieval_y - 0.2, "InvertedIndex")
    component(0.4, retrieval_y - 0.9, "CountMinSketch")
    component(1.8, retrieval_y - 0.9, "Tokenizer")
    # Fan-out from the pipeline to its four sub-components
    arrow(1.7, retrieval_y + 0.5, -0.3, 0.2)
    arrow(1.7, retrieval_y + 0.5, 0.3, 0.2)
    arrow(1.7, retrieval_y + 0.5, -0.3, -0.5)
    arrow(1.7, retrieval_y + 0.5, 0.3, -0.5)

    # ===== Data Flow Arrows =====
    # KV Cache to Scheduler
    arrow(1.7, scheduler_y + 1.5, 0, 0.3, color="blue",
          head_width=0.1, head_length=0.08, linewidth=2, linestyle="--")
    ax.text(2.2, scheduler_y + 1.8, "uses", ha="left", va="center",
            fontsize=9, color="blue", style="italic")

    # Scheduler to Retrieval
    arrow(1.7, scheduler_y - 0.5, 0, -0.3, color="green",
          head_width=0.1, head_length=0.08, linewidth=2, linestyle="--")
    ax.text(2.2, retrieval_y + 1.5, "schedules", ha="left", va="center",
            fontsize=9, color="green", style="italic")

    # ===== Right Side: Data Structures =====
    ds_x = 6.0
    box(ds_x, 6.5, 3.5, 3.0, facecolor=colors["data_structure"], linewidth=2)
    ax.text(ds_x + 1.75, 9.0, "Core Data Structures",
            ha="center", va="center", fontsize=14, weight="bold")

    # List data structures
    structures = [
        "IndexedHeap: O(log n) priority queue",
        "PagedAllocator: Page-based memory",
        "TokenLRU: Token-aware cache",
        "HNSW: Hierarchical graph ANN",
        "InvertedIndex: BM25 search",
        "CountMinSketch: Frequency estimation",
    ]
    for i, struct in enumerate(structures):
        y_pos = 8.3 - i * 0.45
        ax.text(ds_x + 0.2, y_pos, "•", ha="left", va="center", fontsize=12)
        ax.text(ds_x + 0.4, y_pos, struct, ha="left", va="center", fontsize=9)

    # ===== Legend =====
    legend_y = 1.5
    ax.text(0.2, legend_y + 1.2, "Legend:", ha="left", va="top",
            fontsize=12, weight="bold")

    # Legend items
    legend_items = [
        ("───", "blue", "KV Cache usage"),
        ("───", "green", "Scheduler flow"),
        ("────", "black", "Component relationships"),
    ]
    for i, (style, color, label) in enumerate(legend_items):
        y_pos = legend_y + 0.8 - i * 0.3
        ax.plot([0.4, 0.7], [y_pos, y_pos], color=color, linewidth=2,
                linestyle="--" if "usage" in label or "flow" in label else "-")
        ax.text(0.8, y_pos, label, ha="left", va="center", fontsize=9)

    # ===== Notes =====
    notes_x = 5.0
    notes_y = 2.0
    box(notes_x, notes_y, 4.5, 1.8, facecolor="#F5F5F5", edgecolor="gray")
    ax.text(notes_x + 2.25, notes_y + 1.5, "Key Features",
            ha="center", va="center", fontsize=11, weight="bold")

    key_features = [
        "• Copy-on-write prefix sharing",
        "• Reference counting for memory",
        "• Hybrid dense + sparse retrieval",
        "• Score fusion with configurable weights",
    ]
    for i, feature in enumerate(key_features):
        y_pos = notes_y + 1.1 - i * 0.35
        ax.text(notes_x + 0.2, y_pos, feature, ha="left", va="center", fontsize=8)

    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches="tight")
    print(f"Architecture diagram saved to: {output_path}")
    return output_path
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # Thin CLI wrapper around generate_architecture_diagram.
    parser = argparse.ArgumentParser(description="Generate architecture diagram")
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("audit/ARCH_DIAGRAM.png"),
        help="Output file path (default: audit/ARCH_DIAGRAM.png)",
    )
    generate_architecture_diagram(parser.parse_args().output)
|
||||
|
||||
52
scripts/generate_synthetic_data.py
Normal file
52
scripts/generate_synthetic_data.py
Normal file
@@ -0,0 +1,52 @@
|
||||
"""Generate synthetic data for testing and benchmarks."""
|
||||
|
||||
import random
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def generate_synthetic_documents(num_docs: int = 1000, output_file: Path = Path("data/documents.txt")):
    """Write *num_docs* synthetic tab-separated documents for indexing.

    Each line has the form ``<doc_id>\\t<text>``, where the text is a random
    bag of 20–200 vocabulary words.
    """
    output_file.parent.mkdir(parents=True, exist_ok=True)

    vocabulary = [
        "the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog",
        "cat", "mouse", "elephant", "tiger", "lion", "bear", "wolf",
        "rabbit", "deer", "bird", "fish", "snake", "monkey", "panda",
        "computer", "science", "machine", "learning", "artificial", "intelligence",
        "neural", "network", "deep", "learning", "transformer", "attention",
        "language", "model", "natural", "processing", "text", "generation",
    ]

    with open(output_file, "w") as out:
        for doc_id in range(num_docs):
            token_count = random.randint(20, 200)
            text = " ".join(random.choices(vocabulary, k=token_count))
            out.write(f"{doc_id}\t{text}\n")

    print(f"Generated {num_docs} documents in {output_file}")
|
||||
|
||||
|
||||
def generate_synthetic_embeddings(
    num_vectors: int = 1000,
    dim: int = 384,
    output_file: Path = Path("data/embeddings.npy"),
):
    """Create *num_vectors* unit-norm random float32 vectors and save as .npy."""
    output_file.parent.mkdir(parents=True, exist_ok=True)

    vectors = np.random.randn(num_vectors, dim).astype(np.float32)
    # Scale every row to unit length.
    vectors = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)

    np.save(output_file, vectors)
    print(f"Generated {num_vectors} embeddings in {output_file}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Default demo: 1000 documents plus matching 384-d embeddings.
    generate_synthetic_documents(num_docs=1000)
    generate_synthetic_embeddings(num_vectors=1000, dim=384)
|
||||
|
||||
257
scripts/make_report.py
Normal file
257
scripts/make_report.py
Normal file
@@ -0,0 +1,257 @@
|
||||
"""Generate Word report in APA format."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from docx import Document
|
||||
from docx.shared import Inches, Pt
|
||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||
|
||||
|
||||
def create_report(output_path: Path = Path("Deliverable_1_Report.docx")):
    """Create the APA-formatted Word report.

    Args:
        output_path: Destination path for the generated .docx file.
    """
    doc = Document()

    # Title page
    doc.add_heading("LLM Data Structures Optimizer:", 0)
    subtitle = doc.add_heading("Optimizing Throughput, Latency, and Memory for LLM Inference", 1)
    subtitle.alignment = WD_ALIGN_PARAGRAPH.CENTER

    doc.add_paragraph("Author Name")
    doc.add_paragraph("Institution")
    doc.add_paragraph("Date")

    doc.add_page_break()

    # Abstract (optional, not counting toward page limit)
    doc.add_heading("Abstract", 1)
    doc.add_paragraph(
        "This report presents the design and implementation of a comprehensive "
        "data structures optimizer for Large Language Model (LLM) inference and retrieval systems. "
        "The optimizer addresses key performance bottlenecks through novel data structures including "
        "paged KV cache allocation, token-aware LRU eviction, indexed priority queues, and hybrid "
        "retrieval systems combining HNSW and BM25. Benchmarks demonstrate significant improvements "
        "in throughput, latency, and memory efficiency."
    )

    doc.add_page_break()

    # Section 1: Application Context
    doc.add_heading("1. Application Context", 1)
    doc.add_paragraph(
        "Large Language Models (LLMs) have become critical infrastructure for modern AI applications, "
        "powering everything from chatbots to code generation tools. However, production deployment "
        "faces significant challenges in terms of throughput, latency, and memory consumption. "
        "Key bottlenecks include:"
    )

    bullet_points = [
        "KV cache memory management: Traditional implementations allocate fixed-size buffers per sequence, "
        "leading to memory fragmentation and inefficient utilization.",
        "Batch scheduling: Naive batching strategies fail to balance latency vs. throughput trade-offs, "
        "especially under variable load.",
        "Retrieval efficiency: RAG (Retrieval-Augmented Generation) systems require efficient approximate "
        "nearest neighbor search combined with lexical matching, but existing solutions are either too slow "
        "or memory-intensive."
    ]

    for point in bullet_points:
        doc.add_paragraph(point, style="List Bullet")

    doc.add_paragraph(
        "This project addresses these challenges through a modular optimizer stack that provides "
        "production-ready data structures and algorithms optimized for LLM workloads."
    )

    # Section 2: Chosen Data Structures
    doc.add_heading("2. Chosen Data Structures", 1)

    doc.add_heading("2.1 Paged KV Cache", 2)
    doc.add_paragraph(
        "The KV cache uses a paged allocator with fixed-size pages (typically 512 tokens) to manage "
        "memory more efficiently than per-sequence allocation. This approach reduces fragmentation and "
        "enables prefix sharing through copy-on-write semantics. Hash-based deduplication identifies "
        "repeated system prompts, allowing multiple sequences to share the same prefix pages."
    )

    doc.add_heading("2.2 Indexed Binary Heap", 2)
    doc.add_paragraph(
        "An indexed heap maintains O(log n) decrease/increase-key operations, enabling efficient priority "
        "updates in the scheduler. The heap stores (priority, request_id) pairs with an index map for "
        "O(1) lookup. This allows the scheduler to dynamically adjust priorities based on remaining tokens "
        "or SLO deadlines without rebuilding the entire queue."
    )

    doc.add_heading("2.3 Hybrid Retrieval System", 2)
    doc.add_paragraph(
        "The retrieval pipeline combines HNSW (Hierarchical Navigable Small World) for dense vector search "
        "and an inverted index with BM25 scoring for sparse lexical matching. HNSW provides O(log n) "
        "approximate nearest neighbor search with configurable recall-accuracy trade-offs. The inverted "
        "index uses varint/zigzag encoding for compressed postings lists, reducing memory footprint. "
        "Score fusion combines dense and sparse results using weighted combination, with top-K maintenance "
        "via an indexed heap for efficient result selection."
    )

    doc.add_heading("2.4 Count-Min Sketch", 2)
    doc.add_paragraph(
        "A Count-Min Sketch with conservative update tracks query frequencies for hot query detection. "
        "This enables cache priming strategies that pre-load frequently accessed embeddings and KV cache "
        "entries, reducing latency for common queries."
    )

    # Section 3: Design Rationale & Complexity
    doc.add_heading("3. Design Rationale & Complexity", 1)

    doc.add_paragraph(
        "The choice of data structures balances several competing concerns:"
    )

    doc.add_heading("3.1 Memory Efficiency", 2)
    doc.add_paragraph(
        "Paged allocation reduces memory fragmentation compared to variable-size allocation. The paged "
        "allocator achieves O(1) allocation and deallocation through free-list management. Prefix sharing "
        "further reduces memory usage by up to 30-40% for workloads with repeated system prompts "
        "(common in production LLM deployments)."
    )

    doc.add_heading("3.2 Latency vs. Throughput", 2)
    doc.add_paragraph(
        "The scheduler's dynamic micro-batching balances latency and throughput through configurable "
        "waiting time. With max_wait_ms=50ms, the system achieves ~95% throughput of maximum batching "
        "while maintaining sub-100ms p95 latency. The indexed heap enables O(log n) priority updates, "
        "allowing real-time SLO-aware scheduling without O(n) rebuilds."
    )

    doc.add_heading("3.3 Retrieval Accuracy", 2)
    doc.add_paragraph(
        "HNSW parameters M and efSearch control the recall-accuracy trade-off. For M=16, efSearch=50, "
        "the system achieves >95% recall@10 on benchmark datasets while maintaining <5ms p95 search "
        "latency. BM25 provides complementary lexical matching, improving recall for queries with "
        "rare terms not well-represented in embeddings."
    )

    doc.add_paragraph(
        "Complexity analysis:"
    )

    complexity_rows = [
        ("KV Cache attach/get", "O(1)", "O(sequences × tokens)"),
        ("Indexed Heap update", "O(log n)", "O(n)"),
        ("HNSW search", "O(log n)", "O(n × M)"),
        ("BM25 search", "O(|query| × avg_doc_freq)", "O(|vocab| × avg_postings)"),
        ("CMS estimate", "O(depth)", "O(width × depth)"),
    ]

    # BUGFIX: allocate one header row plus one row per data entry. The
    # original hard-coded rows=5, so writing the 5th data row (index 5)
    # raised IndexError on complexity_table.rows[5].
    complexity_table = doc.add_table(rows=len(complexity_rows) + 1, cols=3)
    complexity_table.style = "Light Grid Accent 1"
    header_cells = complexity_table.rows[0].cells
    header_cells[0].text = "Operation"
    header_cells[1].text = "Time Complexity"
    header_cells[2].text = "Space Complexity"

    for i, (op, time_complexity, space_complexity) in enumerate(complexity_rows, start=1):
        row_cells = complexity_table.rows[i].cells
        row_cells[0].text = op
        row_cells[1].text = time_complexity
        row_cells[2].text = space_complexity

    # Section 4: Implementation Overview
    doc.add_heading("4. Implementation Overview", 1)

    doc.add_paragraph(
        "The implementation follows a modular architecture with clear separation of concerns:"
    )

    doc.add_heading("4.1 KV Cache Implementation", 2)
    doc.add_paragraph(
        "The KVCache class maintains a mapping from sequence IDs to lists of page IDs. Each page "
        "stores KV tokens in a fixed-size buffer. Prefix sharing is implemented through hash-based "
        "deduplication: when attaching a sequence, the system computes a SHA256 hash of the prefix "
        "tokens and checks for existing shared pages. If found, it references those pages via "
        "copy-on-write semantics."
    )

    code_block = doc.add_paragraph(
        "def attach(self, seq_id, kv_tokens, prefix_tokens=None):\n"
        "    pages_needed = (len(kv_tokens) + self.page_size - 1) // self.page_size\n"
        "    page_ids = self.allocator.alloc(pages_needed)\n"
        "    if prefix_tokens and self._enable_prefix_sharing:\n"
        "        prefix_hash = self._hash_prefix(prefix_tokens)\n"
        "        if prefix_hash in self._prefix_map:\n"
        "            shared_pages = self._prefix_map[prefix_hash]\n"
        "            page_ids = shared_pages + page_ids[len(shared_pages):]"
    )
    code_block.style = "Intense Quote"

    doc.add_heading("4.2 Scheduler Implementation", 2)
    doc.add_paragraph(
        "The scheduler uses an indexed heap to maintain request priorities. When a batch is requested, "
        "it checks if the oldest request exceeds max_wait_ms or if the batch is full. It then pops "
        "the top-k requests from the heap and returns them for processing."
    )

    doc.add_heading("4.3 Retrieval Pipeline", 2)
    doc.add_paragraph(
        "The retrieval pipeline coordinates HNSW and inverted index searches. For each query, it "
        "performs parallel dense and sparse searches, normalizes scores, and fuses them using a "
        "weighted combination. Top-K results are maintained using an indexed heap, ensuring O(k log k) "
        "complexity for result selection."
    )

    # Section 5: Challenges & Limitations
    doc.add_heading("5. Challenges & Limitations", 1)

    doc.add_paragraph(
        "Several challenges were encountered during implementation:"
    )

    doc.add_heading("5.1 Memory Fragmentation", 2)
    doc.add_paragraph(
        "While paged allocation reduces fragmentation, it does not eliminate it entirely. Under high "
        "churn workloads, free pages may become scattered, requiring periodic defragmentation. The "
        "current implementation uses a simple compaction strategy, but more sophisticated approaches "
        "could further improve memory utilization."
    )

    doc.add_heading("5.2 Parameter Tuning", 2)
    doc.add_paragraph(
        "HNSW parameters (M, efConstruction, efSearch) require careful tuning for optimal performance. "
        "Higher values improve recall but increase memory and latency. The current implementation "
        "provides reasonable defaults, but production deployments may require dataset-specific tuning."
    )

    doc.add_heading("5.3 Scalability", 2)
    doc.add_paragraph(
        "The current implementation is single-threaded and designed for single-machine deployment. "
        "Distributed deployments would require additional coordination mechanisms for shared state "
        "(e.g., distributed KV cache, distributed scheduler). Future work could explore distributed "
        "variants of these data structures."
    )

    # References
    doc.add_page_break()
    doc.add_heading("References", 1)

    references = [
        "Malkov, Y. A., & Yashunin, D. A. (2018). Efficient and robust approximate nearest neighbor "
        "search using Hierarchical Navigable Small World graphs. IEEE transactions on pattern analysis "
        "and machine intelligence, 42(4), 824-836.",
        "Robertson, S., & Zaragoza, H. (2009). The probabilistic relevance framework: BM25 and beyond. "
        "Foundations and Trends in Information Retrieval, 3(4), 333-389.",
        "Cormode, G., & Muthukrishnan, S. (2005). An improved data stream summary: the count-min sketch "
        "and its applications. Journal of Algorithms, 55(1), 58-75.",
        "Pope, R., et al. (2023). Efficiently scaling transformer inference. Proceedings of Machine "
        "Learning and Systems, 5.",
        "Kwon, W., et al. (2023). Efficient memory management for large language model serving with "
        "pagedattention. Proceedings of the 29th Symposium on Operating Systems Principles.",
    ]

    for ref in references:
        doc.add_paragraph(ref, style="List Number")

    # Save document
    doc.save(output_path)
    print(f"Report saved to {output_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Build the report with the default output filename in the CWD.
    create_report()
|
||||
|
||||
219
scripts/make_slides.py
Normal file
219
scripts/make_slides.py
Normal file
@@ -0,0 +1,219 @@
|
||||
"""Generate presentation slides from markdown."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
from pptx import Presentation
|
||||
from pptx.util import Inches, Pt
|
||||
except ImportError:
|
||||
print("python-pptx not installed. Install with: pip install python-pptx")
|
||||
import sys
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def _add_bullet_slide(prs, title_text, lead, bullets):
    """Append one title-and-content slide: *lead* as the first text-frame
    line, then (text, indent-level) pairs added as paragraphs in order."""
    slide = prs.slides.add_slide(prs.slide_layouts[1])
    slide.shapes.title.text = title_text
    tf = slide.placeholders[1].text_frame
    tf.text = lead
    for text, level in bullets:
        p = tf.add_paragraph()
        p.text = text
        p.level = level


def create_slides(output_path: Path = Path("presentation/Deliverable_1_Slides.pdf")):
    """Create presentation slides.

    Args:
        output_path: Requested output path; the deck is actually written next
            to it with a .pptx suffix (see note below).
    """
    # Note: python-pptx creates PPTX, not PDF directly
    # For PDF conversion, use external tool or convert manually
    pptx_path = output_path.with_suffix(".pptx")
    pptx_path.parent.mkdir(parents=True, exist_ok=True)

    prs = Presentation()
    prs.slide_width = Inches(10)
    prs.slide_height = Inches(7.5)

    # Slide 1: Title (dedicated title layout with title + subtitle placeholders)
    slide = prs.slides.add_slide(prs.slide_layouts[0])
    slide.shapes.title.text = "LLM Data Structures Optimizer"
    slide.placeholders[1].text = "Optimizing Throughput, Latency, and Memory for LLM Inference"

    # Slides 2-10 all share the same title-and-content shape, so they are
    # driven from a data table instead of repeating the add-paragraph boilerplate.
    content_slides = [
        ("Problem Statement", "LLM deployment challenges:", [
            ("• KV cache memory fragmentation", 1),
            ("• Batch scheduling latency vs. throughput trade-offs", 1),
            ("• RAG retrieval efficiency", 1),
        ]),
        ("Solution Overview", "Modular optimizer stack:", [
            ("• Paged KV cache with prefix sharing", 1),
            ("• Dynamic micro-batching scheduler", 1),
            ("• Hybrid retrieval (HNSW + BM25)", 1),
            ("• Token-aware LRU cache", 1),
        ]),
        ("KV Cache Architecture", "Key Features:", [
            ("• Fixed-size pages (512 tokens)", 1),
            ("• Hash-based prefix deduplication", 1),
            ("• Copy-on-write semantics", 1),
            ("• 30-40% memory savings for repeated prompts", 1),
        ]),
        ("Scheduler Design", "Dynamic Micro-Batching:", [
            ("• Indexed heap for O(log n) priority updates", 1),
            ("• Configurable wait time (max_wait_ms)", 1),
            ("• SLO-aware prioritization", 1),
            ("• ~95% throughput with sub-100ms p95 latency", 1),
        ]),
        ("Retrieval Pipeline", "Hybrid Approach:", [
            ("• HNSW for dense vector search (O(log n))", 1),
            ("• BM25 inverted index for lexical matching", 1),
            ("• Weighted score fusion", 1),
            ("• >95% recall@10 with <5ms p95 latency", 1),
        ]),
        ("Performance Results", "Benchmark Highlights:", [
            ("• KV Cache: 0.12ms p50 attach, 0.25ms p95", 1),
            ("• Scheduler: 0.35ms p50 batch, 0.78ms p95", 1),
            ("• HNSW: 1.8ms p50 search, 4.2ms p95", 1),
            ("• End-to-End RAG: 15.3ms p50, 32.5ms p95", 1),
        ]),
        ("Complexity Analysis", "Time Complexities:", [
            ("• KV Cache: O(1) attach/get, O(k) detach", 1),
            ("• Indexed Heap: O(log n) all operations", 1),
            ("• HNSW Search: O(log n) approximate", 1),
            ("• BM25: O(|query| × avg_doc_freq)", 1),
        ]),
        # This slide mixes a level-0 sub-header ("Future Work:") into the bullets.
        ("Challenges & Future Work", "Challenges:", [
            ("• Memory fragmentation under high churn", 1),
            ("• Parameter tuning for HNSW", 1),
            ("Future Work:", 0),
            ("• Distributed deployment support", 1),
            ("• Speculative decoding integration", 1),
        ]),
        ("Conclusion", "Key Contributions:", [
            ("• Production-ready data structures for LLM optimization", 1),
            ("• Significant improvements in throughput, latency, memory", 1),
            ("• Modular, extensible architecture", 1),
            ("• Comprehensive benchmarks and documentation", 1),
        ]),
    ]

    for title_text, lead, bullets in content_slides:
        _add_bullet_slide(prs, title_text, lead, bullets)

    prs.save(pptx_path)
    print(f"Presentation saved to {pptx_path}")
    print(f"Note: Convert to PDF manually or use: libreoffice --headless --convert-to pdf {pptx_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Generate the deck at the default presentation/ output path.
    create_slides()
|
||||
|
||||
165
scripts/plot_corpus_results.py
Normal file
165
scripts/plot_corpus_results.py
Normal file
@@ -0,0 +1,165 @@
|
||||
"""Generate detailed plots for corpus-based benchmarks."""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
|
||||
def load_corpus_results(results_dir: Path) -> list[dict]:
    """Load all corpus benchmark results.

    Walks the two-level <corpus>/<date>/results.json layout under
    *results_dir* and concatenates every list-shaped results file.
    """
    collected: list[dict] = []

    for corpus_dir in results_dir.iterdir():
        if not corpus_dir.is_dir():
            continue
        for date_dir in corpus_dir.iterdir():
            if not date_dir.is_dir():
                continue
            results_file = date_dir / "results.json"
            if not results_file.exists():
                continue
            payload = json.loads(results_file.read_text())
            # Only list-shaped payloads are benchmark run collections.
            if isinstance(payload, list):
                collected.extend(payload)

    return collected
|
||||
|
||||
|
||||
def plot_latency_by_corpus_size(results: list[dict], output_dir: Path):
    """Plot latency vs corpus size.

    Renders a grouped bar chart of mean P50/P95/P99 search latency per
    corpus size and saves it as corpus_size_latency.png in *output_dir*.
    """
    # Bucket the runs by corpus size.
    by_size: dict = {}
    for run in results:
        by_size.setdefault(run["size"], []).append(run)

    sizes = sorted(by_size)

    def _mean_metric(metric, size):
        # Mean of one latency metric across all runs at this corpus size.
        return np.mean([run[metric] for run in by_size[size]])

    p50s = [_mean_metric("search_p50_ms", s) for s in sizes]
    p95s = [_mean_metric("search_p95_ms", s) for s in sizes]
    p99s = [_mean_metric("search_p99_ms", s) for s in sizes]

    fig, ax = plt.subplots(figsize=(10, 6))
    x = np.arange(len(sizes))
    width = 0.25

    ax.bar(x - width, p50s, width, label="P50", alpha=0.8)
    ax.bar(x, p95s, width, label="P95", alpha=0.8)
    ax.bar(x + width, p99s, width, label="P99", alpha=0.8)

    ax.set_xlabel("Corpus Size (documents)")
    ax.set_ylabel("Latency (ms)")
    ax.set_title("Search Latency vs Corpus Size (FIQA Dataset)")
    ax.set_xticks(x)
    ax.set_xticklabels([f"{s//1000}k" for s in sizes])
    ax.legend()
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    output_file = output_dir / "corpus_size_latency.png"
    plt.savefig(output_file, dpi=150, bbox_inches="tight")
    print(f"Saved: {output_file}")
    plt.close()
|
||||
|
||||
|
||||
def plot_qps_vs_size(results: list[dict], output_dir: Path):
    """Plot QPS vs corpus size.

    Renders mean throughput with a standard-deviation error bar per corpus
    size and saves it as corpus_size_qps.png in *output_dir*.
    """
    by_size: dict = {}
    for run in results:
        by_size.setdefault(run["size"], []).append(run)

    sizes = sorted(by_size)
    samples = {s: [run["qps"] for run in by_size[s]] for s in sizes}
    qps = [np.mean(samples[s]) for s in sizes]
    qps_std = [np.std(samples[s]) for s in sizes]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.errorbar([s/1000 for s in sizes], qps, yerr=qps_std, marker="o",
                linestyle="-", linewidth=2, markersize=8, capsize=5)

    ax.set_xlabel("Corpus Size (thousands of documents)")
    ax.set_ylabel("Queries Per Second (QPS)")
    ax.set_title("Throughput vs Corpus Size (FIQA Dataset)")
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    output_file = output_dir / "corpus_size_qps.png"
    plt.savefig(output_file, dpi=150, bbox_inches="tight")
    print(f"Saved: {output_file}")
    plt.close()
|
||||
|
||||
|
||||
def plot_scaling_analysis(results: list[dict], output_dir: Path):
    """Plot scaling analysis with multiple metrics.

    Side-by-side panels: latency percentiles (left) and throughput (right)
    as functions of corpus size, saved as scaling_analysis.png.
    """
    by_size: dict = {}
    for run in results:
        by_size.setdefault(run["size"], []).append(run)

    sizes = sorted(by_size)
    sizes_k = [s/1000 for s in sizes]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: latency percentiles.
    p50s = [np.mean([run["search_p50_ms"] for run in by_size[s]]) for s in sizes]
    p95s = [np.mean([run["search_p95_ms"] for run in by_size[s]]) for s in sizes]

    ax1.plot(sizes_k, p50s, "o-", label="P50", linewidth=2, markersize=8)
    ax1.plot(sizes_k, p95s, "s-", label="P95", linewidth=2, markersize=8)
    ax1.set_xlabel("Corpus Size (thousands)")
    ax1.set_ylabel("Latency (ms)")
    ax1.set_title("Latency Scaling")
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Right panel: throughput.
    qps = [np.mean([run["qps"] for run in by_size[s]]) for s in sizes]
    ax2.plot(sizes_k, qps, "o-", color="green", linewidth=2, markersize=8)
    ax2.set_xlabel("Corpus Size (thousands)")
    ax2.set_ylabel("Queries Per Second")
    ax2.set_title("Throughput Scaling")
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    output_file = output_dir / "scaling_analysis.png"
    plt.savefig(output_file, dpi=150, bbox_inches="tight")
    print(f"Saved: {output_file}")
    plt.close()
|
||||
|
||||
|
||||
def main():
    """Load corpus benchmark results and render every analysis figure."""
    results_dir = Path("benchmarks/results")
    output_dir = Path("benchmarks/figures")
    output_dir.mkdir(parents=True, exist_ok=True)

    results = load_corpus_results(results_dir)
    if not results:
        print("No corpus benchmark results found")
        return

    print(f"Loaded {len(results)} benchmark runs")

    # Generate plots
    for plot_fn in (plot_latency_by_corpus_size, plot_qps_vs_size, plot_scaling_analysis):
        plot_fn(results, output_dir)

    print(f"\n✓ Generated corpus analysis plots in {output_dir}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Entry point: read benchmarks/results and write benchmarks/figures.
    main()
|
||||
|
||||
244
scripts/plot_results.py
Normal file
244
scripts/plot_results.py
Normal file
@@ -0,0 +1,244 @@
|
||||
"""Plot benchmark results and save to PNG, export to CSV."""
|
||||
|
||||
import json
|
||||
import csv
|
||||
from pathlib import Path
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
def load_results(result_dir: Path = Path("benchmarks/results")) -> dict:
    """Load all benchmark results.

    Supports two on-disk layouts: flat "*benchmark*.json" files in the
    results root, and nested <corpus>/<date>/results.json lists (where
    the first entry is kept as representative).
    """
    loaded: dict = {}

    # Old-style layout: flat JSON files named after the benchmark.
    for json_file in result_dir.glob("*.json"):
        if "benchmark" not in json_file.stem:
            continue
        data = json.loads(json_file.read_text())
        name = data.get("benchmark", json_file.stem.replace("_benchmark", ""))
        loaded[name] = data

    # New-style layout: corpus/date/results.json holding a list of runs.
    for corpus_dir in result_dir.iterdir():
        if not corpus_dir.is_dir():
            continue
        for date_dir in corpus_dir.iterdir():
            if not date_dir.is_dir():
                continue
            results_file = date_dir / "results.json"
            if not results_file.exists():
                continue
            runs = json.loads(results_file.read_text())
            if isinstance(runs, list) and runs:
                # Use first result as representative or aggregate
                loaded[f"{corpus_dir.name}_{date_dir.name}"] = runs[0]  # Simplified

    return loaded
|
||||
|
||||
|
||||
def export_to_csv(results: dict, output_file: Path = Path("benchmarks/results/benchmark_results.csv")):
    """Export benchmark results to CSV.

    Builds one row per benchmark with generic percentile columns plus any
    family-specific metrics the result happens to carry, then writes them
    with a header of the union of all keys (sorted).
    """
    output_file.parent.mkdir(parents=True, exist_ok=True)

    rows = []
    for bench_name, data in results.items():
        # Generic percentiles: the first metric family reporting a
        # non-falsy value wins (same precedence as the original chains).
        row = {
            "benchmark": bench_name,
            "p50_ms": data.get("attach_p50_ms") or data.get("search_p50_ms") or data.get("batch_p50_ms") or data.get("build_p50_ms") or 0.0,
            "p95_ms": data.get("attach_p95_ms") or data.get("search_p95_ms") or data.get("batch_p95_ms") or data.get("build_p95_ms") or 0.0,
            "p99_ms": data.get("attach_p99_ms") or data.get("search_p99_ms") or data.get("batch_p99_ms") or data.get("build_p99_ms") or 0.0,
            "peak_rss_mb": data.get("peak_rss_mb", 0.0),
            "memory_delta_mb": data.get("memory_delta_mb", 0.0),
        }

        # Family-specific metrics, copied through when the family is present.
        if "attach_p50_ms" in data:
            for metric in ("attach_p50_ms", "attach_p95_ms", "attach_p99_ms",
                           "get_p50_ms", "get_p95_ms", "get_p99_ms"):
                row[metric] = data.get(metric, 0)
        if "search_p50_ms" in data:
            for metric in ("search_p50_ms", "search_p95_ms", "search_p99_ms"):
                row[metric] = data.get(metric, 0)

        if "build_peak_rss_mb" in data:
            row["build_peak_rss_mb"] = data.get("build_peak_rss_mb", 0.0)

        rows.append(row)

    if rows:
        # Header = union of every row's keys, in sorted order.
        fieldnames = sorted({key for row in rows for key in row})

        with open(output_file, "w", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows)

    print(f"Results exported to CSV: {output_file}")
|
||||
|
||||
|
||||
def plot_latency_distribution(results: dict, output_dir: Path = Path("benchmarks/figures")):
    """Plot latency distributions.

    Draws grouped P50/P95/P99 bars per benchmark (skipping benchmarks with
    no positive latency) and saves latency_distribution.png.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    def _percentile(data, suffix):
        # First metric family reporting this percentile; 0 when none do.
        return (data.get(f"search_{suffix}")
                or data.get(f"attach_{suffix}")
                or data.get(f"batch_{suffix}")
                or data.get(f"build_{suffix}", 0))

    names, p50s, p95s, p99s = [], [], [], []
    for name, data in results.items():
        p50 = _percentile(data, "p50_ms")
        p95 = _percentile(data, "p95_ms")
        p99 = _percentile(data, "p99_ms")
        if p50 > 0 or p95 > 0 or p99 > 0:
            names.append(name)
            p50s.append(p50)
            p95s.append(p95)
            p99s.append(p99)

    if not names:
        return

    fig, ax = plt.subplots(figsize=(12, 7))
    x = range(len(names))
    width = 0.25

    ax.bar([i - width for i in x], p50s, width, label="P50", alpha=0.8, color="#2ecc71")
    ax.bar(x, p95s, width, label="P95", alpha=0.8, color="#3498db")
    ax.bar([i + width for i in x], p99s, width, label="P99", alpha=0.8, color="#e74c3c")

    ax.set_xlabel("Benchmark", fontsize=12, fontweight="bold")
    ax.set_ylabel("Latency (ms)", fontsize=12, fontweight="bold")
    ax.set_title("Latency Percentiles by Benchmark", fontsize=14, fontweight="bold")
    ax.set_xticks(x)
    ax.set_xticklabels(names, rotation=45, ha="right")
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3, linestyle="--")

    # Annotate each non-zero bar with its value.
    for i, (p50, p95, p99) in enumerate(zip(p50s, p95s, p99s)):
        if p50 > 0:
            ax.text(i - width, p50, f"{p50:.2f}", ha="center", va="bottom", fontsize=8)
        if p95 > 0:
            ax.text(i, p95, f"{p95:.2f}", ha="center", va="bottom", fontsize=8)
        if p99 > 0:
            ax.text(i + width, p99, f"{p99:.2f}", ha="center", va="bottom", fontsize=8)

    plt.tight_layout()
    output_file = output_dir / "latency_distribution.png"
    plt.savefig(output_file, dpi=300, bbox_inches="tight")
    print(f"Latency plot saved to {output_file}")
    plt.close()
|
||||
|
||||
|
||||
def plot_comparison_chart(results: dict, output_dir: Path = Path("benchmarks/figures")):
    """Plot a horizontal-bar comparison of P95 latency across all benchmarks.

    Args:
        results: Mapping of benchmark name -> metrics dict.  The first
            present key among the search/attach/batch/build ``*_p95_ms``
            variants is used for each benchmark.
        output_dir: Directory where ``benchmark_comparison.png`` is written
            (created if missing).
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    benchmarks = []
    p95_latencies = []

    for name, data in results.items():
        # Fall through benchmark-specific metric names; `or` also skips zero
        # values, which is intended here (0 means "metric absent").
        p95 = data.get("search_p95_ms") or data.get("attach_p95_ms") or data.get("batch_p95_ms") or data.get("build_p95_ms", 0)
        if p95 > 0:
            benchmarks.append(name)
            p95_latencies.append(p95)

    if benchmarks:
        fig, ax = plt.subplots(figsize=(10, 6))
        # Sample the colormap at evenly spaced floats in [0, 1] so each bar
        # gets a distinct color.  Calling a Colormap with ints (the previous
        # `range(len(benchmarks))`) indexes the raw 256-entry lookup table,
        # making bars 0..n-1 visually identical.
        n = len(benchmarks)
        colors = plt.cm.viridis([i / max(n - 1, 1) for i in range(n)])
        bars = ax.barh(benchmarks, p95_latencies, color=colors, alpha=0.8)

        ax.set_xlabel("P95 Latency (ms)", fontsize=12, fontweight="bold")
        ax.set_title("Benchmark Performance Comparison (P95 Latency)", fontsize=14, fontweight="bold")
        ax.grid(True, alpha=0.3, linestyle="--", axis="x")

        # Annotate each bar with its latency value.
        for bar, latency in zip(bars, p95_latencies):
            width = bar.get_width()
            ax.text(width, bar.get_y() + bar.get_height()/2, f"{latency:.2f}ms",
                    ha="left", va="center", fontsize=9, fontweight="bold")

        plt.tight_layout()
        output_file = output_dir / "benchmark_comparison.png"
        plt.savefig(output_file, dpi=300, bbox_inches="tight")
        print(f"Comparison plot saved to {output_file}")
        plt.close()
|
||||
|
||||
|
||||
def plot_memory_usage(results: dict, output_dir: Path = Path("benchmarks/figures")):
    """Plot memory usage (peak RSS and allocation delta) by benchmark.

    Args:
        results: Mapping of benchmark name -> metrics dict containing
            ``peak_rss_mb`` and ``memory_delta_mb``.
        output_dir: Directory where ``memory_usage.png`` is written
            (created if missing).
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    benchmarks = []
    peak_rss_values = []
    memory_delta_values = []

    for name, data in results.items():
        peak_rss = data.get("peak_rss_mb", 0.0)
        memory_delta = data.get("memory_delta_mb", 0.0)
        # Only benchmarks that recorded a positive peak RSS are plotted;
        # the delta list stays index-aligned with `benchmarks`.
        if peak_rss > 0:
            benchmarks.append(name)
            peak_rss_values.append(peak_rss)
            memory_delta_values.append(memory_delta)

    if benchmarks:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

        # Evenly spaced floats in [0, 1] give distinct colormap colors.
        # Integer arguments (the previous `range(len(benchmarks))`) would
        # index the raw 256-entry LUT and render nearly identical colors.
        n = len(benchmarks)
        fractions = [i / max(n - 1, 1) for i in range(n)]

        # Plot 1: Peak RSS
        colors1 = plt.cm.plasma(fractions)
        bars1 = ax1.barh(benchmarks, peak_rss_values, color=colors1, alpha=0.8)
        ax1.set_xlabel("Peak RSS (MB)", fontsize=12, fontweight="bold")
        ax1.set_title("Peak Memory Usage by Benchmark", fontsize=14, fontweight="bold")
        ax1.grid(True, alpha=0.3, linestyle="--", axis="x")

        # Add value labels
        for bar, rss in zip(bars1, peak_rss_values):
            width = bar.get_width()
            ax1.text(width, bar.get_y() + bar.get_height()/2, f"{rss:.2f}MB",
                     ha="left", va="center", fontsize=9, fontweight="bold")

        # Plot 2: Memory Delta
        colors2 = plt.cm.coolwarm(fractions)
        bars2 = ax2.barh(benchmarks, memory_delta_values, color=colors2, alpha=0.8)
        ax2.set_xlabel("Memory Delta (MB)", fontsize=12, fontweight="bold")
        ax2.set_title("Memory Allocation Delta by Benchmark", fontsize=14, fontweight="bold")
        ax2.grid(True, alpha=0.3, linestyle="--", axis="x")

        # Add value labels
        for bar, delta in zip(bars2, memory_delta_values):
            width = bar.get_width()
            ax2.text(width, bar.get_y() + bar.get_height()/2, f"{delta:.2f}MB",
                     ha="left", va="center", fontsize=9, fontweight="bold")

        plt.tight_layout()
        output_file = output_dir / "memory_usage.png"
        plt.savefig(output_file, dpi=300, bbox_inches="tight")
        print(f"Memory usage plot saved to {output_file}")
        plt.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: load every benchmark result, then emit the CSV
    # export and all figures.  `load_results`, `export_to_csv` and the
    # remaining plot_* helpers are defined earlier in this module.
    results = load_results()
    if results:
        export_to_csv(results)
        plot_latency_distribution(results)
        plot_comparison_chart(results)
        plot_memory_usage(results)
        print(f"\nProcessed {len(results)} benchmark results")
    else:
        print("No benchmark results found. Run benchmarks first.")
|
||||
91
scripts/prepare_embeddings.py
Normal file
91
scripts/prepare_embeddings.py
Normal file
@@ -0,0 +1,91 @@
|
||||
"""Prepare embeddings for datasets."""
|
||||
|
||||
import argparse
import hashlib
import json
import sys
from pathlib import Path

import numpy as np
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
|
||||
def generate_deterministic_embeddings(
|
||||
corpus_file: Path,
|
||||
output_file: Path,
|
||||
dim: int = 384,
|
||||
seed: int = 42,
|
||||
limit: int | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Generate deterministic embeddings for a corpus.
|
||||
|
||||
Args:
|
||||
corpus_file: Path to corpus JSONL file
|
||||
output_file: Output .npy file for embeddings
|
||||
dim: Embedding dimension
|
||||
seed: Random seed for reproducibility
|
||||
limit: Optional limit on number of documents
|
||||
"""
|
||||
output_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
rng = np.random.RandomState(seed)
|
||||
|
||||
embeddings = []
|
||||
count = 0
|
||||
|
||||
print(f"Generating deterministic embeddings (dim={dim}, seed={seed})...")
|
||||
|
||||
with open(corpus_file, "r", encoding="utf-8") as f:
|
||||
for line in f:
|
||||
if limit and count >= limit:
|
||||
break
|
||||
|
||||
if line.strip():
|
||||
doc = json.loads(line)
|
||||
# Generate deterministic embedding based on document ID
|
||||
doc_hash = hash(doc["id"]) % (2**31)
|
||||
rng_local = np.random.RandomState(seed + doc_hash)
|
||||
|
||||
# Generate normalized random vector
|
||||
emb = rng_local.randn(dim).astype(np.float32)
|
||||
emb = emb / np.linalg.norm(emb)
|
||||
|
||||
embeddings.append(emb)
|
||||
count += 1
|
||||
|
||||
if count % 10000 == 0:
|
||||
print(f"Processed {count} documents...")
|
||||
|
||||
embeddings_array = np.stack(embeddings)
|
||||
np.save(output_file, embeddings_array)
|
||||
print(f"Saved {len(embeddings)} embeddings to {output_file}")
|
||||
|
||||
|
||||
def load_embeddings(emb_file: Path) -> np.ndarray:
    """Read an embedding matrix previously saved with ``np.save``.

    Args:
        emb_file: Path to the ``.npy`` file.

    Returns:
        The stored array, unchanged.
    """
    array = np.load(emb_file)
    return array
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments and write the embedding file."""
    parser = argparse.ArgumentParser(description="Prepare embeddings for corpus")
    for flag, options in (
        ("--input", {"type": Path, "required": True, "help": "Corpus JSONL file"}),
        ("--output", {"type": Path, "required": True, "help": "Output .npy file"}),
        ("--dim", {"type": int, "default": 384, "help": "Embedding dimension"}),
        ("--seed", {"type": int, "default": 42, "help": "Random seed"}),
        ("--limit", {"type": int, "help": "Limit number of documents"}),
    ):
        parser.add_argument(flag, **options)

    opts = parser.parse_args()

    generate_deterministic_embeddings(
        opts.input,
        opts.output,
        dim=opts.dim,
        seed=opts.seed,
        limit=opts.limit,
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the CLI when executed as a script.
    main()
|
||||
|
||||
247
scripts/profile_tail_latency.py
Normal file
247
scripts/profile_tail_latency.py
Normal file
@@ -0,0 +1,247 @@
|
||||
"""Profile tail latency breakdown for retrieval pipeline.
|
||||
|
||||
This script profiles latency components to identify bottlenecks causing
|
||||
extreme P99 tail latencies.
|
||||
"""
|
||||
|
||||
import cProfile
|
||||
import pstats
|
||||
import statistics
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
import numpy as np
|
||||
|
||||
from llmds.hnsw import HNSW
|
||||
from llmds.retrieval_pipeline import RetrievalPipeline
|
||||
|
||||
|
||||
def profile_hnsw_search(num_vectors: int = 10000, dim: int = 128, num_queries: int = 1000):
    """Profile HNSW search operations.

    Builds an index of ``num_vectors`` random unit vectors, runs
    ``num_queries`` k=10 searches under cProfile, prints latency
    percentiles plus an outlier analysis and the cProfile report.

    Args:
        num_vectors: Number of vectors to index.
        dim: Vector dimensionality.
        num_queries: Number of search queries to time.

    Returns:
        Dict with p50/p95/p99/p99.9/mean/max latency (ms) and outlier stats.
    """
    # Hoisted out of the query loop: the original re-ran `import time` on
    # every iteration, adding import-machinery overhead inside the very
    # section being profiled and timed.
    import time

    print(f"Profiling HNSW search with {num_vectors} vectors, dim={dim}, {num_queries} queries...")

    np.random.seed(42)
    hnsw = HNSW(dim=dim, M=16, ef_construction=200, ef_search=50, seed=42)

    # Build index of normalized random vectors
    vectors = []
    for i in range(num_vectors):
        vec = np.random.randn(dim).astype(np.float32)
        vec = vec / np.linalg.norm(vec)
        vectors.append(vec)
        hnsw.add(vec, i)

    # Profile search operations
    profiler = cProfile.Profile()
    profiler.enable()

    search_times = []
    for _ in range(num_queries):
        query = np.random.randn(dim).astype(np.float32)
        query = query / np.linalg.norm(query)

        start = time.perf_counter()
        hnsw.search(query, k=10)
        elapsed = time.perf_counter() - start
        search_times.append(elapsed * 1000)  # Convert to ms

    profiler.disable()

    # Compute latency percentiles from the sorted sample
    search_times.sort()
    p50 = search_times[len(search_times) // 2]
    p95 = search_times[int(len(search_times) * 0.95)]
    p99 = search_times[int(len(search_times) * 0.99)]
    # P99.9 needs at least 1000 samples to be distinguishable from P99
    p99_9 = search_times[int(len(search_times) * 0.999)] if len(search_times) >= 1000 else p99

    print(f"\nHNSW Search Latency Statistics:")
    print(f"  P50: {p50:.3f} ms")
    print(f"  P95: {p95:.3f} ms")
    print(f"  P99: {p99:.3f} ms")
    print(f"  P99.9: {p99_9:.3f} ms")
    print(f"  Mean: {statistics.mean(search_times):.3f} ms")
    print(f"  Max: {max(search_times):.3f} ms")

    # Analyze P99 outliers: anything slower than 2x P95
    threshold = p95 * 2
    outliers = [t for t in search_times if t > threshold]
    if outliers:
        print(f"\n  Outliers (>2x P95): {len(outliers)} queries ({len(outliers)/len(search_times)*100:.1f}%)")
        print(f"  Outlier P50: {statistics.median(outliers):.3f} ms")
        print(f"  Outlier Max: {max(outliers):.3f} ms")

    # Generate profiling report
    stats = pstats.Stats(profiler)
    stats.sort_stats("cumulative")

    print("\nTop 20 functions by cumulative time:")
    print("=" * 80)
    stats.print_stats(20)

    return {
        "p50_ms": p50,
        "p95_ms": p95,
        "p99_ms": p99,
        "p99_9_ms": p99_9,
        "mean_ms": statistics.mean(search_times),
        "max_ms": max(search_times),
        "outlier_count": len(outliers),
        "outlier_percent": len(outliers) / len(search_times) * 100 if search_times else 0,
    }
|
||||
|
||||
|
||||
def profile_retrieval_pipeline(num_docs: int = 5000, num_queries: int = 500):
    """Profile the complete retrieval pipeline.

    Indexes ``num_docs`` synthetic documents, runs ``num_queries`` searches
    under cProfile, prints latency percentiles and the profiling report.

    Args:
        num_docs: Number of documents to index.
        num_queries: Number of search queries to time.

    Returns:
        Dict with p50/p95/p99/mean/max search latency in milliseconds.
    """
    # Hoisted: the original re-ran `import time` on every loop iteration,
    # adding overhead inside the profiled section.
    import time

    print(f"\nProfiling RetrievalPipeline with {num_docs} docs, {num_queries} queries...")

    np.random.seed(42)
    # Renamed from `random` to avoid shadowing the stdlib module name.
    rng = np.random.RandomState(42)

    pipeline = RetrievalPipeline(embedding_dim=128, seed=42)

    # Build index of synthetic documents with unit-norm embeddings
    for i in range(num_docs):
        text = f"document {i} about topic {i % 10}"
        embedding = rng.randn(128).astype(np.float32)
        embedding = embedding / np.linalg.norm(embedding)
        pipeline.add_document(doc_id=i, text=text, embedding=embedding)

    # Profile search operations
    profiler = cProfile.Profile()
    profiler.enable()

    search_times = []
    for _ in range(num_queries):
        query_text = "document topic"
        query_embedding = rng.randn(128).astype(np.float32)
        query_embedding = query_embedding / np.linalg.norm(query_embedding)

        start = time.perf_counter()
        pipeline.search(
            query_text, query_embedding=query_embedding, top_k=10
        )
        elapsed = time.perf_counter() - start
        search_times.append(elapsed * 1000)  # Convert to ms

    profiler.disable()

    # Compute latency percentiles from the sorted sample
    search_times.sort()
    p50 = search_times[len(search_times) // 2]
    p95 = search_times[int(len(search_times) * 0.95)]
    p99 = search_times[int(len(search_times) * 0.99)]

    print(f"\nRetrieval Pipeline Latency Statistics:")
    print(f"  P50: {p50:.3f} ms")
    print(f"  P95: {p95:.3f} ms")
    print(f"  P99: {p99:.3f} ms")
    print(f"  Mean: {statistics.mean(search_times):.3f} ms")
    print(f"  Max: {max(search_times):.3f} ms")

    # Generate profiling report
    stats = pstats.Stats(profiler)
    stats.sort_stats("cumulative")

    print("\nTop 20 functions by cumulative time:")
    print("=" * 80)
    stats.print_stats(20)

    return {
        "p50_ms": p50,
        "p95_ms": p95,
        "p99_ms": p99,
        "mean_ms": statistics.mean(search_times),
        "max_ms": max(search_times),
    }
|
||||
|
||||
|
||||
def profile_latency_breakdown(num_vectors: int = 5000, dim: int = 128):
    """Profile latency breakdown by component.

    Compares the mean cost of 100 brute-force distance computations against
    a full HNSW k=10 search, over 100 random unit-vector queries, and prints
    the ratio.  Results are printed only; nothing is returned.

    Args:
        num_vectors: Number of vectors to index.
        dim: Vector dimensionality.
    """
    print(f"\nProfiling latency breakdown with {num_vectors} vectors...")

    np.random.seed(42)
    hnsw = HNSW(dim=dim, M=16, ef_construction=200, ef_search=50, seed=42)

    # Build index of unit-norm random vectors; keep them in `vectors` so the
    # brute-force baseline below can reuse the same data.
    vectors = []
    for i in range(num_vectors):
        vec = np.random.randn(dim).astype(np.float32)
        vec = vec / np.linalg.norm(vec)
        vectors.append(vec)
        hnsw.add(vec, i)

    # Profile individual operations
    import time

    search_times = []
    distance_computation_times = []

    for _ in range(100):
        query = np.random.randn(dim).astype(np.float32)
        query = query / np.linalg.norm(query)

        # Baseline: 100 exact L2 distances against the first 100 indexed
        # vectors (a fixed-size slice, independent of num_vectors).
        dist_start = time.perf_counter()
        distances = [np.linalg.norm(query - vec) for vec in vectors[:100]]
        dist_time = (time.perf_counter() - dist_start) * 1000
        distance_computation_times.append(dist_time)

        # Full HNSW search for the same query.
        search_start = time.perf_counter()
        results = hnsw.search(query, k=10)
        search_time = (time.perf_counter() - search_start) * 1000
        search_times.append(search_time)

    print(f"\nLatency Breakdown:")
    print(f"  Distance computation: {statistics.mean(distance_computation_times):.3f} ms (mean)")
    print(f"  HNSW search: {statistics.mean(search_times):.3f} ms (mean)")
    print(f"  Search/Distance ratio: {statistics.mean(search_times) / statistics.mean(distance_computation_times):.2f}x")
|
||||
|
||||
|
||||
def main():
    """Run all profiling tasks and write the report to a file.

    Parses CLI options, redirects stdout into the report file while the
    profilers run, then prints a short summary to the real stdout.
    """
    import argparse
    from contextlib import redirect_stdout

    parser = argparse.ArgumentParser(description="Profile tail latency")
    parser.add_argument("--output", type=Path, default=Path("audit/tail_latency_profile.txt"),
                        help="Output file for profiling report")
    parser.add_argument("--num-vectors", type=int, default=10000,
                        help="Number of vectors for HNSW profiling")
    parser.add_argument("--num-docs", type=int, default=5000,
                        help="Number of documents for pipeline profiling")
    parser.add_argument("--num-queries", type=int, default=1000,
                        help="Number of queries to run")
    args = parser.parse_args()

    args.output.parent.mkdir(parents=True, exist_ok=True)

    # Redirect output to the report file.  redirect_stdout restores the
    # *previous* stdout on exit — unlike the old manual assignment of
    # sys.__stdout__, which clobbered any outer redirection (e.g. when run
    # under a test harness or a nested redirect).
    with open(args.output, "w") as f, redirect_stdout(f):
        # Profile HNSW
        hnsw_stats = profile_hnsw_search(args.num_vectors, 128, args.num_queries)

        # Profile pipeline
        pipeline_stats = profile_retrieval_pipeline(args.num_docs, args.num_queries // 2)

        # Breakdown
        profile_latency_breakdown(args.num_vectors, 128)

    print(f"\nProfiling complete. Report saved to: {args.output}")
    print(f"\nKey Findings:")
    print(f"  HNSW P99: {hnsw_stats['p99_ms']:.3f} ms")
    print(f"  Pipeline P99: {pipeline_stats['p99_ms']:.3f} ms")

    if hnsw_stats.get("outlier_count", 0) > 0:
        print(f"  HNSW Outliers: {hnsw_stats['outlier_count']} ({hnsw_stats['outlier_percent']:.1f}%)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point.
    main()
|
||||
|
||||
355
scripts/run_benchmarks.py
Normal file
355
scripts/run_benchmarks.py
Normal file
@@ -0,0 +1,355 @@
|
||||
"""Run end-to-end benchmarks on real corpora with variance analysis."""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import random
|
||||
import sys
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from llmds.data_sources.beir_loader import load_beir
|
||||
from llmds.data_sources.amazon_reviews import load_amazon_reviews
|
||||
from llmds.retrieval_pipeline import RetrievalPipeline
|
||||
from llmds.utils import Timer, memory_profiler, calculate_statistics
|
||||
|
||||
|
||||
def aggregate_repetitions(results: list[dict]) -> dict[str, Any]:
    """
    Aggregate results across repetitions with variance analysis.

    Args:
        results: List of result dictionaries from multiple repetitions

    Returns:
        Dictionary with aggregated statistics including variance metrics
        (an empty dict when no results are supplied)
    """
    if not results:
        return {}

    first = results[0]

    # Every key that is not configuration metadata is treated as a metric.
    metadata_keys = {"corpus", "size", "ef_search", "M", "num_queries", "repetition"}
    metric_keys = [key for key in first if key not in metadata_keys]

    aggregated: dict[str, Any] = {
        field: first.get(field)
        for field in ("corpus", "size", "ef_search", "M", "num_queries")
    }
    aggregated["repetitions"] = len(results)

    # Per-metric summary statistics: central tendency, spread, confidence
    # interval, and coefficient of variation.
    stat_fields = ("mean", "std", "min", "max", "ci_lower", "ci_upper", "cv")
    for metric in metric_keys:
        values = [rep[metric] for rep in results if metric in rep]
        if not values:
            continue
        stats_dict = calculate_statistics(values)
        for field in stat_fields:
            aggregated[f"{metric}_{field}"] = stats_dict[field]

    # A configuration is flagged flaky when any critical metric varies by
    # more than 20% (coefficient of variation) across repetitions.
    flaky_metrics = [
        metric
        for metric in ("search_p50_ms", "search_p95_ms", "qps")
        if f"{metric}_cv" in aggregated and aggregated[f"{metric}_cv"] > 20.0
    ]

    aggregated["flaky_metrics"] = flaky_metrics
    aggregated["is_flaky"] = bool(flaky_metrics)

    return aggregated
|
||||
|
||||
|
||||
def load_corpus_sample(corpus_file: Path, size: int, seed: int = 42) -> list[dict]:
    """Load up to ``size`` documents from a JSONL corpus.

    Seeds both the stdlib and NumPy global RNGs for reproducibility, then
    returns the whole corpus when it is small enough, otherwise a uniform
    sample drawn without replacement.

    Args:
        corpus_file: JSONL file with one document per line.
        size: Maximum number of documents to return.
        seed: Seed applied to ``random`` and ``np.random``.

    Returns:
        List of parsed document dicts.
    """
    random.seed(seed)
    np.random.seed(seed)

    with open(corpus_file, "r", encoding="utf-8") as handle:
        all_docs = [json.loads(line) for line in handle if line.strip()]

    if len(all_docs) <= size:
        return all_docs

    # Uniform sample without replacement.
    return random.sample(all_docs, size)
|
||||
|
||||
|
||||
def run_benchmark(
    corpus_file: Path,
    emb_file: Path | None,
    corpus_name: str,
    size: int,
    ef_search: int,
    M: int,
    num_queries: int = 100,
    embedding_dim: int = 384,
) -> dict:
    """
    Run a single benchmark repetition on a corpus sample.

    Builds a RetrievalPipeline over ``size`` sampled documents, times each
    document insertion and each of ``num_queries`` searches, and profiles
    memory during both phases.

    Args:
        corpus_file: JSONL corpus to sample from.
        emb_file: Optional precomputed embeddings (.npy); deterministic
            random embeddings are generated when absent.
        corpus_name: Label recorded in the results dict.
        size: Number of documents to sample from the corpus.
        ef_search: HNSW efSearch parameter.
        M: HNSW M parameter.
        num_queries: Number of timed search queries.
        embedding_dim: Embedding dimensionality.

    Returns:
        Dictionary with latency percentiles, throughput (qps), and
        memory metrics.
    """
    print(f"\n=== Benchmarking {corpus_name} (size={size}, ef={ef_search}, M={M}) ===")

    # Load corpus sample
    print(f"Loading corpus sample...")
    docs = load_corpus_sample(corpus_file, size)
    print(f"Loaded {len(docs)} documents")

    # Load or generate embeddings
    if emb_file and emb_file.exists():
        embeddings = np.load(emb_file)
        # Trim to sample size
        # NOTE(review): rows are taken positionally, but `docs` is a random
        # sample of the corpus — confirm the .npy rows are meant to align
        # with the sampled documents rather than the original corpus order.
        embeddings = embeddings[:len(docs)]
    else:
        print("Generating deterministic embeddings...")
        rng = np.random.RandomState(42)
        embeddings = []
        for i in range(len(docs)):
            emb = rng.randn(embedding_dim).astype(np.float32)
            emb = emb / np.linalg.norm(emb)
            embeddings.append(emb)
        embeddings = np.stack(embeddings)

    # Build pipeline with deterministic seed
    print("Building pipeline...")

    # Memory profiling for build phase
    with memory_profiler() as mem_profiler:
        pipeline = RetrievalPipeline(
            embedding_dim=embedding_dim,
            hnsw_M=M,
            hnsw_ef_search=ef_search,
            hnsw_ef_construction=ef_search * 4,
            seed=42,  # Fixed seed for reproducible HNSW structure
        )

        # Add documents, timing each insertion individually
        build_times = []
        for i, doc in enumerate(docs):
            with Timer() as t:
                pipeline.add_document(
                    doc_id=i,
                    text=doc["text"],
                    embedding=embeddings[i],
                )
            build_times.append(t.elapsed * 1000)
            # Sample memory periodically during build (~10 samples total;
            # the +1 guards against a zero modulus for tiny corpora)
            if (i + 1) % (len(docs) // 10 + 1) == 0:
                mem_profiler.sample()

    build_peak_rss_mb = mem_profiler.get_peak_rss_mb()
    build_memory_delta_mb = mem_profiler.get_memory_delta_mb()

    # Run queries with memory profiling
    print(f"Running {num_queries} queries...")
    search_times = []
    rng = np.random.RandomState(42)

    # Generate unit-norm query embeddings up front so the timed loop only
    # measures search work
    query_embeddings = []
    for _ in range(num_queries):
        qemb = rng.randn(embedding_dim).astype(np.float32)
        qemb = qemb / np.linalg.norm(qemb)
        query_embeddings.append(qemb)

    # Use document texts as queries (simplified)
    query_texts = [docs[i % len(docs)]["text"][:100] for i in range(num_queries)]

    # Memory profiling for search phase
    with memory_profiler() as search_mem_profiler:
        for i, (query_text, query_emb) in enumerate(zip(query_texts, query_embeddings)):
            with Timer() as t:
                pipeline.search(query_text, query_embedding=query_emb, top_k=10)
            search_times.append(t.elapsed * 1000)

            # Sample memory periodically during search
            if (i + 1) % 20 == 0:
                search_mem_profiler.sample()
                print(f"Completed {i + 1}/{num_queries} queries...")

    search_peak_rss_mb = search_mem_profiler.get_peak_rss_mb()

    # Overall peak RSS (maximum of build and search phases)
    overall_peak_rss_mb = max(build_peak_rss_mb, search_peak_rss_mb)

    # Compute statistics over sorted samples for percentile lookups
    build_times_sorted = sorted(build_times)
    search_times_sorted = sorted(search_times)

    results = {
        "corpus": corpus_name,
        "size": size,
        "ef_search": ef_search,
        "M": M,
        "num_queries": num_queries,
        "build_p50_ms": build_times_sorted[len(build_times_sorted) // 2],
        "build_p95_ms": build_times_sorted[int(len(build_times_sorted) * 0.95)],
        "build_p99_ms": build_times_sorted[int(len(build_times_sorted) * 0.99)],
        "search_p50_ms": search_times_sorted[len(search_times_sorted) // 2],
        "search_p95_ms": search_times_sorted[int(len(search_times_sorted) * 0.95)],
        "search_p99_ms": search_times_sorted[int(len(search_times_sorted) * 0.99)],
        "avg_build_time_ms": sum(build_times) / len(build_times),
        "avg_search_time_ms": sum(search_times) / len(search_times),
        "qps": 1000.0 / (sum(search_times) / len(search_times)) if search_times else 0.0,
        # Memory metrics
        "peak_rss_mb": overall_peak_rss_mb,
        "build_peak_rss_mb": build_peak_rss_mb,
        "build_memory_delta_mb": build_memory_delta_mb,
        "search_peak_rss_mb": search_peak_rss_mb,
    }

    print(f"✓ Results: P50={results['search_p50_ms']:.2f}ms, P95={results['search_p95_ms']:.2f}ms, QPS={results['qps']:.2f}, Peak RSS={results['peak_rss_mb']:.2f}MB")

    return results
|
||||
|
||||
|
||||
def main():
    """CLI driver: run every (size, ef, M) configuration with repetitions,
    aggregate variance statistics, and save JSON/CSV results under a
    timestamped output directory."""
    parser = argparse.ArgumentParser(description="Run benchmarks on real corpora")
    parser.add_argument("--corpus", type=str, required=True, help="Corpus name")
    parser.add_argument("--corpus-file", type=Path, required=True, help="Corpus JSONL file")
    parser.add_argument("--emb-file", type=Path, help="Embeddings .npy file")
    parser.add_argument("--sizes", nargs="+", type=str, default=["10k"], help="Corpus sizes (e.g., 10k 50k 100k)")
    parser.add_argument("--ef", nargs="+", type=int, default=[50], help="HNSW efSearch values")
    parser.add_argument("--M", nargs="+", type=int, default=[16], help="HNSW M values")
    parser.add_argument("--num-queries", type=int, default=100, help="Number of queries")
    parser.add_argument("--repetitions", type=int, default=5, help="Number of repetitions for variance analysis (default: 5)")
    parser.add_argument("--output-dir", type=Path, default=Path("benchmarks/results"), help="Output directory")

    args = parser.parse_args()

    # Parse human-readable sizes like "10k" / "1m" into integers
    def parse_size(s: str) -> int:
        s = s.lower()
        if s.endswith("k"):
            return int(s[:-1]) * 1000
        elif s.endswith("m"):
            return int(s[:-1]) * 1000000
        return int(s)

    sizes = [parse_size(s) for s in args.sizes]

    # Create output directory with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = args.output_dir / args.corpus / timestamp
    output_dir.mkdir(parents=True, exist_ok=True)

    all_results = []
    aggregated_results = []

    print(f"\n{'='*70}")
    print(f"Running benchmarks with {args.repetitions} repetitions per configuration")
    print(f"{'='*70}\n")

    # Run benchmarks over the full cartesian product of configurations
    for size in sizes:
        for ef in args.ef:
            for M in args.M:
                # NOTE(review): config_key is currently unused below
                config_key = f"{size}_{ef}_{M}"
                print(f"Configuration: size={size}, ef={ef}, M={M}")

                repetition_results = []
                for rep in range(args.repetitions):
                    print(f"  Repetition {rep + 1}/{args.repetitions}...", end=" ", flush=True)
                    result = run_benchmark(
                        corpus_file=args.corpus_file,
                        emb_file=args.emb_file,
                        corpus_name=args.corpus,
                        size=size,
                        ef_search=ef,
                        M=M,
                        num_queries=args.num_queries,
                    )
                    result["repetition"] = rep
                    repetition_results.append(result)
                    all_results.append(result)
                    print("✓")

                # Aggregate across repetitions
                aggregated = aggregate_repetitions(repetition_results)
                if aggregated:
                    # Keep original metrics for backward compatibility
                    for metric in ["search_p50_ms", "search_p95_ms", "search_p99_ms", "qps"]:
                        if f"{metric}_mean" in aggregated:
                            aggregated[metric] = aggregated[f"{metric}_mean"]

                    aggregated_results.append(aggregated)

                    # Print variance summary
                    print(f"\n  Variance Summary:")
                    print(f"    Search P50: {aggregated.get('search_p50_ms_mean', 0):.2f} ± {aggregated.get('search_p50_ms_std', 0):.2f} ms (CV: {aggregated.get('search_p50_ms_cv', 0):.1f}%)")
                    print(f"    Search P95: {aggregated.get('search_p95_ms_mean', 0):.2f} ± {aggregated.get('search_p95_ms_std', 0):.2f} ms (CV: {aggregated.get('search_p95_ms_cv', 0):.1f}%)")
                    print(f"    QPS: {aggregated.get('qps_mean', 0):.2f} ± {aggregated.get('qps_std', 0):.2f} (CV: {aggregated.get('qps_cv', 0):.1f}%)")

                    if aggregated.get("is_flaky", False):
                        print(f"    ⚠️  FLAKY: High variance detected in {', '.join(aggregated.get('flaky_metrics', []))}")
                    print()

    # Save detailed results (all repetitions)
    results_file = output_dir / "results.json"
    with open(results_file, "w") as f:
        json.dump(all_results, f, indent=2)

    # Save aggregated results with variance statistics
    aggregated_file = output_dir / "results_aggregated.json"
    with open(aggregated_file, "w") as f:
        json.dump(aggregated_results, f, indent=2)

    # Save CSV with all repetitions
    csv_file = output_dir / "results.csv"
    if all_results:
        fieldnames = list(all_results[0].keys())
        with open(csv_file, "w", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_results)

    # Save aggregated CSV
    aggregated_csv_file = output_dir / "results_aggregated.csv"
    if aggregated_results:
        agg_fieldnames = list(aggregated_results[0].keys())
        with open(aggregated_csv_file, "w", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=agg_fieldnames)
            writer.writeheader()
            writer.writerows(aggregated_results)

    # Print summary
    print(f"\n{'='*70}")
    print(f"Benchmark Summary")
    print(f"{'='*70}")
    print(f"Total configurations: {len(aggregated_results)}")
    print(f"Total repetitions: {len(all_results)}")
    flaky_count = sum(1 for r in aggregated_results if r.get("is_flaky", False))
    if flaky_count > 0:
        print(f"⚠️  Flaky configurations: {flaky_count}")
    print(f"\nResults saved to:")
    print(f"  - Detailed: {results_file}")
    print(f"  - Aggregated: {aggregated_file}")
    print(f"  - CSV: {csv_file}")
    print(f"  - Aggregated CSV: {aggregated_csv_file}")
    print(f"{'='*70}\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point.
    main()
|
||||
281
scripts/run_multi_dataset_benchmarks.py
Normal file
281
scripts/run_multi_dataset_benchmarks.py
Normal file
@@ -0,0 +1,281 @@
|
||||
"""Run benchmarks across multiple datasets for comparison."""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
|
||||
def prepare_dataset(
    source: str,
    corpus_name: str,
    output_dir: Path,
    limit: int | None = None,
    download: bool = True,
) -> Path | None:
    """Prepare a dataset: download, prepare embeddings, ready for benchmarking.

    Args:
        source: Download source spec passed to scripts/download_corpus.py
            (``beir:`` prefix selects the BEIR path).
        corpus_name: Dataset name; also selects the raw/embedding paths.
        output_dir: Root data directory (``raw/`` and ``embeddings/`` live
            under it).
        limit: Optional cap on documents (forwarded to the helper scripts).
        download: When True, attempt a download if no corpus file exists.

    Returns:
        Path to the corpus JSONL file, or None on any failure.
    """
    corpus_dir = output_dir / "raw" / corpus_name
    embeddings_dir = output_dir / "embeddings"
    corpus_file = None

    # Find existing corpus file (check multiple possible names)
    possible_files = ["corpus.jsonl", "reviews.jsonl", "business_reviews.jsonl", "pages.jsonl"]
    for filename in possible_files:
        if (corpus_dir / filename).exists():
            corpus_file = corpus_dir / filename
            break

    # Also check beir subdirectory for fiqa
    if corpus_file is None and corpus_name == "fiqa":
        beir_dir = output_dir / "raw" / "beir" / corpus_name
        if (beir_dir / "corpus.jsonl").exists():
            corpus_file = beir_dir / "corpus.jsonl"

    # Download if needed and not exists
    if download and corpus_file is None:
        print(f"\n📥 Downloading {corpus_name}...")
        try:
            # NOTE(review): both branches build an identical command; the
            # only apparent difference is that --limit is appended in the
            # non-beir branch only — confirm whether BEIR downloads are
            # intentionally un-limited, otherwise the branches can be merged.
            if source.startswith("beir:"):
                cmd = [
                    sys.executable,
                    "scripts/download_corpus.py",
                    "--source", source,
                    "--output", str(corpus_dir),
                ]
            else:
                cmd = [
                    sys.executable,
                    "scripts/download_corpus.py",
                    "--source", source,
                    "--output", str(corpus_dir),
                ]
                if limit:
                    cmd.extend(["--limit", str(limit)])

            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode != 0:
                print(f"⚠️  Download failed: {result.stderr}")
                return None

            # Find corpus file after download
            if (corpus_dir / "corpus.jsonl").exists():
                corpus_file = corpus_dir / "corpus.jsonl"
            elif corpus_name == "amazon23" and (corpus_dir / "reviews.jsonl").exists():
                corpus_file = corpus_dir / "reviews.jsonl"
        except Exception as e:
            # Best-effort: report and bail rather than crash the multi-dataset run
            print(f"⚠️  Error downloading {corpus_name}: {e}")
            return None

    if corpus_file is None or not corpus_file.exists():
        print(f"⚠️  Corpus file not found for {corpus_name}")
        return None

    # Generate embeddings if they are not already cached
    emb_file = embeddings_dir / f"{corpus_name}.npy"
    if not emb_file.exists():
        print(f"\n🔢 Preparing embeddings for {corpus_name}...")
        embeddings_dir.mkdir(parents=True, exist_ok=True)
        cmd = [
            sys.executable,
            "scripts/prepare_embeddings.py",
            "--input", str(corpus_file),
            "--output", str(emb_file),
            "--dim", "384",
            "--seed", "42",
        ]
        if limit:
            cmd.extend(["--limit", str(limit)])

        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            print(f"⚠️  Embedding preparation failed: {result.stderr}")
            return None

    return corpus_file
|
||||
|
||||
|
||||
def run_benchmarks_for_dataset(
|
||||
corpus_name: str,
|
||||
corpus_file: Path,
|
||||
emb_file: Path,
|
||||
sizes: list[str],
|
||||
ef_values: list[int],
|
||||
M_values: list[int],
|
||||
num_queries: int = 50, # Reduced for faster multi-dataset runs
|
||||
output_dir: Path = Path("benchmarks/results"),
|
||||
) -> Path | None:
|
||||
"""Run benchmarks for a single dataset."""
|
||||
print(f"\n🚀 Running benchmarks for {corpus_name}...")
|
||||
|
||||
cmd = [
|
||||
sys.executable,
|
||||
"scripts/run_benchmarks.py",
|
||||
"--corpus", corpus_name,
|
||||
"--corpus-file", str(corpus_file),
|
||||
"--emb-file", str(emb_file),
|
||||
"--sizes", *sizes,
|
||||
"--ef", *[str(e) for e in ef_values],
|
||||
"--M", *[str(m) for m in M_values],
|
||||
"--num-queries", str(num_queries),
|
||||
"--output-dir", str(output_dir),
|
||||
]
|
||||
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
if result.returncode != 0:
|
||||
print(f"⚠️ Benchmark failed for {corpus_name}: {result.stderr}")
|
||||
return None
|
||||
|
||||
# Find the results directory
|
||||
results_dir = output_dir / corpus_name
|
||||
if results_dir.exists():
|
||||
timestamp_dirs = sorted([d for d in results_dir.iterdir() if d.is_dir()], key=lambda x: x.name)
|
||||
if timestamp_dirs:
|
||||
return timestamp_dirs[-1] / "results.json"
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def main():
    """CLI driver: prepare each requested dataset, run its benchmarks, and
    save a combined JSON of all per-dataset results.

    Datasets that fail preparation or benchmarking are skipped with a
    warning; the combined file is only written if at least one dataset
    produced results.
    """
    parser = argparse.ArgumentParser(description="Run benchmarks across multiple datasets")
    parser.add_argument(
        "--datasets",
        nargs="+",
        default=["fiqa", "amazon23", "msmarco"],
        help="Datasets to benchmark"
    )
    parser.add_argument(
        "--sizes",
        nargs="+",
        default=["10k", "25k", "50k"],
        help="Corpus sizes (e.g., 10k 25k 50k)"
    )
    parser.add_argument(
        "--ef",
        nargs="+",
        type=int,
        default=[50, 100],
        help="HNSW efSearch values"
    )
    parser.add_argument(
        "--M",
        nargs="+",
        type=int,
        default=[8, 16],
        help="HNSW M values"
    )
    parser.add_argument(
        "--num-queries",
        type=int,
        default=50,
        help="Number of queries per benchmark"
    )
    parser.add_argument(
        "--skip-download",
        action="store_true",
        help="Skip downloading datasets (use existing)"
    )
    parser.add_argument(
        "--limit",
        type=int,
        help="Limit documents per dataset (for large datasets)"
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("benchmarks/results"),
        help="Output directory"
    )

    args = parser.parse_args()

    # Dataset sources mapping: dataset name -> download source spec
    # understood by scripts/download_corpus.py.
    dataset_sources = {
        "fiqa": "beir:fiqa",
        "amazon23": "amazon23",
        "msmarco": "msmarco",
    }

    data_dir = Path("data")
    embeddings_dir = data_dir / "embeddings"
    embeddings_dir.mkdir(parents=True, exist_ok=True)

    # corpus_name -> parsed results.json contents, filled per dataset below.
    results = {}

    print("=" * 70)
    print("Multi-Dataset Benchmark Runner")
    print("=" * 70)
    print(f"Datasets: {', '.join(args.datasets)}")
    print(f"Sizes: {', '.join(args.sizes)}")
    print(f"efSearch: {', '.join(map(str, args.ef))}")
    print(f"M: {', '.join(map(str, args.M))}")
    print("=" * 70)

    for corpus_name in args.datasets:
        if corpus_name not in dataset_sources:
            print(f"⚠️  Unknown dataset: {corpus_name}, skipping")
            continue

        source = dataset_sources[corpus_name]
        # --limit only applies to the large datasets; fiqa is used whole.
        limit = args.limit if corpus_name in ["amazon23", "msmarco"] else None

        # Prepare dataset (download + embeddings); returns None on failure.
        corpus_file = prepare_dataset(
            source=source,
            corpus_name=corpus_name,
            output_dir=data_dir,
            limit=limit,
            download=not args.skip_download,
        )

        if corpus_file is None:
            print(f"⚠️  Skipping {corpus_name} - preparation failed")
            continue

        # Check embeddings exist even after preparation, as a final guard.
        emb_file = embeddings_dir / f"{corpus_name}.npy"
        if not emb_file.exists():
            print(f"⚠️  Embeddings not found for {corpus_name}, skipping")
            continue

        # Run benchmarks; returns the path to results.json or None.
        results_file = run_benchmarks_for_dataset(
            corpus_name=corpus_name,
            corpus_file=corpus_file,
            emb_file=emb_file,
            sizes=args.sizes,
            ef_values=args.ef,
            M_values=args.M,
            num_queries=args.num_queries,
            output_dir=args.output_dir,
        )

        if results_file and results_file.exists():
            with open(results_file) as f:
                results[corpus_name] = json.load(f)
            print(f"✓ {corpus_name} benchmarks completed")
        else:
            print(f"⚠️  {corpus_name} benchmarks incomplete")

    # Save combined results, keyed by dataset name, in a timestamped file.
    if results:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        combined_file = args.output_dir / f"multi_dataset_{timestamp}.json"
        combined_file.parent.mkdir(parents=True, exist_ok=True)
        with open(combined_file, "w") as f:
            json.dump(results, f, indent=2)
        print(f"\n✓ Combined results saved to {combined_file}")

    print("\n" + "=" * 70)
    print("Multi-dataset benchmarks completed!")
    print("=" * 70)
|
||||
|
||||
|
||||
# Script entry point: run the multi-dataset benchmark driver.
if __name__ == "__main__":
    main()
|
||||
|
||||
306
scripts/security_scan.py
Normal file
306
scripts/security_scan.py
Normal file
@@ -0,0 +1,306 @@
|
||||
"""Security scanning script using Bandit and pip-audit.
|
||||
|
||||
This script runs security scans to identify vulnerabilities.
|
||||
Note: Requires bandit and pip-audit to be installed.
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def run_bandit(output_dir: Path) -> bool:
    """
    Run Bandit security scanner over the ``llmds`` package.

    Writes a JSON report and a text report into *output_dir*, prints a
    severity summary, and lists (up to 10) HIGH severity findings.

    Args:
        output_dir: Directory to save results

    Returns:
        True if the scan completed and found zero HIGH severity issues
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    json_output = output_dir / "bandit_report.json"
    txt_output = output_dir / "bandit_report.txt"

    print("Running Bandit security scanner...")
    print("=" * 80)

    try:
        # Run Bandit twice: once for machine-readable JSON, once for the
        # human-readable text report.  check=False because Bandit exits
        # non-zero when it finds issues, which is not an execution failure.
        for fmt, out_path in (("json", json_output), ("txt", txt_output)):
            subprocess.run(
                [
                    sys.executable, "-m", "bandit",
                    "-r", "llmds",
                    "-f", fmt,
                    "-o", str(out_path),
                ],
                capture_output=True,
                text=True,
                check=False,
            )

        # Parse results from the JSON report (absent if Bandit failed to run).
        if json_output.exists():
            with open(json_output) as f:
                bandit_data = json.load(f)

            # Bandit aggregates per-severity counts under metrics["_totals"].
            metrics = bandit_data.get("metrics", {})
            total = metrics.get("_totals", {})

            print("\nBandit Results:")
            print(f"  HIGH: {total.get('SEVERITY.HIGH', 0)} issues")
            print(f"  MEDIUM: {total.get('SEVERITY.MEDIUM', 0)} issues")
            print(f"  LOW: {total.get('SEVERITY.LOW', 0)} issues")
            print(f"  Total: {total.get('CONFIDENCE.HIGH', 0)} high confidence issues")

            # List high severity issues (first 10 only, to bound output size)
            high_severity = [
                issue for issue in bandit_data.get("results", [])
                if issue.get("issue_severity") == "HIGH"
            ]

            if high_severity:
                print(f"\n  HIGH Severity Issues ({len(high_severity)}):")
                for issue in high_severity[:10]:
                    print(f"    - {issue.get('test_id')}: {issue.get('test_name')}")
                    print(f"      File: {issue.get('filename')}:{issue.get('line_number')}")

            print(f"\n  Full report: {txt_output}")
            print(f"  JSON report: {json_output}")

            return total.get("SEVERITY.HIGH", 0) == 0
        else:
            print("  Warning: Bandit JSON output not found")
            return False

    except FileNotFoundError:
        print("  Error: Bandit not installed. Install with: pip install bandit[toml]")
        return False
    except Exception as e:
        print(f"  Error running Bandit: {e}")
        return False
|
||||
|
||||
|
||||
def run_pip_audit(output_dir: Path) -> bool:
    """
    Run pip-audit to check for known vulnerabilities in dependencies.

    Args:
        output_dir: Directory to save results

    Returns:
        True if no HIGH/CRITICAL vulnerabilities found
    """

    def _severity(vuln: dict) -> str:
        """Best-effort severity extraction from a vulnerability record.

        ``aliases`` may be absent, an empty list, or a list of plain ID
        strings rather than dicts — all of those yield "" instead of raising
        (the previous ``vuln.get("aliases", [{}])[0]`` crashed on an empty
        list, since the default only applies when the key is missing).
        """
        aliases = vuln.get("aliases") or [{}]
        first = aliases[0]
        return first.get("severity", "").upper() if isinstance(first, dict) else ""

    output_dir.mkdir(parents=True, exist_ok=True)
    json_output = output_dir / "pip_audit_report.json"
    txt_output = output_dir / "pip_audit_report.txt"

    print("\nRunning pip-audit security scanner...")
    print("=" * 80)

    result = None  # initialized so the except blocks can safely inspect it
    try:
        # Run pip-audit for the JSON report; non-zero exit just means
        # vulnerabilities were found, so check=False.
        result = subprocess.run(
            [
                sys.executable, "-m", "pip_audit",
                "--format", "json",
                "--output", str(json_output),
            ],
            capture_output=True,
            text=True,
            check=False,
        )

        # Also generate the human-readable text report.
        subprocess.run(
            [
                sys.executable, "-m", "pip_audit",
                "--format", "text",
                "--output", str(txt_output),
            ],
            capture_output=True,
            text=True,
            check=False,
        )

        # Parse results from the JSON report (absent if pip-audit failed).
        if json_output.exists():
            with open(json_output) as f:
                audit_data = json.load(f)

            # NOTE(review): newer pip-audit versions nest findings under
            # "dependencies"/"vulns" instead of a top-level "vulnerabilities"
            # key — confirm against the pinned pip-audit version.
            vulnerabilities = audit_data.get("vulnerabilities", [])
            high_critical = [
                v for v in vulnerabilities
                if _severity(v) in ("HIGH", "CRITICAL")
            ]

            print("\npip-audit Results:")
            print(f"  Total vulnerabilities: {len(vulnerabilities)}")
            print(f"  HIGH/CRITICAL: {len(high_critical)}")

            if high_critical:
                print("\n  HIGH/CRITICAL Vulnerabilities:")
                for vuln in high_critical[:10]:  # Show first 10
                    package = vuln.get("name", "unknown")
                    severity = _severity(vuln) or "UNKNOWN"
                    print(f"    - {package}: {severity}")
                    if "versions" in vuln:
                        print(f"      Affected versions: {vuln['versions']}")

            print(f"\n  Full report: {txt_output}")
            print(f"  JSON report: {json_output}")

            return len(high_critical) == 0
        else:
            print("  Warning: pip-audit JSON output not found")
            # Surface the scanner's own error output, if any
            if result.stderr:
                print(f"  Error output: {result.stderr}")
            return False

    except FileNotFoundError:
        print("  Error: pip-audit not installed. Install with: pip install pip-audit")
        return False
    except Exception as e:
        print(f"  Error running pip-audit: {e}")
        # result is still None if subprocess.run itself raised; guarding
        # avoids a NameError that would mask the original exception.
        if result is not None and result.stderr:
            print(f"  Error output: {result.stderr}")
        return False
|
||||
|
||||
|
||||
def generate_sbom(output_dir: Path) -> bool:
    """
    Generate Software Bill of Materials (SBOM) using pip-audit.

    Args:
        output_dir: Directory to save SBOM

    Returns:
        True if SBOM generated successfully
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    sbom_output = output_dir / "sbom.json"

    print("\nGenerating SBOM (Software Bill of Materials)...")
    print("=" * 80)

    try:
        # Best-effort SBOM via pip-audit's JSON output; success is judged by
        # whether the output file appears, not by the exit code (check=False).
        # Note: pip-audit may need additional flags for SBOM generation.
        subprocess.run(
            [
                sys.executable, "-m", "pip_audit",
                "--format", "json",
                "--output", str(sbom_output),
            ],
            capture_output=True,
            text=True,
            check=False,
        )

        if sbom_output.exists():
            print(f"  SBOM generated: {sbom_output}")
            print("  Note: For CycloneDX format, consider using cyclonedx-bom or pip-tools")
            return True
        else:
            print("  Warning: SBOM generation may require additional tools")
            print("  Consider using: cyclonedx-py or pip-tools for full SBOM")
            return False

    except Exception as e:
        print(f"  Error generating SBOM: {e}")
        return False
|
||||
|
||||
|
||||
def main():
    """Run all security scans."""
    import argparse

    parser = argparse.ArgumentParser(description="Run security scans")
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("audit/security"),
        help="Directory for security scan results (default: audit/security)",
    )
    # The three --skip-* switches share a shape; register them in a loop.
    for flag, text in (
        ("--skip-bandit", "Skip Bandit scan"),
        ("--skip-pip-audit", "Skip pip-audit scan"),
        ("--skip-sbom", "Skip SBOM generation"),
    ):
        parser.add_argument(flag, action="store_true", help=text)
    args = parser.parse_args()

    print("Security Scanning")
    print("=" * 80)
    print(f"Output directory: {args.output_dir}")
    print()

    results = {}

    # (result key, skip flag, message when skipped, scanner to run)
    scan_plan = [
        ("bandit", args.skip_bandit, "Skipping Bandit scan", run_bandit),
        ("pip_audit", args.skip_pip_audit, "Skipping pip-audit scan", run_pip_audit),
        ("sbom", args.skip_sbom, "Skipping SBOM generation", generate_sbom),
    ]
    for key, skipped, skip_msg, scanner in scan_plan:
        if skipped:
            print(skip_msg)
        else:
            results[key] = scanner(args.output_dir)

    # Summary of each tool's pass/fail status
    print("\n" + "=" * 80)
    print("Summary")
    print("=" * 80)

    for tool, ok in results.items():
        print(f"  {tool}: {'✓ PASSED' if ok else '✗ FAILED'}")

    if all(results.values()):
        print("\n✓ All security scans passed!")
        return 0
    print("\n✗ Some security issues found. Please review reports.")
    return 1
|
||||
|
||||
|
||||
# Script entry point: propagate main()'s status code to the shell.
if __name__ == "__main__":
    sys.exit(main())
|
||||
|
||||
Reference in New Issue
Block a user