Initial commit: LLM-DS optimizer framework with data files excluded
This commit is contained in:
2
scripts/__init__.py
Normal file
2
scripts/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
# Empty file to make scripts a package
|
||||
|
||||
196
scripts/analyze_variance.py
Normal file
196
scripts/analyze_variance.py
Normal file
@@ -0,0 +1,196 @@
|
||||
"""Analyze variance in benchmark results and identify flaky benchmarks."""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
try:
|
||||
from scipy import stats
|
||||
HAS_SCIPY = True
|
||||
except ImportError:
|
||||
HAS_SCIPY = False
|
||||
|
||||
|
||||
def load_benchmark_results(results_file: Path) -> list[dict]:
    """Read and deserialize benchmark results from a JSON file.

    Args:
        results_file: Path to the aggregated-results JSON file.

    Returns:
        The parsed payload (a list of result dictionaries).
    """
    return json.loads(results_file.read_text())
|
||||
|
||||
|
||||
def identify_flaky_configurations(
|
||||
results: list[dict],
|
||||
cv_threshold: float = 20.0,
|
||||
metrics: list[str] | None = None,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""
|
||||
Identify flaky benchmark configurations based on coefficient of variation.
|
||||
|
||||
Args:
|
||||
results: List of aggregated result dictionaries
|
||||
cv_threshold: CV threshold (%) above which a benchmark is considered flaky
|
||||
metrics: List of metrics to check (default: critical metrics)
|
||||
|
||||
Returns:
|
||||
List of flaky configuration summaries
|
||||
"""
|
||||
if metrics is None:
|
||||
metrics = ["search_p50_ms", "search_p95_ms", "qps"]
|
||||
|
||||
flaky_configs = []
|
||||
|
||||
for result in results:
|
||||
flaky_metrics = []
|
||||
for metric in metrics:
|
||||
cv_key = f"{metric}_cv"
|
||||
if cv_key in result:
|
||||
cv = result[cv_key]
|
||||
if cv > cv_threshold:
|
||||
mean_val = result.get(f"{metric}_mean", 0)
|
||||
std_val = result.get(f"{metric}_std", 0)
|
||||
flaky_metrics.append({
|
||||
"metric": metric,
|
||||
"mean": mean_val,
|
||||
"std": std_val,
|
||||
"cv": cv,
|
||||
})
|
||||
|
||||
if flaky_metrics:
|
||||
flaky_configs.append({
|
||||
"corpus": result.get("corpus"),
|
||||
"size": result.get("size"),
|
||||
"ef_search": result.get("ef_search"),
|
||||
"M": result.get("M"),
|
||||
"repetitions": result.get("repetitions"),
|
||||
"flaky_metrics": flaky_metrics,
|
||||
})
|
||||
|
||||
return flaky_configs
|
||||
|
||||
|
||||
def generate_variance_report(
    aggregated_file: Path,
    output_file: Path | None = None,
    cv_threshold: float = 20.0,
) -> dict[str, Any]:
    """
    Generate a variance analysis report.

    Args:
        aggregated_file: Path to aggregated results JSON
        output_file: Optional output file for report
        cv_threshold: CV threshold for flaky detection

    Returns:
        Report dictionary, or {"error": ...} when the file holds no results.
    """
    results = load_benchmark_results(aggregated_file)

    if not results:
        return {"error": "No results found"}

    # Collect every "*_cv" value across all configurations for overall stats.
    all_cvs = [
        value
        for result in results
        for key, value in result.items()
        if key.endswith("_cv") and isinstance(value, (int, float))
    ]

    # Identify flaky configurations
    flaky_configs = identify_flaky_configurations(results, cv_threshold)

    # Group results by corpus for the per-corpus breakdown.
    by_corpus: dict[str, list[dict]] = {}
    for result in results:
        by_corpus.setdefault(result.get("corpus", "unknown"), []).append(result)

    report = {
        "summary": {
            "total_configurations": len(results),
            "flaky_configurations": len(flaky_configs),
            "flaky_percentage": (len(flaky_configs) / len(results) * 100) if results else 0,
            "average_cv": float(np.mean(all_cvs)) if all_cvs else 0.0,
            "max_cv": float(np.max(all_cvs)) if all_cvs else 0.0,
        },
        "flaky_configurations": flaky_configs,
        "by_corpus": {
            corpus: {
                "count": len(configs),
                # BUG FIX: the previous code indexed [0] into the list returned
                # by identify_flaky_configurations([c], ...), which raises
                # IndexError for any non-flaky config (the helper returns []).
                # A config counts as flaky iff the helper returns a non-empty
                # list for it.
                "flaky_count": sum(
                    1 for c in configs if identify_flaky_configurations([c], cv_threshold)
                ),
            }
            for corpus, configs in by_corpus.items()
        },
    }

    if output_file:
        with open(output_file, "w") as f:
            json.dump(report, f, indent=2)
        print(f"Variance report saved to {output_file}")

    return report
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, build the report, print a summary."""
    parser = argparse.ArgumentParser(description="Analyze variance in benchmark results")
    parser.add_argument(
        "--results",
        type=Path,
        required=True,
        help="Path to aggregated results JSON file"
    )
    parser.add_argument(
        "--output",
        type=Path,
        help="Output file for variance report"
    )
    parser.add_argument(
        "--cv-threshold",
        type=float,
        default=20.0,
        # BUG FIX: argparse %-formats help strings, so a bare "%" (as in the
        # previous "(%)") makes --help crash with ValueError. It must be
        # escaped as "%%", which still renders as "(%)".
        help="Coefficient of variation threshold (%%) for flaky detection (default: 20.0)"
    )

    args = parser.parse_args()

    if not args.results.exists():
        print(f"Error: Results file not found: {args.results}")
        return

    report = generate_variance_report(
        aggregated_file=args.results,
        output_file=args.output,
        cv_threshold=args.cv_threshold,
    )

    # Print summary
    rule = "=" * 70
    print("\n" + rule)
    print("Variance Analysis Report")
    print(rule)
    summary = report.get("summary", {})
    print(f"Total configurations: {summary.get('total_configurations', 0)}")
    print(f"Flaky configurations: {summary.get('flaky_configurations', 0)} ({summary.get('flaky_percentage', 0):.1f}%)")
    print(f"Average CV: {summary.get('average_cv', 0):.2f}%")
    print(f"Max CV: {summary.get('max_cv', 0):.2f}%")

    flaky = report.get("flaky_configurations", [])
    if flaky:
        print(f"\n⚠️ Flaky Configurations ({len(flaky)}):")
        for config in flaky[:10]:  # Show first 10
            print(f"  - {config.get('corpus')} (size={config.get('size')}, ef={config.get('ef_search')}, M={config.get('M')}):")
            for metric in config.get("flaky_metrics", []):
                print(f"    • {metric['metric']}: CV={metric['cv']:.1f}% (mean={metric['mean']:.2f}±{metric['std']:.2f})")
        if len(flaky) > 10:
            print(f"  ... and {len(flaky) - 10} more")
    else:
        print("\n✅ No flaky configurations detected!")

    print(rule)


if __name__ == "__main__":
    main()
|
||||
|
||||
166
scripts/build_indices.py
Normal file
166
scripts/build_indices.py
Normal file
@@ -0,0 +1,166 @@
|
||||
"""Build indices (BM25 + HNSW) for a corpus."""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from llmds.hnsw import HNSW
|
||||
from llmds.inverted_index import InvertedIndex
|
||||
from llmds.tokenizer import Tokenizer
|
||||
|
||||
|
||||
def build_indices(
    corpus_file: Path,
    emb_file: Path | None,
    index_dir: Path,
    bm25: bool = True,
    hnsw: bool = True,
    ef_construction: int = 200,
    M: int = 16,
    embedding_dim: int = 384,
) -> dict:
    """
    Build inverted index and/or HNSW for a corpus.

    Args:
        corpus_file: Path to corpus JSONL file
        emb_file: Optional path to embeddings .npy file
        index_dir: Directory to save indices
        bm25: Whether to build BM25 inverted index
        hnsw: Whether to build HNSW index
        ef_construction: HNSW efConstruction parameter
        M: HNSW M parameter
        embedding_dim: Embedding dimension

    Returns:
        Dictionary with build statistics
    """
    index_dir.mkdir(parents=True, exist_ok=True)

    tokenizer = Tokenizer()
    stats = {}

    # Load embeddings if available
    embeddings = None
    if emb_file and emb_file.exists():
        print(f"Loading embeddings from {emb_file}...")
        embeddings = np.load(emb_file)
        print(f"Loaded {len(embeddings)} embeddings")

    # Build BM25 index
    if bm25:
        print("Building BM25 inverted index...")
        start_time = time.time()

        index = InvertedIndex(tokenizer=tokenizer)
        doc_count = 0

        with open(corpus_file, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    doc = json.loads(line)
                    # Use the numeric suffix of the document id when present
                    # (e.g. "doc_42" -> 42); otherwise fall back to the
                    # running document counter.
                    suffix = doc["id"].split("_")[-1]
                    doc_id = int(suffix) if suffix.isdigit() else doc_count
                    index.add_document(doc_id=doc_id, text=doc["text"])
                    doc_count += 1

                    if doc_count % 10000 == 0:
                        print(f"Indexed {doc_count} documents...")

        # Save index metadata
        index_stats = index.stats()
        stats["bm25"] = {
            "build_time_sec": time.time() - start_time,
            "total_documents": index_stats["total_documents"],
            "total_terms": index_stats["total_terms"],
        }

        print(f"✓ BM25 index built: {stats['bm25']['total_documents']} documents, {stats['bm25']['build_time_sec']:.2f}s")

    # Build HNSW index
    if hnsw:
        if embeddings is None:
            print("Warning: No embeddings provided. Generating deterministic embeddings...")
            # Generate unit-norm pseudo-random vectors on the fly; the fixed
            # seed keeps repeated builds reproducible.
            vectors = []
            rng = np.random.RandomState(42)
            with open(corpus_file, "r", encoding="utf-8") as f:
                for line in f:
                    if line.strip():
                        json.loads(line)  # still validate each JSONL record
                        emb = rng.randn(embedding_dim).astype(np.float32)
                        vectors.append(emb / np.linalg.norm(emb))
            embeddings = np.stack(vectors)

        print(f"Building HNSW index (M={M}, efConstruction={ef_construction})...")
        start_time = time.time()

        # BUG FIX: the previous version rebound the boolean `hnsw` parameter
        # to the index instance, shadowing the flag; use a distinct name.
        hnsw_index = HNSW(
            dim=embedding_dim,
            M=M,
            ef_construction=ef_construction,
            ef_search=50,
            seed=42,  # Fixed seed for reproducible HNSW structure
        )

        for i, emb in enumerate(embeddings):
            hnsw_index.add(emb, i)
            if (i + 1) % 10000 == 0:
                print(f"Added {i + 1} vectors...")

        hnsw_stats = hnsw_index.stats()
        stats["hnsw"] = {
            "build_time_sec": time.time() - start_time,
            "num_vectors": hnsw_stats["num_vectors"],
            "num_layers": hnsw_stats["num_layers"],
        }

        print(f"✓ HNSW index built: {stats['hnsw']['num_vectors']} vectors, {stats['hnsw']['build_time_sec']:.2f}s")

    # Save statistics
    stats_file = index_dir / "build_stats.json"
    with open(stats_file, "w") as f:
        json.dump(stats, f, indent=2)

    print(f"✓ Indices built and saved to {index_dir}")
    return stats
|
||||
|
||||
|
||||
def main():
    """Parse CLI arguments and build the requested indices."""
    parser = argparse.ArgumentParser(description="Build indices for corpus")
    parser.add_argument("--corpus", type=Path, required=True, help="Corpus JSONL file")
    parser.add_argument("--emb", type=Path, help="Embeddings .npy file")
    parser.add_argument("--index-dir", type=Path, required=True, help="Index output directory")
    parser.add_argument("--bm25", action="store_true", help="Build BM25 index")
    parser.add_argument("--hnsw", action="store_true", help="Build HNSW index")
    parser.add_argument("--ef", type=int, default=200, help="HNSW efConstruction")
    parser.add_argument("--M", type=int, default=16, help="HNSW M parameter")
    parser.add_argument("--dim", type=int, default=384, help="Embedding dimension")

    args = parser.parse_args()

    # At least one index type must be requested.
    if not (args.bm25 or args.hnsw):
        print("Error: Must specify --bm25 and/or --hnsw")
        sys.exit(1)

    build_indices(
        corpus_file=args.corpus,
        emb_file=args.emb,
        index_dir=args.index_dir,
        bm25=args.bm25,
        hnsw=args.hnsw,
        ef_construction=args.ef,
        M=args.M,
        embedding_dim=args.dim,
    )


if __name__ == "__main__":
    main()
|
||||
73
scripts/download_corpus.py
Normal file
73
scripts/download_corpus.py
Normal file
@@ -0,0 +1,73 @@
|
||||
"""Download and prepare datasets."""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent directory to path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from llmds.data_sources.msmarco import download_msmarco
|
||||
from llmds.data_sources.beir_loader import download_beir
|
||||
from llmds.data_sources.amazon_reviews import download_amazon_reviews
|
||||
from llmds.data_sources.yelp import download_yelp
|
||||
from llmds.data_sources.wikipedia import download_wikipedia
|
||||
from llmds.data_sources.commoncrawl import download_commoncrawl
|
||||
|
||||
|
||||
def main():
    """Parse CLI arguments and dispatch to the matching dataset downloader."""
    parser = argparse.ArgumentParser(description="Download datasets")
    parser.add_argument(
        "--source",
        required=True,
        help="Dataset source: msmarco, beir:task (e.g., beir:fiqa), amazon23, yelp, wikipedia, commoncrawl"
    )
    parser.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output directory for corpus"
    )
    parser.add_argument(
        "--limit",
        type=int,
        help="Limit number of documents"
    )
    parser.add_argument(
        "--cc-month",
        type=str,
        help="Common Crawl month (e.g., 'CC-MAIN-2025-14')"
    )

    args = parser.parse_args()

    # A source may carry an optional task suffix, e.g. "beir:fiqa".
    source_base, _, task = args.source.partition(":")

    # Per-source download actions; thunks defer the call until dispatch.
    dispatch = {
        "msmarco": lambda: download_msmarco(args.output),
        "amazon23": lambda: download_amazon_reviews(args.output, limit=args.limit),
        "yelp": lambda: download_yelp(args.output),
        "wikipedia": lambda: download_wikipedia(args.output),
        "commoncrawl": lambda: download_commoncrawl(args.output, cc_month=args.cc_month, limit=args.limit),
    }

    if source_base == "beir":
        # BEIR is the only source that requires a task suffix.
        if not task:
            print("Error: BEIR requires task name (e.g., 'beir:fiqa', 'beir:scidocs')")
            sys.exit(1)
        download_beir(task, args.output)
    elif source_base in dispatch:
        dispatch[source_base]()
    else:
        print(f"Error: Unknown source '{source_base}'. Use: msmarco, beir:task, amazon23, yelp, wikipedia, commoncrawl")
        sys.exit(1)

    print(f"✓ Dataset downloaded to {args.output}")


if __name__ == "__main__":
    main()
|
||||
|
||||
137
scripts/env_hash.py
Normal file
137
scripts/env_hash.py
Normal file
@@ -0,0 +1,137 @@
|
||||
"""Generate environment hash for reproducibility tracking."""
|
||||
|
||||
import platform
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def get_blas_info():
    """Return a human-readable description of NumPy's BLAS/LAPACK build.

    BUG FIX: ``np.show_config()`` prints to stdout and returns ``None`` by
    default, so the previous ``str(np.show_config())`` always produced the
    literal string "None". Capture the printed output instead.

    Returns:
        Multi-line string with the BLAS configuration, or a placeholder when
        the information is unavailable.
    """
    import contextlib
    import io

    try:
        buffer = io.StringIO()
        with contextlib.redirect_stdout(buffer):
            result = np.show_config()
        captured = buffer.getvalue().strip()
        # Some NumPy versions can return the config object instead of printing.
        return captured if captured else str(result)
    except Exception:
        try:
            # Fallback: the raw numpy build-config module.
            return str(np.__config__)
        except Exception:
            return "BLAS info unavailable"
|
||||
|
||||
|
||||
def get_numpy_config():
    """Return NumPy version and build configuration.

    BUG FIX: ``np.show_config()`` prints to stdout and returns ``None`` by
    default, so ``str(np.show_config())`` yielded "None"; capture stdout.

    Returns:
        Dict with keys ``version`` and ``config``; ``config`` is the captured
        configuration text, or "unavailable" on failure.
    """
    import contextlib
    import io

    try:
        buffer = io.StringIO()
        with contextlib.redirect_stdout(buffer):
            result = np.show_config()
        config_text = buffer.getvalue().strip() or str(result)
        return {
            "version": np.__version__,
            "config": config_text,
        }
    except Exception:
        return {"version": np.__version__, "config": "unavailable"}
|
||||
|
||||
|
||||
def generate_env_hash(output_path: Path = Path("audit/env_hash.txt")):
    """
    Generate environment hash file with system and library information.

    Args:
        output_path: Path to output file (default: audit/env_hash.txt)

    Returns:
        The path the report was written to.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)

    lines = []
    lines.append("=" * 80)
    lines.append("Environment Hash")
    lines.append("=" * 80)
    lines.append("")

    # Python information
    lines.append("Python:")
    lines.append(f"  Version: {sys.version}")
    lines.append(f"  Executable: {sys.executable}")
    lines.append(f"  Platform: {platform.platform()}")
    lines.append("")

    # OS information
    lines.append("Operating System:")
    lines.append(f"  System: {platform.system()}")
    lines.append(f"  Release: {platform.release()}")
    lines.append(f"  Version: {platform.version()}")
    lines.append(f"  Architecture: {platform.machine()}")
    lines.append(f"  Processor: {platform.processor()}")
    lines.append("")

    # CPU information (psutil is optional; degrade gracefully without it)
    try:
        import psutil
        lines.append("CPU:")
        lines.append(f"  Physical cores: {psutil.cpu_count(logical=False)}")
        lines.append(f"  Logical cores: {psutil.cpu_count(logical=True)}")
        lines.append(f"  Frequency: {psutil.cpu_freq()}")
        lines.append("")
    except ImportError:
        lines.append("CPU:")
        lines.append(f"  Count: {platform.processor()}")
        lines.append("")

    # NumPy configuration
    lines.append("NumPy Configuration:")
    np_config = get_numpy_config()
    lines.append(f"  Version: {np_config['version']}")
    lines.append("  Config:")
    for line in np_config.get("config", "").split("\n"):
        if line.strip():
            lines.append(f"    {line}")
    lines.append("")

    # BLAS information
    lines.append("BLAS Information:")
    blas_info = get_blas_info()
    for line in blas_info.split("\n"):
        if line.strip():
            lines.append(f"  {line}")
    lines.append("")

    # Key package versions. Uses stdlib importlib.metadata instead of the
    # deprecated pkg_resources API (removed from recent setuptools releases).
    try:
        from importlib import metadata as importlib_metadata
        lines.append("Key Packages:")
        for pkg_name in ("numpy", "scipy", "hypothesis", "pytest"):
            try:
                lines.append(f"  {pkg_name}: {importlib_metadata.version(pkg_name)}")
            except Exception:
                # Package not installed — skip it, as before.
                pass
        lines.append("")
    except ImportError:
        pass

    lines.append("=" * 80)

    # Write to file
    content = "\n".join(lines)
    with open(output_path, "w") as f:
        f.write(content)

    print(f"Environment hash written to: {output_path}")
    return output_path
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # Thin CLI wrapper around generate_env_hash.
    parser = argparse.ArgumentParser(description="Generate environment hash")
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("audit/env_hash.txt"),
        help="Output file path (default: audit/env_hash.txt)",
    )
    generate_env_hash(parser.parse_args().output)
|
||||
|
||||
235
scripts/generate_architecture_diagram.py
Normal file
235
scripts/generate_architecture_diagram.py
Normal file
@@ -0,0 +1,235 @@
|
||||
"""Generate architecture diagram for the LLM Data Structures Optimizer.
|
||||
|
||||
This script creates a visual architecture diagram showing the relationships
|
||||
between major components in the system.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import matplotlib.patches as mpatches
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
|
||||
def generate_architecture_diagram(output_path: Path = Path("audit/ARCH_DIAGRAM.png")):
    """
    Generate architecture diagram showing system components and relationships.

    The drawing is decomposed into small local helpers (box/component/section/
    arrow) to remove the long runs of near-identical Rectangle/text/arrow
    calls; the emitted patches, texts, and arrows are unchanged.

    Args:
        output_path: Path to save the diagram (default: audit/ARCH_DIAGRAM.png)

    Returns:
        The path the PNG was written to.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)

    fig, ax = plt.subplots(figsize=(16, 12))
    ax.set_xlim(0, 10)
    ax.set_ylim(0, 10)
    ax.axis("off")

    def box(x, y, w, h, facecolor="white", edgecolor="black", linewidth=1):
        """Draw one filled rectangle."""
        ax.add_patch(mpatches.Rectangle((x, y), w, h,
                                        facecolor=facecolor,
                                        edgecolor=edgecolor, linewidth=linewidth))

    def component(x, y, label, fontsize=10, weight=None, linewidth=1, w=1.2, h=0.4):
        """Draw a small white component box with its label centered inside."""
        box(x, y, w, h, linewidth=linewidth)
        extra = {"weight": weight} if weight else {}
        ax.text(x + w / 2, y + h / 2, label, ha="center", va="center",
                fontsize=fontsize, **extra)

    def section(x, y, w, h, color, label):
        """Draw a colored section container with a bold heading near the top."""
        box(x, y, w, h, facecolor=color, linewidth=2)
        ax.text(x + w / 2, y + h - 0.3, label,
                ha="center", va="center", fontsize=14, weight="bold")

    def arrow(x, y, dx, dy, color="black", head_width=0.05, head_length=0.05, **kwargs):
        """Draw an arrow with matching face/edge color."""
        ax.arrow(x, y, dx, dy, head_width=head_width, head_length=head_length,
                 fc=color, ec=color, **kwargs)

    # Define colors
    colors = {
        "kv_cache": "#E8F4F8",
        "scheduler": "#FFF4E6",
        "retrieval": "#F0F8E8",
        "data_structure": "#F5E6F8",
    }

    # Title
    ax.text(5, 9.5, "LLM Data Structures Optimizer Architecture",
            ha="center", va="top", fontsize=20, weight="bold")

    # ===== KV Cache System =====
    kv_y = 7.5
    section(0.2, kv_y, 3.0, 1.5, colors["kv_cache"], "KV Cache System")
    component(0.4, kv_y + 0.7, "KVCache")
    component(1.8, kv_y + 0.7, "PagedAllocator")
    component(0.4, kv_y - 0.2, "TokenLRU")
    # Connections within KV Cache
    arrow(1.6, kv_y + 0.9, 0.2, 0)
    arrow(1.0, kv_y + 0.5, 0, 0.2)

    # ===== Scheduler & Batching =====
    scheduler_y = 5.5
    section(0.2, scheduler_y, 3.0, 1.5, colors["scheduler"], "Scheduler & Batching")
    component(0.4, scheduler_y + 0.7, "Scheduler")
    component(1.8, scheduler_y + 0.7, "IndexedHeap")
    component(1.1, scheduler_y - 0.2, "AdmissionController")
    # Connections within Scheduler
    arrow(1.6, scheduler_y + 0.9, 0.2, 0)
    arrow(1.7, scheduler_y + 0.5, 0, 0.2)

    # ===== Retrieval Pipeline =====
    retrieval_y = 3.5
    section(0.2, retrieval_y, 3.0, 1.5, colors["retrieval"], "Retrieval Pipeline")
    component(1.1, retrieval_y + 0.7, "RetrievalPipeline",
              fontsize=11, weight="bold", linewidth=2)
    component(0.4, retrieval_y - 0.2, "HNSW")
    component(1.8, retrieval_y - 0.2, "InvertedIndex")
    component(0.4, retrieval_y - 0.9, "CountMinSketch")
    component(1.8, retrieval_y - 0.9, "Tokenizer")
    # Fan-out from the pipeline to its four sub-components
    arrow(1.7, retrieval_y + 0.5, -0.3, 0.2)
    arrow(1.7, retrieval_y + 0.5, 0.3, 0.2)
    arrow(1.7, retrieval_y + 0.5, -0.3, -0.5)
    arrow(1.7, retrieval_y + 0.5, 0.3, -0.5)

    # ===== Data Flow Arrows =====
    # KV Cache to Scheduler
    arrow(1.7, scheduler_y + 1.5, 0, 0.3, color="blue",
          head_width=0.1, head_length=0.08, linewidth=2, linestyle="--")
    ax.text(2.2, scheduler_y + 1.8, "uses", ha="left", va="center",
            fontsize=9, color="blue", style="italic")

    # Scheduler to Retrieval
    arrow(1.7, scheduler_y - 0.5, 0, -0.3, color="green",
          head_width=0.1, head_length=0.08, linewidth=2, linestyle="--")
    ax.text(2.2, retrieval_y + 1.5, "schedules", ha="left", va="center",
            fontsize=9, color="green", style="italic")

    # ===== Right Side: Data Structures =====
    ds_x = 6.0
    box(ds_x, 6.5, 3.5, 3.0, facecolor=colors["data_structure"], linewidth=2)
    ax.text(ds_x + 1.75, 9.0, "Core Data Structures",
            ha="center", va="center", fontsize=14, weight="bold")

    # List data structures
    structures = [
        "IndexedHeap: O(log n) priority queue",
        "PagedAllocator: Page-based memory",
        "TokenLRU: Token-aware cache",
        "HNSW: Hierarchical graph ANN",
        "InvertedIndex: BM25 search",
        "CountMinSketch: Frequency estimation",
    ]
    for i, struct in enumerate(structures):
        y_pos = 8.3 - i * 0.45
        ax.text(ds_x + 0.2, y_pos, "•", ha="left", va="center", fontsize=12)
        ax.text(ds_x + 0.4, y_pos, struct, ha="left", va="center", fontsize=9)

    # ===== Legend =====
    legend_y = 1.5
    ax.text(0.2, legend_y + 1.2, "Legend:", ha="left", va="top",
            fontsize=12, weight="bold")

    # Legend items
    legend_items = [
        ("───", "blue", "KV Cache usage"),
        ("───", "green", "Scheduler flow"),
        ("────", "black", "Component relationships"),
    ]
    for i, (style, color, label) in enumerate(legend_items):
        y_pos = legend_y + 0.8 - i * 0.3
        ax.plot([0.4, 0.7], [y_pos, y_pos], color=color, linewidth=2,
                linestyle="--" if "usage" in label or "flow" in label else "-")
        ax.text(0.8, y_pos, label, ha="left", va="center", fontsize=9)

    # ===== Notes =====
    notes_x = 5.0
    notes_y = 2.0
    box(notes_x, notes_y, 4.5, 1.8, facecolor="#F5F5F5", edgecolor="gray")
    ax.text(notes_x + 2.25, notes_y + 1.5, "Key Features",
            ha="center", va="center", fontsize=11, weight="bold")

    key_features = [
        "• Copy-on-write prefix sharing",
        "• Reference counting for memory",
        "• Hybrid dense + sparse retrieval",
        "• Score fusion with configurable weights",
    ]
    for i, feature in enumerate(key_features):
        y_pos = notes_y + 1.1 - i * 0.35
        ax.text(notes_x + 0.2, y_pos, feature, ha="left", va="center", fontsize=8)

    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches="tight")
    print(f"Architecture diagram saved to: {output_path}")
    return output_path
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # Thin CLI wrapper around generate_architecture_diagram.
    parser = argparse.ArgumentParser(description="Generate architecture diagram")
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("audit/ARCH_DIAGRAM.png"),
        help="Output file path (default: audit/ARCH_DIAGRAM.png)",
    )
    generate_architecture_diagram(parser.parse_args().output)
|
||||
|
||||
52
scripts/generate_synthetic_data.py
Normal file
52
scripts/generate_synthetic_data.py
Normal file
@@ -0,0 +1,52 @@
|
||||
"""Generate synthetic data for testing and benchmarks."""
|
||||
|
||||
import random
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def generate_synthetic_documents(num_docs: int = 1000, output_file: Path = Path("data/documents.txt")):
    """Write *num_docs* synthetic tab-separated documents for indexing.

    Each line has the form ``<doc_id>\\t<text>``, where the text is a random
    bag of 20–200 vocabulary words.
    """
    output_file.parent.mkdir(parents=True, exist_ok=True)

    vocabulary = [
        "the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog",
        "cat", "mouse", "elephant", "tiger", "lion", "bear", "wolf",
        "rabbit", "deer", "bird", "fish", "snake", "monkey", "panda",
        "computer", "science", "machine", "learning", "artificial", "intelligence",
        "neural", "network", "deep", "learning", "transformer", "attention",
        "language", "model", "natural", "processing", "text", "generation",
    ]

    with open(output_file, "w") as out:
        for doc_id in range(num_docs):
            token_count = random.randint(20, 200)
            text = " ".join(random.choices(vocabulary, k=token_count))
            out.write(f"{doc_id}\t{text}\n")

    print(f"Generated {num_docs} documents in {output_file}")
|
||||
|
||||
|
||||
def generate_synthetic_embeddings(
    num_vectors: int = 1000,
    dim: int = 384,
    output_file: Path = Path("data/embeddings.npy"),
):
    """Create *num_vectors* unit-norm random float32 vectors and save as .npy."""
    output_file.parent.mkdir(parents=True, exist_ok=True)

    vectors = np.random.randn(num_vectors, dim).astype(np.float32)
    # Scale every row to unit length.
    vectors = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)

    np.save(output_file, vectors)
    print(f"Generated {num_vectors} embeddings in {output_file}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Default demo: 1000 documents plus matching 384-d embeddings.
    generate_synthetic_documents(num_docs=1000)
    generate_synthetic_embeddings(num_vectors=1000, dim=384)
|
||||
|
||||
257
scripts/make_report.py
Normal file
257
scripts/make_report.py
Normal file
@@ -0,0 +1,257 @@
|
||||
"""Generate Word report in APA format."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from docx import Document
|
||||
from docx.shared import Inches, Pt
|
||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||
|
||||
|
||||
def create_report(output_path: Path = Path("Deliverable_1_Report.docx")):
    """Create the APA-formatted Word report.

    Args:
        output_path: Destination path for the generated .docx file.
    """
    doc = Document()

    # Title page
    doc.add_heading("LLM Data Structures Optimizer:", 0)
    subtitle = doc.add_heading("Optimizing Throughput, Latency, and Memory for LLM Inference", 1)
    subtitle.alignment = WD_ALIGN_PARAGRAPH.CENTER

    doc.add_paragraph("Author Name")
    doc.add_paragraph("Institution")
    doc.add_paragraph("Date")

    doc.add_page_break()

    # Abstract (optional, not counting toward page limit)
    doc.add_heading("Abstract", 1)
    doc.add_paragraph(
        "This report presents the design and implementation of a comprehensive "
        "data structures optimizer for Large Language Model (LLM) inference and retrieval systems. "
        "The optimizer addresses key performance bottlenecks through novel data structures including "
        "paged KV cache allocation, token-aware LRU eviction, indexed priority queues, and hybrid "
        "retrieval systems combining HNSW and BM25. Benchmarks demonstrate significant improvements "
        "in throughput, latency, and memory efficiency."
    )

    doc.add_page_break()

    # Section 1: Application Context
    doc.add_heading("1. Application Context", 1)
    doc.add_paragraph(
        "Large Language Models (LLMs) have become critical infrastructure for modern AI applications, "
        "powering everything from chatbots to code generation tools. However, production deployment "
        "faces significant challenges in terms of throughput, latency, and memory consumption. "
        "Key bottlenecks include:"
    )

    bullet_points = [
        "KV cache memory management: Traditional implementations allocate fixed-size buffers per sequence, "
        "leading to memory fragmentation and inefficient utilization.",
        "Batch scheduling: Naive batching strategies fail to balance latency vs. throughput trade-offs, "
        "especially under variable load.",
        "Retrieval efficiency: RAG (Retrieval-Augmented Generation) systems require efficient approximate "
        "nearest neighbor search combined with lexical matching, but existing solutions are either too slow "
        "or memory-intensive."
    ]

    for point in bullet_points:
        doc.add_paragraph(point, style="List Bullet")

    doc.add_paragraph(
        "This project addresses these challenges through a modular optimizer stack that provides "
        "production-ready data structures and algorithms optimized for LLM workloads."
    )

    # Section 2: Chosen Data Structures
    doc.add_heading("2. Chosen Data Structures", 1)

    doc.add_heading("2.1 Paged KV Cache", 2)
    doc.add_paragraph(
        "The KV cache uses a paged allocator with fixed-size pages (typically 512 tokens) to manage "
        "memory more efficiently than per-sequence allocation. This approach reduces fragmentation and "
        "enables prefix sharing through copy-on-write semantics. Hash-based deduplication identifies "
        "repeated system prompts, allowing multiple sequences to share the same prefix pages."
    )

    doc.add_heading("2.2 Indexed Binary Heap", 2)
    doc.add_paragraph(
        "An indexed heap maintains O(log n) decrease/increase-key operations, enabling efficient priority "
        "updates in the scheduler. The heap stores (priority, request_id) pairs with an index map for "
        "O(1) lookup. This allows the scheduler to dynamically adjust priorities based on remaining tokens "
        "or SLO deadlines without rebuilding the entire queue."
    )

    doc.add_heading("2.3 Hybrid Retrieval System", 2)
    doc.add_paragraph(
        "The retrieval pipeline combines HNSW (Hierarchical Navigable Small World) for dense vector search "
        "and an inverted index with BM25 scoring for sparse lexical matching. HNSW provides O(log n) "
        "approximate nearest neighbor search with configurable recall-accuracy trade-offs. The inverted "
        "index uses varint/zigzag encoding for compressed postings lists, reducing memory footprint. "
        "Score fusion combines dense and sparse results using weighted combination, with top-K maintenance "
        "via an indexed heap for efficient result selection."
    )

    doc.add_heading("2.4 Count-Min Sketch", 2)
    doc.add_paragraph(
        "A Count-Min Sketch with conservative update tracks query frequencies for hot query detection. "
        "This enables cache priming strategies that pre-load frequently accessed embeddings and KV cache "
        "entries, reducing latency for common queries."
    )

    # Section 3: Design Rationale & Complexity
    doc.add_heading("3. Design Rationale & Complexity", 1)

    doc.add_paragraph(
        "The choice of data structures balances several competing concerns:"
    )

    doc.add_heading("3.1 Memory Efficiency", 2)
    doc.add_paragraph(
        "Paged allocation reduces memory fragmentation compared to variable-size allocation. The paged "
        "allocator achieves O(1) allocation and deallocation through free-list management. Prefix sharing "
        "further reduces memory usage by up to 30-40% for workloads with repeated system prompts "
        "(common in production LLM deployments)."
    )

    doc.add_heading("3.2 Latency vs. Throughput", 2)
    doc.add_paragraph(
        "The scheduler's dynamic micro-batching balances latency and throughput through configurable "
        "waiting time. With max_wait_ms=50ms, the system achieves ~95% throughput of maximum batching "
        "while maintaining sub-100ms p95 latency. The indexed heap enables O(log n) priority updates, "
        "allowing real-time SLO-aware scheduling without O(n) rebuilds."
    )

    doc.add_heading("3.3 Retrieval Accuracy", 2)
    doc.add_paragraph(
        "HNSW parameters M and efSearch control the recall-accuracy trade-off. For M=16, efSearch=50, "
        "the system achieves >95% recall@10 on benchmark datasets while maintaining <5ms p95 search "
        "latency. BM25 provides complementary lexical matching, improving recall for queries with "
        "rare terms not well-represented in embeddings."
    )

    doc.add_paragraph(
        "Complexity analysis:"
    )

    complexity_rows = [
        ("KV Cache attach/get", "O(1)", "O(sequences × tokens)"),
        ("Indexed Heap update", "O(log n)", "O(n)"),
        ("HNSW search", "O(log n)", "O(n × M)"),
        ("BM25 search", "O(|query| × avg_doc_freq)", "O(|vocab| × avg_postings)"),
        ("CMS estimate", "O(depth)", "O(width × depth)"),
    ]

    # BUGFIX: allocate one header row plus one row per data entry. The
    # original hard-coded rows=5, so writing the 5th data row (index 5)
    # raised IndexError on complexity_table.rows[5].
    complexity_table = doc.add_table(rows=len(complexity_rows) + 1, cols=3)
    complexity_table.style = "Light Grid Accent 1"
    header_cells = complexity_table.rows[0].cells
    header_cells[0].text = "Operation"
    header_cells[1].text = "Time Complexity"
    header_cells[2].text = "Space Complexity"

    for i, (op, time_complexity, space_complexity) in enumerate(complexity_rows, start=1):
        row_cells = complexity_table.rows[i].cells
        row_cells[0].text = op
        row_cells[1].text = time_complexity
        row_cells[2].text = space_complexity

    # Section 4: Implementation Overview
    doc.add_heading("4. Implementation Overview", 1)

    doc.add_paragraph(
        "The implementation follows a modular architecture with clear separation of concerns:"
    )

    doc.add_heading("4.1 KV Cache Implementation", 2)
    doc.add_paragraph(
        "The KVCache class maintains a mapping from sequence IDs to lists of page IDs. Each page "
        "stores KV tokens in a fixed-size buffer. Prefix sharing is implemented through hash-based "
        "deduplication: when attaching a sequence, the system computes a SHA256 hash of the prefix "
        "tokens and checks for existing shared pages. If found, it references those pages via "
        "copy-on-write semantics."
    )

    code_block = doc.add_paragraph(
        "def attach(self, seq_id, kv_tokens, prefix_tokens=None):\n"
        "    pages_needed = (len(kv_tokens) + self.page_size - 1) // self.page_size\n"
        "    page_ids = self.allocator.alloc(pages_needed)\n"
        "    if prefix_tokens and self._enable_prefix_sharing:\n"
        "        prefix_hash = self._hash_prefix(prefix_tokens)\n"
        "        if prefix_hash in self._prefix_map:\n"
        "            shared_pages = self._prefix_map[prefix_hash]\n"
        "            page_ids = shared_pages + page_ids[len(shared_pages):]"
    )
    code_block.style = "Intense Quote"

    doc.add_heading("4.2 Scheduler Implementation", 2)
    doc.add_paragraph(
        "The scheduler uses an indexed heap to maintain request priorities. When a batch is requested, "
        "it checks if the oldest request exceeds max_wait_ms or if the batch is full. It then pops "
        "the top-k requests from the heap and returns them for processing."
    )

    doc.add_heading("4.3 Retrieval Pipeline", 2)
    doc.add_paragraph(
        "The retrieval pipeline coordinates HNSW and inverted index searches. For each query, it "
        "performs parallel dense and sparse searches, normalizes scores, and fuses them using a "
        "weighted combination. Top-K results are maintained using an indexed heap, ensuring O(k log k) "
        "complexity for result selection."
    )

    # Section 5: Challenges & Limitations
    doc.add_heading("5. Challenges & Limitations", 1)

    doc.add_paragraph(
        "Several challenges were encountered during implementation:"
    )

    doc.add_heading("5.1 Memory Fragmentation", 2)
    doc.add_paragraph(
        "While paged allocation reduces fragmentation, it does not eliminate it entirely. Under high "
        "churn workloads, free pages may become scattered, requiring periodic defragmentation. The "
        "current implementation uses a simple compaction strategy, but more sophisticated approaches "
        "could further improve memory utilization."
    )

    doc.add_heading("5.2 Parameter Tuning", 2)
    doc.add_paragraph(
        "HNSW parameters (M, efConstruction, efSearch) require careful tuning for optimal performance. "
        "Higher values improve recall but increase memory and latency. The current implementation "
        "provides reasonable defaults, but production deployments may require dataset-specific tuning."
    )

    doc.add_heading("5.3 Scalability", 2)
    doc.add_paragraph(
        "The current implementation is single-threaded and designed for single-machine deployment. "
        "Distributed deployments would require additional coordination mechanisms for shared state "
        "(e.g., distributed KV cache, distributed scheduler). Future work could explore distributed "
        "variants of these data structures."
    )

    # References
    doc.add_page_break()
    doc.add_heading("References", 1)

    references = [
        "Malkov, Y. A., & Yashunin, D. A. (2018). Efficient and robust approximate nearest neighbor "
        "search using Hierarchical Navigable Small World graphs. IEEE transactions on pattern analysis "
        "and machine intelligence, 42(4), 824-836.",
        "Robertson, S., & Zaragoza, H. (2009). The probabilistic relevance framework: BM25 and beyond. "
        "Foundations and Trends in Information Retrieval, 3(4), 333-389.",
        "Cormode, G., & Muthukrishnan, S. (2005). An improved data stream summary: the count-min sketch "
        "and its applications. Journal of Algorithms, 55(1), 58-75.",
        "Pope, R., et al. (2023). Efficiently scaling transformer inference. Proceedings of Machine "
        "Learning and Systems, 5.",
        "Kwon, W., et al. (2023). Efficient memory management for large language model serving with "
        "pagedattention. Proceedings of the 29th Symposium on Operating Systems Principles.",
    ]

    for ref in references:
        doc.add_paragraph(ref, style="List Number")

    # Save document
    doc.save(output_path)
    print(f"Report saved to {output_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Build the report with the default output filename in the CWD.
    create_report()
|
||||
|
||||
219
scripts/make_slides.py
Normal file
219
scripts/make_slides.py
Normal file
@@ -0,0 +1,219 @@
|
||||
"""Generate presentation slides from markdown."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
from pptx import Presentation
|
||||
from pptx.util import Inches, Pt
|
||||
except ImportError:
|
||||
print("python-pptx not installed. Install with: pip install python-pptx")
|
||||
import sys
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def _add_bullet_slide(prs, title_text, lead, bullets):
    """Append one title-and-content slide: *lead* as the first text-frame
    line, then (text, indent-level) pairs added as paragraphs in order."""
    slide = prs.slides.add_slide(prs.slide_layouts[1])
    slide.shapes.title.text = title_text
    tf = slide.placeholders[1].text_frame
    tf.text = lead
    for text, level in bullets:
        p = tf.add_paragraph()
        p.text = text
        p.level = level


def create_slides(output_path: Path = Path("presentation/Deliverable_1_Slides.pdf")):
    """Create presentation slides.

    Args:
        output_path: Requested output path; the deck is actually written next
            to it with a .pptx suffix (see note below).
    """
    # Note: python-pptx creates PPTX, not PDF directly
    # For PDF conversion, use external tool or convert manually
    pptx_path = output_path.with_suffix(".pptx")
    pptx_path.parent.mkdir(parents=True, exist_ok=True)

    prs = Presentation()
    prs.slide_width = Inches(10)
    prs.slide_height = Inches(7.5)

    # Slide 1: Title (dedicated title layout with title + subtitle placeholders)
    slide = prs.slides.add_slide(prs.slide_layouts[0])
    slide.shapes.title.text = "LLM Data Structures Optimizer"
    slide.placeholders[1].text = "Optimizing Throughput, Latency, and Memory for LLM Inference"

    # Slides 2-10 all share the same title-and-content shape, so they are
    # driven from a data table instead of repeating the add-paragraph boilerplate.
    content_slides = [
        ("Problem Statement", "LLM deployment challenges:", [
            ("• KV cache memory fragmentation", 1),
            ("• Batch scheduling latency vs. throughput trade-offs", 1),
            ("• RAG retrieval efficiency", 1),
        ]),
        ("Solution Overview", "Modular optimizer stack:", [
            ("• Paged KV cache with prefix sharing", 1),
            ("• Dynamic micro-batching scheduler", 1),
            ("• Hybrid retrieval (HNSW + BM25)", 1),
            ("• Token-aware LRU cache", 1),
        ]),
        ("KV Cache Architecture", "Key Features:", [
            ("• Fixed-size pages (512 tokens)", 1),
            ("• Hash-based prefix deduplication", 1),
            ("• Copy-on-write semantics", 1),
            ("• 30-40% memory savings for repeated prompts", 1),
        ]),
        ("Scheduler Design", "Dynamic Micro-Batching:", [
            ("• Indexed heap for O(log n) priority updates", 1),
            ("• Configurable wait time (max_wait_ms)", 1),
            ("• SLO-aware prioritization", 1),
            ("• ~95% throughput with sub-100ms p95 latency", 1),
        ]),
        ("Retrieval Pipeline", "Hybrid Approach:", [
            ("• HNSW for dense vector search (O(log n))", 1),
            ("• BM25 inverted index for lexical matching", 1),
            ("• Weighted score fusion", 1),
            ("• >95% recall@10 with <5ms p95 latency", 1),
        ]),
        ("Performance Results", "Benchmark Highlights:", [
            ("• KV Cache: 0.12ms p50 attach, 0.25ms p95", 1),
            ("• Scheduler: 0.35ms p50 batch, 0.78ms p95", 1),
            ("• HNSW: 1.8ms p50 search, 4.2ms p95", 1),
            ("• End-to-End RAG: 15.3ms p50, 32.5ms p95", 1),
        ]),
        ("Complexity Analysis", "Time Complexities:", [
            ("• KV Cache: O(1) attach/get, O(k) detach", 1),
            ("• Indexed Heap: O(log n) all operations", 1),
            ("• HNSW Search: O(log n) approximate", 1),
            ("• BM25: O(|query| × avg_doc_freq)", 1),
        ]),
        # This slide mixes a level-0 sub-header ("Future Work:") into the bullets.
        ("Challenges & Future Work", "Challenges:", [
            ("• Memory fragmentation under high churn", 1),
            ("• Parameter tuning for HNSW", 1),
            ("Future Work:", 0),
            ("• Distributed deployment support", 1),
            ("• Speculative decoding integration", 1),
        ]),
        ("Conclusion", "Key Contributions:", [
            ("• Production-ready data structures for LLM optimization", 1),
            ("• Significant improvements in throughput, latency, memory", 1),
            ("• Modular, extensible architecture", 1),
            ("• Comprehensive benchmarks and documentation", 1),
        ]),
    ]

    for title_text, lead, bullets in content_slides:
        _add_bullet_slide(prs, title_text, lead, bullets)

    prs.save(pptx_path)
    print(f"Presentation saved to {pptx_path}")
    print(f"Note: Convert to PDF manually or use: libreoffice --headless --convert-to pdf {pptx_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Generate the deck at the default presentation/ output path.
    create_slides()
|
||||
|
||||
165
scripts/plot_corpus_results.py
Normal file
165
scripts/plot_corpus_results.py
Normal file
@@ -0,0 +1,165 @@
|
||||
"""Generate detailed plots for corpus-based benchmarks."""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
|
||||
def load_corpus_results(results_dir: Path) -> list[dict]:
    """Load all corpus benchmark results.

    Walks the two-level <corpus>/<date>/results.json layout under
    *results_dir* and concatenates every list-shaped results file.
    """
    collected: list[dict] = []

    for corpus_dir in results_dir.iterdir():
        if not corpus_dir.is_dir():
            continue
        for date_dir in corpus_dir.iterdir():
            if not date_dir.is_dir():
                continue
            results_file = date_dir / "results.json"
            if not results_file.exists():
                continue
            payload = json.loads(results_file.read_text())
            # Only list-shaped payloads are benchmark run collections.
            if isinstance(payload, list):
                collected.extend(payload)

    return collected
|
||||
|
||||
|
||||
def plot_latency_by_corpus_size(results: list[dict], output_dir: Path):
    """Plot latency vs corpus size.

    Renders a grouped bar chart of mean P50/P95/P99 search latency per
    corpus size and saves it as corpus_size_latency.png in *output_dir*.
    """
    # Bucket the runs by corpus size.
    by_size: dict = {}
    for run in results:
        by_size.setdefault(run["size"], []).append(run)

    sizes = sorted(by_size)

    def _mean_metric(metric, size):
        # Mean of one latency metric across all runs at this corpus size.
        return np.mean([run[metric] for run in by_size[size]])

    p50s = [_mean_metric("search_p50_ms", s) for s in sizes]
    p95s = [_mean_metric("search_p95_ms", s) for s in sizes]
    p99s = [_mean_metric("search_p99_ms", s) for s in sizes]

    fig, ax = plt.subplots(figsize=(10, 6))
    x = np.arange(len(sizes))
    width = 0.25

    ax.bar(x - width, p50s, width, label="P50", alpha=0.8)
    ax.bar(x, p95s, width, label="P95", alpha=0.8)
    ax.bar(x + width, p99s, width, label="P99", alpha=0.8)

    ax.set_xlabel("Corpus Size (documents)")
    ax.set_ylabel("Latency (ms)")
    ax.set_title("Search Latency vs Corpus Size (FIQA Dataset)")
    ax.set_xticks(x)
    ax.set_xticklabels([f"{s//1000}k" for s in sizes])
    ax.legend()
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    output_file = output_dir / "corpus_size_latency.png"
    plt.savefig(output_file, dpi=150, bbox_inches="tight")
    print(f"Saved: {output_file}")
    plt.close()
|
||||
|
||||
|
||||
def plot_qps_vs_size(results: list[dict], output_dir: Path):
    """Plot QPS vs corpus size.

    Renders mean throughput with a standard-deviation error bar per corpus
    size and saves it as corpus_size_qps.png in *output_dir*.
    """
    by_size: dict = {}
    for run in results:
        by_size.setdefault(run["size"], []).append(run)

    sizes = sorted(by_size)
    samples = {s: [run["qps"] for run in by_size[s]] for s in sizes}
    qps = [np.mean(samples[s]) for s in sizes]
    qps_std = [np.std(samples[s]) for s in sizes]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.errorbar([s/1000 for s in sizes], qps, yerr=qps_std, marker="o",
                linestyle="-", linewidth=2, markersize=8, capsize=5)

    ax.set_xlabel("Corpus Size (thousands of documents)")
    ax.set_ylabel("Queries Per Second (QPS)")
    ax.set_title("Throughput vs Corpus Size (FIQA Dataset)")
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    output_file = output_dir / "corpus_size_qps.png"
    plt.savefig(output_file, dpi=150, bbox_inches="tight")
    print(f"Saved: {output_file}")
    plt.close()
|
||||
|
||||
|
||||
def plot_scaling_analysis(results: list[dict], output_dir: Path):
    """Plot scaling analysis with multiple metrics.

    Side-by-side panels: latency percentiles (left) and throughput (right)
    as functions of corpus size, saved as scaling_analysis.png.
    """
    by_size: dict = {}
    for run in results:
        by_size.setdefault(run["size"], []).append(run)

    sizes = sorted(by_size)
    sizes_k = [s/1000 for s in sizes]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: latency percentiles.
    p50s = [np.mean([run["search_p50_ms"] for run in by_size[s]]) for s in sizes]
    p95s = [np.mean([run["search_p95_ms"] for run in by_size[s]]) for s in sizes]

    ax1.plot(sizes_k, p50s, "o-", label="P50", linewidth=2, markersize=8)
    ax1.plot(sizes_k, p95s, "s-", label="P95", linewidth=2, markersize=8)
    ax1.set_xlabel("Corpus Size (thousands)")
    ax1.set_ylabel("Latency (ms)")
    ax1.set_title("Latency Scaling")
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Right panel: throughput.
    qps = [np.mean([run["qps"] for run in by_size[s]]) for s in sizes]
    ax2.plot(sizes_k, qps, "o-", color="green", linewidth=2, markersize=8)
    ax2.set_xlabel("Corpus Size (thousands)")
    ax2.set_ylabel("Queries Per Second")
    ax2.set_title("Throughput Scaling")
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    output_file = output_dir / "scaling_analysis.png"
    plt.savefig(output_file, dpi=150, bbox_inches="tight")
    print(f"Saved: {output_file}")
    plt.close()
|
||||
|
||||
|
||||
def main():
    """Load corpus benchmark results and render every analysis figure."""
    results_dir = Path("benchmarks/results")
    output_dir = Path("benchmarks/figures")
    output_dir.mkdir(parents=True, exist_ok=True)

    results = load_corpus_results(results_dir)
    if not results:
        print("No corpus benchmark results found")
        return

    print(f"Loaded {len(results)} benchmark runs")

    # Generate plots
    for plot_fn in (plot_latency_by_corpus_size, plot_qps_vs_size, plot_scaling_analysis):
        plot_fn(results, output_dir)

    print(f"\n✓ Generated corpus analysis plots in {output_dir}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Entry point: read benchmarks/results and write benchmarks/figures.
    main()
|
||||
|
||||
244
scripts/plot_results.py
Normal file
244
scripts/plot_results.py
Normal file
@@ -0,0 +1,244 @@
|
||||
"""Plot benchmark results and save to PNG, export to CSV."""
|
||||
|
||||
import json
|
||||
import csv
|
||||
from pathlib import Path
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
def load_results(result_dir: Path = Path("benchmarks/results")) -> dict:
    """Load all benchmark results.

    Supports two on-disk layouts: flat "*benchmark*.json" files in the
    results root, and nested <corpus>/<date>/results.json lists (where
    the first entry is kept as representative).
    """
    loaded: dict = {}

    # Old-style layout: flat JSON files named after the benchmark.
    for json_file in result_dir.glob("*.json"):
        if "benchmark" not in json_file.stem:
            continue
        data = json.loads(json_file.read_text())
        name = data.get("benchmark", json_file.stem.replace("_benchmark", ""))
        loaded[name] = data

    # New-style layout: corpus/date/results.json holding a list of runs.
    for corpus_dir in result_dir.iterdir():
        if not corpus_dir.is_dir():
            continue
        for date_dir in corpus_dir.iterdir():
            if not date_dir.is_dir():
                continue
            results_file = date_dir / "results.json"
            if not results_file.exists():
                continue
            runs = json.loads(results_file.read_text())
            if isinstance(runs, list) and runs:
                # Use first result as representative or aggregate
                loaded[f"{corpus_dir.name}_{date_dir.name}"] = runs[0]  # Simplified

    return loaded
|
||||
|
||||
|
||||
def export_to_csv(results: dict, output_file: Path = Path("benchmarks/results/benchmark_results.csv")):
    """Export benchmark results to CSV.

    Builds one row per benchmark with generic percentile columns plus any
    family-specific metrics the result happens to carry, then writes them
    with a header of the union of all keys (sorted).
    """
    output_file.parent.mkdir(parents=True, exist_ok=True)

    rows = []
    for bench_name, data in results.items():
        # Generic percentiles: the first metric family reporting a
        # non-falsy value wins (same precedence as the original chains).
        row = {
            "benchmark": bench_name,
            "p50_ms": data.get("attach_p50_ms") or data.get("search_p50_ms") or data.get("batch_p50_ms") or data.get("build_p50_ms") or 0.0,
            "p95_ms": data.get("attach_p95_ms") or data.get("search_p95_ms") or data.get("batch_p95_ms") or data.get("build_p95_ms") or 0.0,
            "p99_ms": data.get("attach_p99_ms") or data.get("search_p99_ms") or data.get("batch_p99_ms") or data.get("build_p99_ms") or 0.0,
            "peak_rss_mb": data.get("peak_rss_mb", 0.0),
            "memory_delta_mb": data.get("memory_delta_mb", 0.0),
        }

        # Family-specific metrics, copied through when the family is present.
        if "attach_p50_ms" in data:
            for metric in ("attach_p50_ms", "attach_p95_ms", "attach_p99_ms",
                           "get_p50_ms", "get_p95_ms", "get_p99_ms"):
                row[metric] = data.get(metric, 0)
        if "search_p50_ms" in data:
            for metric in ("search_p50_ms", "search_p95_ms", "search_p99_ms"):
                row[metric] = data.get(metric, 0)

        if "build_peak_rss_mb" in data:
            row["build_peak_rss_mb"] = data.get("build_peak_rss_mb", 0.0)

        rows.append(row)

    if rows:
        # Header = union of every row's keys, in sorted order.
        fieldnames = sorted({key for row in rows for key in row})

        with open(output_file, "w", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows)

    print(f"Results exported to CSV: {output_file}")
|
||||
|
||||
|
||||
def plot_latency_distribution(results: dict, output_dir: Path = Path("benchmarks/figures")):
    """Plot latency distributions.

    Draws grouped P50/P95/P99 bars per benchmark (skipping benchmarks with
    no positive latency) and saves latency_distribution.png.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    def _percentile(data, suffix):
        # First metric family reporting this percentile; 0 when none do.
        return (data.get(f"search_{suffix}")
                or data.get(f"attach_{suffix}")
                or data.get(f"batch_{suffix}")
                or data.get(f"build_{suffix}", 0))

    names, p50s, p95s, p99s = [], [], [], []
    for name, data in results.items():
        p50 = _percentile(data, "p50_ms")
        p95 = _percentile(data, "p95_ms")
        p99 = _percentile(data, "p99_ms")
        if p50 > 0 or p95 > 0 or p99 > 0:
            names.append(name)
            p50s.append(p50)
            p95s.append(p95)
            p99s.append(p99)

    if not names:
        return

    fig, ax = plt.subplots(figsize=(12, 7))
    x = range(len(names))
    width = 0.25

    ax.bar([i - width for i in x], p50s, width, label="P50", alpha=0.8, color="#2ecc71")
    ax.bar(x, p95s, width, label="P95", alpha=0.8, color="#3498db")
    ax.bar([i + width for i in x], p99s, width, label="P99", alpha=0.8, color="#e74c3c")

    ax.set_xlabel("Benchmark", fontsize=12, fontweight="bold")
    ax.set_ylabel("Latency (ms)", fontsize=12, fontweight="bold")
    ax.set_title("Latency Percentiles by Benchmark", fontsize=14, fontweight="bold")
    ax.set_xticks(x)
    ax.set_xticklabels(names, rotation=45, ha="right")
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3, linestyle="--")

    # Annotate each non-zero bar with its value.
    for i, (p50, p95, p99) in enumerate(zip(p50s, p95s, p99s)):
        if p50 > 0:
            ax.text(i - width, p50, f"{p50:.2f}", ha="center", va="bottom", fontsize=8)
        if p95 > 0:
            ax.text(i, p95, f"{p95:.2f}", ha="center", va="bottom", fontsize=8)
        if p99 > 0:
            ax.text(i + width, p99, f"{p99:.2f}", ha="center", va="bottom", fontsize=8)

    plt.tight_layout()
    output_file = output_dir / "latency_distribution.png"
    plt.savefig(output_file, dpi=300, bbox_inches="tight")
    print(f"Latency plot saved to {output_file}")
    plt.close()
|
||||
|
||||
|
||||
def plot_comparison_chart(results: dict, output_dir: Path = Path("benchmarks/figures")):
    """Plot a horizontal-bar comparison of P95 latency across all benchmarks.

    Args:
        results: Mapping of benchmark name -> metrics dict.  The first
            present key among the search/attach/batch/build ``*_p95_ms``
            variants is used for each benchmark.
        output_dir: Directory where ``benchmark_comparison.png`` is written
            (created if missing).
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    benchmarks = []
    p95_latencies = []

    for name, data in results.items():
        # Fall through benchmark-specific metric names; `or` also skips zero
        # values, which is intended here (0 means "metric absent").
        p95 = data.get("search_p95_ms") or data.get("attach_p95_ms") or data.get("batch_p95_ms") or data.get("build_p95_ms", 0)
        if p95 > 0:
            benchmarks.append(name)
            p95_latencies.append(p95)

    if benchmarks:
        fig, ax = plt.subplots(figsize=(10, 6))
        # Sample the colormap at evenly spaced floats in [0, 1] so each bar
        # gets a distinct color.  Calling a Colormap with ints (the previous
        # `range(len(benchmarks))`) indexes the raw 256-entry lookup table,
        # making bars 0..n-1 visually identical.
        n = len(benchmarks)
        colors = plt.cm.viridis([i / max(n - 1, 1) for i in range(n)])
        bars = ax.barh(benchmarks, p95_latencies, color=colors, alpha=0.8)

        ax.set_xlabel("P95 Latency (ms)", fontsize=12, fontweight="bold")
        ax.set_title("Benchmark Performance Comparison (P95 Latency)", fontsize=14, fontweight="bold")
        ax.grid(True, alpha=0.3, linestyle="--", axis="x")

        # Annotate each bar with its latency value.
        for bar, latency in zip(bars, p95_latencies):
            width = bar.get_width()
            ax.text(width, bar.get_y() + bar.get_height()/2, f"{latency:.2f}ms",
                    ha="left", va="center", fontsize=9, fontweight="bold")

        plt.tight_layout()
        output_file = output_dir / "benchmark_comparison.png"
        plt.savefig(output_file, dpi=300, bbox_inches="tight")
        print(f"Comparison plot saved to {output_file}")
        plt.close()
|
||||
|
||||
|
||||
def plot_memory_usage(results: dict, output_dir: Path = Path("benchmarks/figures")):
    """Plot memory usage (peak RSS and allocation delta) by benchmark.

    Args:
        results: Mapping of benchmark name -> metrics dict containing
            ``peak_rss_mb`` and ``memory_delta_mb``.
        output_dir: Directory where ``memory_usage.png`` is written
            (created if missing).
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    benchmarks = []
    peak_rss_values = []
    memory_delta_values = []

    for name, data in results.items():
        peak_rss = data.get("peak_rss_mb", 0.0)
        memory_delta = data.get("memory_delta_mb", 0.0)
        # Only benchmarks that recorded a positive peak RSS are plotted;
        # the delta list stays index-aligned with `benchmarks`.
        if peak_rss > 0:
            benchmarks.append(name)
            peak_rss_values.append(peak_rss)
            memory_delta_values.append(memory_delta)

    if benchmarks:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

        # Evenly spaced floats in [0, 1] give distinct colormap colors.
        # Integer arguments (the previous `range(len(benchmarks))`) would
        # index the raw 256-entry LUT and render nearly identical colors.
        n = len(benchmarks)
        fractions = [i / max(n - 1, 1) for i in range(n)]

        # Plot 1: Peak RSS
        colors1 = plt.cm.plasma(fractions)
        bars1 = ax1.barh(benchmarks, peak_rss_values, color=colors1, alpha=0.8)
        ax1.set_xlabel("Peak RSS (MB)", fontsize=12, fontweight="bold")
        ax1.set_title("Peak Memory Usage by Benchmark", fontsize=14, fontweight="bold")
        ax1.grid(True, alpha=0.3, linestyle="--", axis="x")

        # Add value labels
        for bar, rss in zip(bars1, peak_rss_values):
            width = bar.get_width()
            ax1.text(width, bar.get_y() + bar.get_height()/2, f"{rss:.2f}MB",
                     ha="left", va="center", fontsize=9, fontweight="bold")

        # Plot 2: Memory Delta
        colors2 = plt.cm.coolwarm(fractions)
        bars2 = ax2.barh(benchmarks, memory_delta_values, color=colors2, alpha=0.8)
        ax2.set_xlabel("Memory Delta (MB)", fontsize=12, fontweight="bold")
        ax2.set_title("Memory Allocation Delta by Benchmark", fontsize=14, fontweight="bold")
        ax2.grid(True, alpha=0.3, linestyle="--", axis="x")

        # Add value labels
        for bar, delta in zip(bars2, memory_delta_values):
            width = bar.get_width()
            ax2.text(width, bar.get_y() + bar.get_height()/2, f"{delta:.2f}MB",
                     ha="left", va="center", fontsize=9, fontweight="bold")

        plt.tight_layout()
        output_file = output_dir / "memory_usage.png"
        plt.savefig(output_file, dpi=300, bbox_inches="tight")
        print(f"Memory usage plot saved to {output_file}")
        plt.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: load every benchmark result, then emit the CSV
    # export and all figures.  `load_results`, `export_to_csv` and the
    # remaining plot_* helpers are defined earlier in this module.
    results = load_results()
    if results:
        export_to_csv(results)
        plot_latency_distribution(results)
        plot_comparison_chart(results)
        plot_memory_usage(results)
        print(f"\nProcessed {len(results)} benchmark results")
    else:
        print("No benchmark results found. Run benchmarks first.")
|
||||
91
scripts/prepare_embeddings.py
Normal file
91
scripts/prepare_embeddings.py
Normal file
@@ -0,0 +1,91 @@
|
||||
"""Prepare embeddings for datasets."""
|
||||
|
||||
import argparse
import hashlib
import json
import sys
from pathlib import Path

import numpy as np
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
|
||||
def generate_deterministic_embeddings(
|
||||
corpus_file: Path,
|
||||
output_file: Path,
|
||||
dim: int = 384,
|
||||
seed: int = 42,
|
||||
limit: int | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Generate deterministic embeddings for a corpus.
|
||||
|
||||
Args:
|
||||
corpus_file: Path to corpus JSONL file
|
||||
output_file: Output .npy file for embeddings
|
||||
dim: Embedding dimension
|
||||
seed: Random seed for reproducibility
|
||||
limit: Optional limit on number of documents
|
||||
"""
|
||||
output_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
rng = np.random.RandomState(seed)
|
||||
|
||||
embeddings = []
|
||||
count = 0
|
||||
|
||||
print(f"Generating deterministic embeddings (dim={dim}, seed={seed})...")
|
||||
|
||||
with open(corpus_file, "r", encoding="utf-8") as f:
|
||||
for line in f:
|
||||
if limit and count >= limit:
|
||||
break
|
||||
|
||||
if line.strip():
|
||||
doc = json.loads(line)
|
||||
# Generate deterministic embedding based on document ID
|
||||
doc_hash = hash(doc["id"]) % (2**31)
|
||||
rng_local = np.random.RandomState(seed + doc_hash)
|
||||
|
||||
# Generate normalized random vector
|
||||
emb = rng_local.randn(dim).astype(np.float32)
|
||||
emb = emb / np.linalg.norm(emb)
|
||||
|
||||
embeddings.append(emb)
|
||||
count += 1
|
||||
|
||||
if count % 10000 == 0:
|
||||
print(f"Processed {count} documents...")
|
||||
|
||||
embeddings_array = np.stack(embeddings)
|
||||
np.save(output_file, embeddings_array)
|
||||
print(f"Saved {len(embeddings)} embeddings to {output_file}")
|
||||
|
||||
|
||||
def load_embeddings(emb_file: Path) -> np.ndarray:
    """Read an embedding matrix previously saved with ``np.save``.

    Args:
        emb_file: Path to the ``.npy`` file.

    Returns:
        The stored array, unchanged.
    """
    array = np.load(emb_file)
    return array
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments and write the embedding file."""
    parser = argparse.ArgumentParser(description="Prepare embeddings for corpus")
    for flag, options in (
        ("--input", {"type": Path, "required": True, "help": "Corpus JSONL file"}),
        ("--output", {"type": Path, "required": True, "help": "Output .npy file"}),
        ("--dim", {"type": int, "default": 384, "help": "Embedding dimension"}),
        ("--seed", {"type": int, "default": 42, "help": "Random seed"}),
        ("--limit", {"type": int, "help": "Limit number of documents"}),
    ):
        parser.add_argument(flag, **options)

    opts = parser.parse_args()

    generate_deterministic_embeddings(
        opts.input,
        opts.output,
        dim=opts.dim,
        seed=opts.seed,
        limit=opts.limit,
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the CLI when executed as a script.
    main()
|
||||
|
||||
247
scripts/profile_tail_latency.py
Normal file
247
scripts/profile_tail_latency.py
Normal file
@@ -0,0 +1,247 @@
|
||||
"""Profile tail latency breakdown for retrieval pipeline.
|
||||
|
||||
This script profiles latency components to identify bottlenecks causing
|
||||
extreme P99 tail latencies.
|
||||
"""
|
||||
|
||||
import cProfile
|
||||
import pstats
|
||||
import statistics
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
import numpy as np
|
||||
|
||||
from llmds.hnsw import HNSW
|
||||
from llmds.retrieval_pipeline import RetrievalPipeline
|
||||
|
||||
|
||||
def profile_hnsw_search(num_vectors: int = 10000, dim: int = 128, num_queries: int = 1000):
    """Profile HNSW search operations.

    Builds an index of ``num_vectors`` random unit vectors, runs
    ``num_queries`` k=10 searches under cProfile, prints latency
    percentiles plus an outlier analysis and the cProfile report.

    Args:
        num_vectors: Number of vectors to index.
        dim: Vector dimensionality.
        num_queries: Number of search queries to time.

    Returns:
        Dict with p50/p95/p99/p99.9/mean/max latency (ms) and outlier stats.
    """
    # Hoisted out of the query loop: the original re-ran `import time` on
    # every iteration, adding import-machinery overhead inside the very
    # section being profiled and timed.
    import time

    print(f"Profiling HNSW search with {num_vectors} vectors, dim={dim}, {num_queries} queries...")

    np.random.seed(42)
    hnsw = HNSW(dim=dim, M=16, ef_construction=200, ef_search=50, seed=42)

    # Build index of normalized random vectors
    vectors = []
    for i in range(num_vectors):
        vec = np.random.randn(dim).astype(np.float32)
        vec = vec / np.linalg.norm(vec)
        vectors.append(vec)
        hnsw.add(vec, i)

    # Profile search operations
    profiler = cProfile.Profile()
    profiler.enable()

    search_times = []
    for _ in range(num_queries):
        query = np.random.randn(dim).astype(np.float32)
        query = query / np.linalg.norm(query)

        start = time.perf_counter()
        hnsw.search(query, k=10)
        elapsed = time.perf_counter() - start
        search_times.append(elapsed * 1000)  # Convert to ms

    profiler.disable()

    # Compute latency percentiles from the sorted sample
    search_times.sort()
    p50 = search_times[len(search_times) // 2]
    p95 = search_times[int(len(search_times) * 0.95)]
    p99 = search_times[int(len(search_times) * 0.99)]
    # P99.9 needs at least 1000 samples to be distinguishable from P99
    p99_9 = search_times[int(len(search_times) * 0.999)] if len(search_times) >= 1000 else p99

    print(f"\nHNSW Search Latency Statistics:")
    print(f"  P50: {p50:.3f} ms")
    print(f"  P95: {p95:.3f} ms")
    print(f"  P99: {p99:.3f} ms")
    print(f"  P99.9: {p99_9:.3f} ms")
    print(f"  Mean: {statistics.mean(search_times):.3f} ms")
    print(f"  Max: {max(search_times):.3f} ms")

    # Analyze P99 outliers: anything slower than 2x P95
    threshold = p95 * 2
    outliers = [t for t in search_times if t > threshold]
    if outliers:
        print(f"\n  Outliers (>2x P95): {len(outliers)} queries ({len(outliers)/len(search_times)*100:.1f}%)")
        print(f"  Outlier P50: {statistics.median(outliers):.3f} ms")
        print(f"  Outlier Max: {max(outliers):.3f} ms")

    # Generate profiling report
    stats = pstats.Stats(profiler)
    stats.sort_stats("cumulative")

    print("\nTop 20 functions by cumulative time:")
    print("=" * 80)
    stats.print_stats(20)

    return {
        "p50_ms": p50,
        "p95_ms": p95,
        "p99_ms": p99,
        "p99_9_ms": p99_9,
        "mean_ms": statistics.mean(search_times),
        "max_ms": max(search_times),
        "outlier_count": len(outliers),
        "outlier_percent": len(outliers) / len(search_times) * 100 if search_times else 0,
    }
|
||||
|
||||
|
||||
def profile_retrieval_pipeline(num_docs: int = 5000, num_queries: int = 500):
    """Profile the complete retrieval pipeline.

    Indexes ``num_docs`` synthetic documents, runs ``num_queries`` searches
    under cProfile, prints latency percentiles and the profiling report.

    Args:
        num_docs: Number of documents to index.
        num_queries: Number of search queries to time.

    Returns:
        Dict with p50/p95/p99/mean/max search latency in milliseconds.
    """
    # Hoisted: the original re-ran `import time` on every loop iteration,
    # adding overhead inside the profiled section.
    import time

    print(f"\nProfiling RetrievalPipeline with {num_docs} docs, {num_queries} queries...")

    np.random.seed(42)
    # Renamed from `random` to avoid shadowing the stdlib module name.
    rng = np.random.RandomState(42)

    pipeline = RetrievalPipeline(embedding_dim=128, seed=42)

    # Build index of synthetic documents with unit-norm embeddings
    for i in range(num_docs):
        text = f"document {i} about topic {i % 10}"
        embedding = rng.randn(128).astype(np.float32)
        embedding = embedding / np.linalg.norm(embedding)
        pipeline.add_document(doc_id=i, text=text, embedding=embedding)

    # Profile search operations
    profiler = cProfile.Profile()
    profiler.enable()

    search_times = []
    for _ in range(num_queries):
        query_text = "document topic"
        query_embedding = rng.randn(128).astype(np.float32)
        query_embedding = query_embedding / np.linalg.norm(query_embedding)

        start = time.perf_counter()
        pipeline.search(
            query_text, query_embedding=query_embedding, top_k=10
        )
        elapsed = time.perf_counter() - start
        search_times.append(elapsed * 1000)  # Convert to ms

    profiler.disable()

    # Compute latency percentiles from the sorted sample
    search_times.sort()
    p50 = search_times[len(search_times) // 2]
    p95 = search_times[int(len(search_times) * 0.95)]
    p99 = search_times[int(len(search_times) * 0.99)]

    print(f"\nRetrieval Pipeline Latency Statistics:")
    print(f"  P50: {p50:.3f} ms")
    print(f"  P95: {p95:.3f} ms")
    print(f"  P99: {p99:.3f} ms")
    print(f"  Mean: {statistics.mean(search_times):.3f} ms")
    print(f"  Max: {max(search_times):.3f} ms")

    # Generate profiling report
    stats = pstats.Stats(profiler)
    stats.sort_stats("cumulative")

    print("\nTop 20 functions by cumulative time:")
    print("=" * 80)
    stats.print_stats(20)

    return {
        "p50_ms": p50,
        "p95_ms": p95,
        "p99_ms": p99,
        "mean_ms": statistics.mean(search_times),
        "max_ms": max(search_times),
    }
|
||||
|
||||
|
||||
def profile_latency_breakdown(num_vectors: int = 5000, dim: int = 128):
    """Profile latency breakdown by component.

    Compares the mean cost of 100 brute-force distance computations against
    a full HNSW k=10 search, over 100 random unit-vector queries, and prints
    the ratio.  Results are printed only; nothing is returned.

    Args:
        num_vectors: Number of vectors to index.
        dim: Vector dimensionality.
    """
    print(f"\nProfiling latency breakdown with {num_vectors} vectors...")

    np.random.seed(42)
    hnsw = HNSW(dim=dim, M=16, ef_construction=200, ef_search=50, seed=42)

    # Build index of unit-norm random vectors; keep them in `vectors` so the
    # brute-force baseline below can reuse the same data.
    vectors = []
    for i in range(num_vectors):
        vec = np.random.randn(dim).astype(np.float32)
        vec = vec / np.linalg.norm(vec)
        vectors.append(vec)
        hnsw.add(vec, i)

    # Profile individual operations
    import time

    search_times = []
    distance_computation_times = []

    for _ in range(100):
        query = np.random.randn(dim).astype(np.float32)
        query = query / np.linalg.norm(query)

        # Baseline: 100 exact L2 distances against the first 100 indexed
        # vectors (a fixed-size slice, independent of num_vectors).
        dist_start = time.perf_counter()
        distances = [np.linalg.norm(query - vec) for vec in vectors[:100]]
        dist_time = (time.perf_counter() - dist_start) * 1000
        distance_computation_times.append(dist_time)

        # Full HNSW search for the same query.
        search_start = time.perf_counter()
        results = hnsw.search(query, k=10)
        search_time = (time.perf_counter() - search_start) * 1000
        search_times.append(search_time)

    print(f"\nLatency Breakdown:")
    print(f"  Distance computation: {statistics.mean(distance_computation_times):.3f} ms (mean)")
    print(f"  HNSW search: {statistics.mean(search_times):.3f} ms (mean)")
    print(f"  Search/Distance ratio: {statistics.mean(search_times) / statistics.mean(distance_computation_times):.2f}x")
|
||||
|
||||
|
||||
def main():
    """Run all profiling tasks and write the report to a file.

    Parses CLI options, redirects stdout into the report file while the
    profilers run, then prints a short summary to the real stdout.
    """
    import argparse
    from contextlib import redirect_stdout

    parser = argparse.ArgumentParser(description="Profile tail latency")
    parser.add_argument("--output", type=Path, default=Path("audit/tail_latency_profile.txt"),
                        help="Output file for profiling report")
    parser.add_argument("--num-vectors", type=int, default=10000,
                        help="Number of vectors for HNSW profiling")
    parser.add_argument("--num-docs", type=int, default=5000,
                        help="Number of documents for pipeline profiling")
    parser.add_argument("--num-queries", type=int, default=1000,
                        help="Number of queries to run")
    args = parser.parse_args()

    args.output.parent.mkdir(parents=True, exist_ok=True)

    # Redirect output to the report file.  redirect_stdout restores the
    # *previous* stdout on exit — unlike the old manual assignment of
    # sys.__stdout__, which clobbered any outer redirection (e.g. when run
    # under a test harness or a nested redirect).
    with open(args.output, "w") as f, redirect_stdout(f):
        # Profile HNSW
        hnsw_stats = profile_hnsw_search(args.num_vectors, 128, args.num_queries)

        # Profile pipeline
        pipeline_stats = profile_retrieval_pipeline(args.num_docs, args.num_queries // 2)

        # Breakdown
        profile_latency_breakdown(args.num_vectors, 128)

    print(f"\nProfiling complete. Report saved to: {args.output}")
    print(f"\nKey Findings:")
    print(f"  HNSW P99: {hnsw_stats['p99_ms']:.3f} ms")
    print(f"  Pipeline P99: {pipeline_stats['p99_ms']:.3f} ms")

    if hnsw_stats.get("outlier_count", 0) > 0:
        print(f"  HNSW Outliers: {hnsw_stats['outlier_count']} ({hnsw_stats['outlier_percent']:.1f}%)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point.
    main()
|
||||
|
||||
355
scripts/run_benchmarks.py
Normal file
355
scripts/run_benchmarks.py
Normal file
@@ -0,0 +1,355 @@
|
||||
"""Run end-to-end benchmarks on real corpora with variance analysis."""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import random
|
||||
import sys
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from llmds.data_sources.beir_loader import load_beir
|
||||
from llmds.data_sources.amazon_reviews import load_amazon_reviews
|
||||
from llmds.retrieval_pipeline import RetrievalPipeline
|
||||
from llmds.utils import Timer, memory_profiler, calculate_statistics
|
||||
|
||||
|
||||
def aggregate_repetitions(results: list[dict]) -> dict[str, Any]:
    """
    Aggregate results across repetitions with variance analysis.

    Args:
        results: List of result dictionaries from multiple repetitions

    Returns:
        Dictionary with aggregated statistics including variance metrics
        (an empty dict when no results are supplied)
    """
    if not results:
        return {}

    first = results[0]

    # Every key that is not configuration metadata is treated as a metric.
    metadata_keys = {"corpus", "size", "ef_search", "M", "num_queries", "repetition"}
    metric_keys = [key for key in first if key not in metadata_keys]

    aggregated: dict[str, Any] = {
        field: first.get(field)
        for field in ("corpus", "size", "ef_search", "M", "num_queries")
    }
    aggregated["repetitions"] = len(results)

    # Per-metric summary statistics: central tendency, spread, confidence
    # interval, and coefficient of variation.
    stat_fields = ("mean", "std", "min", "max", "ci_lower", "ci_upper", "cv")
    for metric in metric_keys:
        values = [rep[metric] for rep in results if metric in rep]
        if not values:
            continue
        stats_dict = calculate_statistics(values)
        for field in stat_fields:
            aggregated[f"{metric}_{field}"] = stats_dict[field]

    # A configuration is flagged flaky when any critical metric varies by
    # more than 20% (coefficient of variation) across repetitions.
    flaky_metrics = [
        metric
        for metric in ("search_p50_ms", "search_p95_ms", "qps")
        if f"{metric}_cv" in aggregated and aggregated[f"{metric}_cv"] > 20.0
    ]

    aggregated["flaky_metrics"] = flaky_metrics
    aggregated["is_flaky"] = bool(flaky_metrics)

    return aggregated
|
||||
|
||||
|
||||
def load_corpus_sample(corpus_file: Path, size: int, seed: int = 42) -> list[dict]:
    """Load up to ``size`` documents from a JSONL corpus.

    Seeds both the stdlib and NumPy global RNGs for reproducibility, then
    returns the whole corpus when it is small enough, otherwise a uniform
    sample drawn without replacement.

    Args:
        corpus_file: JSONL file with one document per line.
        size: Maximum number of documents to return.
        seed: Seed applied to ``random`` and ``np.random``.

    Returns:
        List of parsed document dicts.
    """
    random.seed(seed)
    np.random.seed(seed)

    with open(corpus_file, "r", encoding="utf-8") as handle:
        all_docs = [json.loads(line) for line in handle if line.strip()]

    if len(all_docs) <= size:
        return all_docs

    # Uniform sample without replacement.
    return random.sample(all_docs, size)
|
||||
|
||||
|
||||
def run_benchmark(
    corpus_file: Path,
    emb_file: Path | None,
    corpus_name: str,
    size: int,
    ef_search: int,
    M: int,
    num_queries: int = 100,
    embedding_dim: int = 384,
) -> dict:
    """
    Run a single benchmark repetition on a corpus sample.

    Builds a RetrievalPipeline over ``size`` sampled documents, times each
    document insertion and each of ``num_queries`` searches, and profiles
    memory during both phases.

    Args:
        corpus_file: JSONL corpus to sample from.
        emb_file: Optional precomputed embeddings (.npy); deterministic
            random embeddings are generated when absent.
        corpus_name: Label recorded in the results dict.
        size: Number of documents to sample from the corpus.
        ef_search: HNSW efSearch parameter.
        M: HNSW M parameter.
        num_queries: Number of timed search queries.
        embedding_dim: Embedding dimensionality.

    Returns:
        Dictionary with latency percentiles, throughput (qps), and
        memory metrics.
    """
    print(f"\n=== Benchmarking {corpus_name} (size={size}, ef={ef_search}, M={M}) ===")

    # Load corpus sample
    print(f"Loading corpus sample...")
    docs = load_corpus_sample(corpus_file, size)
    print(f"Loaded {len(docs)} documents")

    # Load or generate embeddings
    if emb_file and emb_file.exists():
        embeddings = np.load(emb_file)
        # Trim to sample size
        # NOTE(review): rows are taken positionally, but `docs` is a random
        # sample of the corpus — confirm the .npy rows are meant to align
        # with the sampled documents rather than the original corpus order.
        embeddings = embeddings[:len(docs)]
    else:
        print("Generating deterministic embeddings...")
        rng = np.random.RandomState(42)
        embeddings = []
        for i in range(len(docs)):
            emb = rng.randn(embedding_dim).astype(np.float32)
            emb = emb / np.linalg.norm(emb)
            embeddings.append(emb)
        embeddings = np.stack(embeddings)

    # Build pipeline with deterministic seed
    print("Building pipeline...")

    # Memory profiling for build phase
    with memory_profiler() as mem_profiler:
        pipeline = RetrievalPipeline(
            embedding_dim=embedding_dim,
            hnsw_M=M,
            hnsw_ef_search=ef_search,
            hnsw_ef_construction=ef_search * 4,
            seed=42,  # Fixed seed for reproducible HNSW structure
        )

        # Add documents, timing each insertion individually
        build_times = []
        for i, doc in enumerate(docs):
            with Timer() as t:
                pipeline.add_document(
                    doc_id=i,
                    text=doc["text"],
                    embedding=embeddings[i],
                )
            build_times.append(t.elapsed * 1000)
            # Sample memory periodically during build (~10 samples total;
            # the +1 guards against a zero modulus for tiny corpora)
            if (i + 1) % (len(docs) // 10 + 1) == 0:
                mem_profiler.sample()

    build_peak_rss_mb = mem_profiler.get_peak_rss_mb()
    build_memory_delta_mb = mem_profiler.get_memory_delta_mb()

    # Run queries with memory profiling
    print(f"Running {num_queries} queries...")
    search_times = []
    rng = np.random.RandomState(42)

    # Generate unit-norm query embeddings up front so the timed loop only
    # measures search work
    query_embeddings = []
    for _ in range(num_queries):
        qemb = rng.randn(embedding_dim).astype(np.float32)
        qemb = qemb / np.linalg.norm(qemb)
        query_embeddings.append(qemb)

    # Use document texts as queries (simplified)
    query_texts = [docs[i % len(docs)]["text"][:100] for i in range(num_queries)]

    # Memory profiling for search phase
    with memory_profiler() as search_mem_profiler:
        for i, (query_text, query_emb) in enumerate(zip(query_texts, query_embeddings)):
            with Timer() as t:
                pipeline.search(query_text, query_embedding=query_emb, top_k=10)
            search_times.append(t.elapsed * 1000)

            # Sample memory periodically during search
            if (i + 1) % 20 == 0:
                search_mem_profiler.sample()
                print(f"Completed {i + 1}/{num_queries} queries...")

    search_peak_rss_mb = search_mem_profiler.get_peak_rss_mb()

    # Overall peak RSS (maximum of build and search phases)
    overall_peak_rss_mb = max(build_peak_rss_mb, search_peak_rss_mb)

    # Compute statistics over sorted samples for percentile lookups
    build_times_sorted = sorted(build_times)
    search_times_sorted = sorted(search_times)

    results = {
        "corpus": corpus_name,
        "size": size,
        "ef_search": ef_search,
        "M": M,
        "num_queries": num_queries,
        "build_p50_ms": build_times_sorted[len(build_times_sorted) // 2],
        "build_p95_ms": build_times_sorted[int(len(build_times_sorted) * 0.95)],
        "build_p99_ms": build_times_sorted[int(len(build_times_sorted) * 0.99)],
        "search_p50_ms": search_times_sorted[len(search_times_sorted) // 2],
        "search_p95_ms": search_times_sorted[int(len(search_times_sorted) * 0.95)],
        "search_p99_ms": search_times_sorted[int(len(search_times_sorted) * 0.99)],
        "avg_build_time_ms": sum(build_times) / len(build_times),
        "avg_search_time_ms": sum(search_times) / len(search_times),
        "qps": 1000.0 / (sum(search_times) / len(search_times)) if search_times else 0.0,
        # Memory metrics
        "peak_rss_mb": overall_peak_rss_mb,
        "build_peak_rss_mb": build_peak_rss_mb,
        "build_memory_delta_mb": build_memory_delta_mb,
        "search_peak_rss_mb": search_peak_rss_mb,
    }

    print(f"✓ Results: P50={results['search_p50_ms']:.2f}ms, P95={results['search_p95_ms']:.2f}ms, QPS={results['qps']:.2f}, Peak RSS={results['peak_rss_mb']:.2f}MB")

    return results
|
||||
|
||||
|
||||
def main():
    """CLI driver: run every (size, ef, M) configuration with repetitions,
    aggregate variance statistics, and save JSON/CSV results under a
    timestamped output directory."""
    parser = argparse.ArgumentParser(description="Run benchmarks on real corpora")
    parser.add_argument("--corpus", type=str, required=True, help="Corpus name")
    parser.add_argument("--corpus-file", type=Path, required=True, help="Corpus JSONL file")
    parser.add_argument("--emb-file", type=Path, help="Embeddings .npy file")
    parser.add_argument("--sizes", nargs="+", type=str, default=["10k"], help="Corpus sizes (e.g., 10k 50k 100k)")
    parser.add_argument("--ef", nargs="+", type=int, default=[50], help="HNSW efSearch values")
    parser.add_argument("--M", nargs="+", type=int, default=[16], help="HNSW M values")
    parser.add_argument("--num-queries", type=int, default=100, help="Number of queries")
    parser.add_argument("--repetitions", type=int, default=5, help="Number of repetitions for variance analysis (default: 5)")
    parser.add_argument("--output-dir", type=Path, default=Path("benchmarks/results"), help="Output directory")

    args = parser.parse_args()

    # Parse human-readable sizes like "10k" / "1m" into integers
    def parse_size(s: str) -> int:
        s = s.lower()
        if s.endswith("k"):
            return int(s[:-1]) * 1000
        elif s.endswith("m"):
            return int(s[:-1]) * 1000000
        return int(s)

    sizes = [parse_size(s) for s in args.sizes]

    # Create output directory with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = args.output_dir / args.corpus / timestamp
    output_dir.mkdir(parents=True, exist_ok=True)

    all_results = []
    aggregated_results = []

    print(f"\n{'='*70}")
    print(f"Running benchmarks with {args.repetitions} repetitions per configuration")
    print(f"{'='*70}\n")

    # Run benchmarks over the full cartesian product of configurations
    for size in sizes:
        for ef in args.ef:
            for M in args.M:
                # NOTE(review): config_key is currently unused below
                config_key = f"{size}_{ef}_{M}"
                print(f"Configuration: size={size}, ef={ef}, M={M}")

                repetition_results = []
                for rep in range(args.repetitions):
                    print(f"  Repetition {rep + 1}/{args.repetitions}...", end=" ", flush=True)
                    result = run_benchmark(
                        corpus_file=args.corpus_file,
                        emb_file=args.emb_file,
                        corpus_name=args.corpus,
                        size=size,
                        ef_search=ef,
                        M=M,
                        num_queries=args.num_queries,
                    )
                    result["repetition"] = rep
                    repetition_results.append(result)
                    all_results.append(result)
                    print("✓")

                # Aggregate across repetitions
                aggregated = aggregate_repetitions(repetition_results)
                if aggregated:
                    # Keep original metrics for backward compatibility
                    for metric in ["search_p50_ms", "search_p95_ms", "search_p99_ms", "qps"]:
                        if f"{metric}_mean" in aggregated:
                            aggregated[metric] = aggregated[f"{metric}_mean"]

                    aggregated_results.append(aggregated)

                    # Print variance summary
                    print(f"\n  Variance Summary:")
                    print(f"    Search P50: {aggregated.get('search_p50_ms_mean', 0):.2f} ± {aggregated.get('search_p50_ms_std', 0):.2f} ms (CV: {aggregated.get('search_p50_ms_cv', 0):.1f}%)")
                    print(f"    Search P95: {aggregated.get('search_p95_ms_mean', 0):.2f} ± {aggregated.get('search_p95_ms_std', 0):.2f} ms (CV: {aggregated.get('search_p95_ms_cv', 0):.1f}%)")
                    print(f"    QPS: {aggregated.get('qps_mean', 0):.2f} ± {aggregated.get('qps_std', 0):.2f} (CV: {aggregated.get('qps_cv', 0):.1f}%)")

                    if aggregated.get("is_flaky", False):
                        print(f"    ⚠️  FLAKY: High variance detected in {', '.join(aggregated.get('flaky_metrics', []))}")
                    print()

    # Save detailed results (all repetitions)
    results_file = output_dir / "results.json"
    with open(results_file, "w") as f:
        json.dump(all_results, f, indent=2)

    # Save aggregated results with variance statistics
    aggregated_file = output_dir / "results_aggregated.json"
    with open(aggregated_file, "w") as f:
        json.dump(aggregated_results, f, indent=2)

    # Save CSV with all repetitions
    csv_file = output_dir / "results.csv"
    if all_results:
        fieldnames = list(all_results[0].keys())
        with open(csv_file, "w", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_results)

    # Save aggregated CSV
    aggregated_csv_file = output_dir / "results_aggregated.csv"
    if aggregated_results:
        agg_fieldnames = list(aggregated_results[0].keys())
        with open(aggregated_csv_file, "w", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=agg_fieldnames)
            writer.writeheader()
            writer.writerows(aggregated_results)

    # Print summary
    print(f"\n{'='*70}")
    print(f"Benchmark Summary")
    print(f"{'='*70}")
    print(f"Total configurations: {len(aggregated_results)}")
    print(f"Total repetitions: {len(all_results)}")
    flaky_count = sum(1 for r in aggregated_results if r.get("is_flaky", False))
    if flaky_count > 0:
        print(f"⚠️  Flaky configurations: {flaky_count}")
    print(f"\nResults saved to:")
    print(f"  - Detailed: {results_file}")
    print(f"  - Aggregated: {aggregated_file}")
    print(f"  - CSV: {csv_file}")
    print(f"  - Aggregated CSV: {aggregated_csv_file}")
    print(f"{'='*70}\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point.
    main()
|
||||
281
scripts/run_multi_dataset_benchmarks.py
Normal file
281
scripts/run_multi_dataset_benchmarks.py
Normal file
@@ -0,0 +1,281 @@
|
||||
"""Run benchmarks across multiple datasets for comparison."""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
|
||||
def prepare_dataset(
    source: str,
    corpus_name: str,
    output_dir: Path,
    limit: int | None = None,
    download: bool = True,
) -> Path | None:
    """Prepare a dataset: download, prepare embeddings, ready for benchmarking.

    Args:
        source: Download source spec passed to scripts/download_corpus.py
            (``beir:`` prefix selects the BEIR path).
        corpus_name: Dataset name; also selects the raw/embedding paths.
        output_dir: Root data directory (``raw/`` and ``embeddings/`` live
            under it).
        limit: Optional cap on documents (forwarded to the helper scripts).
        download: When True, attempt a download if no corpus file exists.

    Returns:
        Path to the corpus JSONL file, or None on any failure.
    """
    corpus_dir = output_dir / "raw" / corpus_name
    embeddings_dir = output_dir / "embeddings"
    corpus_file = None

    # Find existing corpus file (check multiple possible names)
    possible_files = ["corpus.jsonl", "reviews.jsonl", "business_reviews.jsonl", "pages.jsonl"]
    for filename in possible_files:
        if (corpus_dir / filename).exists():
            corpus_file = corpus_dir / filename
            break

    # Also check beir subdirectory for fiqa
    if corpus_file is None and corpus_name == "fiqa":
        beir_dir = output_dir / "raw" / "beir" / corpus_name
        if (beir_dir / "corpus.jsonl").exists():
            corpus_file = beir_dir / "corpus.jsonl"

    # Download if needed and not exists
    if download and corpus_file is None:
        print(f"\n📥 Downloading {corpus_name}...")
        try:
            # NOTE(review): both branches build an identical command; the
            # only apparent difference is that --limit is appended in the
            # non-beir branch only — confirm whether BEIR downloads are
            # intentionally un-limited, otherwise the branches can be merged.
            if source.startswith("beir:"):
                cmd = [
                    sys.executable,
                    "scripts/download_corpus.py",
                    "--source", source,
                    "--output", str(corpus_dir),
                ]
            else:
                cmd = [
                    sys.executable,
                    "scripts/download_corpus.py",
                    "--source", source,
                    "--output", str(corpus_dir),
                ]
                if limit:
                    cmd.extend(["--limit", str(limit)])

            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode != 0:
                print(f"⚠️  Download failed: {result.stderr}")
                return None

            # Find corpus file after download
            if (corpus_dir / "corpus.jsonl").exists():
                corpus_file = corpus_dir / "corpus.jsonl"
            elif corpus_name == "amazon23" and (corpus_dir / "reviews.jsonl").exists():
                corpus_file = corpus_dir / "reviews.jsonl"
        except Exception as e:
            # Best-effort: report and bail rather than crash the multi-dataset run
            print(f"⚠️  Error downloading {corpus_name}: {e}")
            return None

    if corpus_file is None or not corpus_file.exists():
        print(f"⚠️  Corpus file not found for {corpus_name}")
        return None

    # Generate embeddings if they are not already cached
    emb_file = embeddings_dir / f"{corpus_name}.npy"
    if not emb_file.exists():
        print(f"\n🔢 Preparing embeddings for {corpus_name}...")
        embeddings_dir.mkdir(parents=True, exist_ok=True)
        cmd = [
            sys.executable,
            "scripts/prepare_embeddings.py",
            "--input", str(corpus_file),
            "--output", str(emb_file),
            "--dim", "384",
            "--seed", "42",
        ]
        if limit:
            cmd.extend(["--limit", str(limit)])

        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            print(f"⚠️  Embedding preparation failed: {result.stderr}")
            return None

    return corpus_file
|
||||
|
||||
|
||||
def run_benchmarks_for_dataset(
|
||||
corpus_name: str,
|
||||
corpus_file: Path,
|
||||
emb_file: Path,
|
||||
sizes: list[str],
|
||||
ef_values: list[int],
|
||||
M_values: list[int],
|
||||
num_queries: int = 50, # Reduced for faster multi-dataset runs
|
||||
output_dir: Path = Path("benchmarks/results"),
|
||||
) -> Path | None:
|
||||
"""Run benchmarks for a single dataset."""
|
||||
print(f"\n🚀 Running benchmarks for {corpus_name}...")
|
||||
|
||||
cmd = [
|
||||
sys.executable,
|
||||
"scripts/run_benchmarks.py",
|
||||
"--corpus", corpus_name,
|
||||
"--corpus-file", str(corpus_file),
|
||||
"--emb-file", str(emb_file),
|
||||
"--sizes", *sizes,
|
||||
"--ef", *[str(e) for e in ef_values],
|
||||
"--M", *[str(m) for m in M_values],
|
||||
"--num-queries", str(num_queries),
|
||||
"--output-dir", str(output_dir),
|
||||
]
|
||||
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
if result.returncode != 0:
|
||||
print(f"⚠️ Benchmark failed for {corpus_name}: {result.stderr}")
|
||||
return None
|
||||
|
||||
# Find the results directory
|
||||
results_dir = output_dir / corpus_name
|
||||
if results_dir.exists():
|
||||
timestamp_dirs = sorted([d for d in results_dir.iterdir() if d.is_dir()], key=lambda x: x.name)
|
||||
if timestamp_dirs:
|
||||
return timestamp_dirs[-1] / "results.json"
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def main():
    """CLI driver: prepare each requested dataset, run its benchmarks, and
    save a combined JSON of all per-dataset results.

    Datasets that fail preparation or benchmarking are skipped with a
    warning; the combined file is only written if at least one dataset
    produced results.
    """
    parser = argparse.ArgumentParser(description="Run benchmarks across multiple datasets")
    parser.add_argument(
        "--datasets",
        nargs="+",
        default=["fiqa", "amazon23", "msmarco"],
        help="Datasets to benchmark"
    )
    parser.add_argument(
        "--sizes",
        nargs="+",
        default=["10k", "25k", "50k"],
        help="Corpus sizes (e.g., 10k 25k 50k)"
    )
    parser.add_argument(
        "--ef",
        nargs="+",
        type=int,
        default=[50, 100],
        help="HNSW efSearch values"
    )
    parser.add_argument(
        "--M",
        nargs="+",
        type=int,
        default=[8, 16],
        help="HNSW M values"
    )
    parser.add_argument(
        "--num-queries",
        type=int,
        default=50,
        help="Number of queries per benchmark"
    )
    parser.add_argument(
        "--skip-download",
        action="store_true",
        help="Skip downloading datasets (use existing)"
    )
    parser.add_argument(
        "--limit",
        type=int,
        help="Limit documents per dataset (for large datasets)"
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("benchmarks/results"),
        help="Output directory"
    )

    args = parser.parse_args()

    # Dataset sources mapping: dataset name -> download source spec
    # understood by scripts/download_corpus.py.
    dataset_sources = {
        "fiqa": "beir:fiqa",
        "amazon23": "amazon23",
        "msmarco": "msmarco",
    }

    data_dir = Path("data")
    embeddings_dir = data_dir / "embeddings"
    embeddings_dir.mkdir(parents=True, exist_ok=True)

    # corpus_name -> parsed results.json contents, filled per dataset below.
    results = {}

    print("=" * 70)
    print("Multi-Dataset Benchmark Runner")
    print("=" * 70)
    print(f"Datasets: {', '.join(args.datasets)}")
    print(f"Sizes: {', '.join(args.sizes)}")
    print(f"efSearch: {', '.join(map(str, args.ef))}")
    print(f"M: {', '.join(map(str, args.M))}")
    print("=" * 70)

    for corpus_name in args.datasets:
        if corpus_name not in dataset_sources:
            print(f"⚠️  Unknown dataset: {corpus_name}, skipping")
            continue

        source = dataset_sources[corpus_name]
        # --limit only applies to the large datasets; fiqa is used whole.
        limit = args.limit if corpus_name in ["amazon23", "msmarco"] else None

        # Prepare dataset (download + embeddings); returns None on failure.
        corpus_file = prepare_dataset(
            source=source,
            corpus_name=corpus_name,
            output_dir=data_dir,
            limit=limit,
            download=not args.skip_download,
        )

        if corpus_file is None:
            print(f"⚠️  Skipping {corpus_name} - preparation failed")
            continue

        # Check embeddings exist even after preparation, as a final guard.
        emb_file = embeddings_dir / f"{corpus_name}.npy"
        if not emb_file.exists():
            print(f"⚠️  Embeddings not found for {corpus_name}, skipping")
            continue

        # Run benchmarks; returns the path to results.json or None.
        results_file = run_benchmarks_for_dataset(
            corpus_name=corpus_name,
            corpus_file=corpus_file,
            emb_file=emb_file,
            sizes=args.sizes,
            ef_values=args.ef,
            M_values=args.M,
            num_queries=args.num_queries,
            output_dir=args.output_dir,
        )

        if results_file and results_file.exists():
            with open(results_file) as f:
                results[corpus_name] = json.load(f)
            print(f"✓ {corpus_name} benchmarks completed")
        else:
            print(f"⚠️  {corpus_name} benchmarks incomplete")

    # Save combined results, keyed by dataset name, in a timestamped file.
    if results:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        combined_file = args.output_dir / f"multi_dataset_{timestamp}.json"
        combined_file.parent.mkdir(parents=True, exist_ok=True)
        with open(combined_file, "w") as f:
            json.dump(results, f, indent=2)
        print(f"\n✓ Combined results saved to {combined_file}")

    print("\n" + "=" * 70)
    print("Multi-dataset benchmarks completed!")
    print("=" * 70)
|
||||
|
||||
|
||||
# Script entry point: run the multi-dataset benchmark driver.
if __name__ == "__main__":
    main()
|
||||
|
||||
306
scripts/security_scan.py
Normal file
306
scripts/security_scan.py
Normal file
@@ -0,0 +1,306 @@
|
||||
"""Security scanning script using Bandit and pip-audit.
|
||||
|
||||
This script runs security scans to identify vulnerabilities.
|
||||
Note: Requires bandit and pip-audit to be installed.
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def run_bandit(output_dir: Path) -> bool:
    """
    Run Bandit security scanner over the ``llmds`` package.

    Writes a JSON report and a text report into *output_dir*, prints a
    severity summary, and lists (up to 10) HIGH severity findings.

    Args:
        output_dir: Directory to save results

    Returns:
        True if the scan completed and found zero HIGH severity issues
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    json_output = output_dir / "bandit_report.json"
    txt_output = output_dir / "bandit_report.txt"

    print("Running Bandit security scanner...")
    print("=" * 80)

    try:
        # Run Bandit twice: once for machine-readable JSON, once for the
        # human-readable text report.  check=False because Bandit exits
        # non-zero when it finds issues, which is not an execution failure.
        for fmt, out_path in (("json", json_output), ("txt", txt_output)):
            subprocess.run(
                [
                    sys.executable, "-m", "bandit",
                    "-r", "llmds",
                    "-f", fmt,
                    "-o", str(out_path),
                ],
                capture_output=True,
                text=True,
                check=False,
            )

        # Parse results from the JSON report (absent if Bandit failed to run).
        if json_output.exists():
            with open(json_output) as f:
                bandit_data = json.load(f)

            # Bandit aggregates per-severity counts under metrics["_totals"].
            metrics = bandit_data.get("metrics", {})
            total = metrics.get("_totals", {})

            print("\nBandit Results:")
            print(f"  HIGH: {total.get('SEVERITY.HIGH', 0)} issues")
            print(f"  MEDIUM: {total.get('SEVERITY.MEDIUM', 0)} issues")
            print(f"  LOW: {total.get('SEVERITY.LOW', 0)} issues")
            print(f"  Total: {total.get('CONFIDENCE.HIGH', 0)} high confidence issues")

            # List high severity issues (first 10 only, to bound output size)
            high_severity = [
                issue for issue in bandit_data.get("results", [])
                if issue.get("issue_severity") == "HIGH"
            ]

            if high_severity:
                print(f"\n  HIGH Severity Issues ({len(high_severity)}):")
                for issue in high_severity[:10]:
                    print(f"    - {issue.get('test_id')}: {issue.get('test_name')}")
                    print(f"      File: {issue.get('filename')}:{issue.get('line_number')}")

            print(f"\n  Full report: {txt_output}")
            print(f"  JSON report: {json_output}")

            return total.get("SEVERITY.HIGH", 0) == 0
        else:
            print("  Warning: Bandit JSON output not found")
            return False

    except FileNotFoundError:
        print("  Error: Bandit not installed. Install with: pip install bandit[toml]")
        return False
    except Exception as e:
        print(f"  Error running Bandit: {e}")
        return False
|
||||
|
||||
|
||||
def run_pip_audit(output_dir: Path) -> bool:
    """
    Run pip-audit to check for known vulnerabilities in dependencies.

    Args:
        output_dir: Directory to save results

    Returns:
        True if no HIGH/CRITICAL vulnerabilities found
    """

    def _severity(vuln: dict) -> str:
        """Best-effort severity extraction from a vulnerability record.

        ``aliases`` may be absent, an empty list, or a list of plain ID
        strings rather than dicts — all of those yield "" instead of raising
        (the previous ``vuln.get("aliases", [{}])[0]`` crashed on an empty
        list, since the default only applies when the key is missing).
        """
        aliases = vuln.get("aliases") or [{}]
        first = aliases[0]
        return first.get("severity", "").upper() if isinstance(first, dict) else ""

    output_dir.mkdir(parents=True, exist_ok=True)
    json_output = output_dir / "pip_audit_report.json"
    txt_output = output_dir / "pip_audit_report.txt"

    print("\nRunning pip-audit security scanner...")
    print("=" * 80)

    result = None  # initialized so the except blocks can safely inspect it
    try:
        # Run pip-audit for the JSON report; non-zero exit just means
        # vulnerabilities were found, so check=False.
        result = subprocess.run(
            [
                sys.executable, "-m", "pip_audit",
                "--format", "json",
                "--output", str(json_output),
            ],
            capture_output=True,
            text=True,
            check=False,
        )

        # Also generate the human-readable text report.
        subprocess.run(
            [
                sys.executable, "-m", "pip_audit",
                "--format", "text",
                "--output", str(txt_output),
            ],
            capture_output=True,
            text=True,
            check=False,
        )

        # Parse results from the JSON report (absent if pip-audit failed).
        if json_output.exists():
            with open(json_output) as f:
                audit_data = json.load(f)

            # NOTE(review): newer pip-audit versions nest findings under
            # "dependencies"/"vulns" instead of a top-level "vulnerabilities"
            # key — confirm against the pinned pip-audit version.
            vulnerabilities = audit_data.get("vulnerabilities", [])
            high_critical = [
                v for v in vulnerabilities
                if _severity(v) in ("HIGH", "CRITICAL")
            ]

            print("\npip-audit Results:")
            print(f"  Total vulnerabilities: {len(vulnerabilities)}")
            print(f"  HIGH/CRITICAL: {len(high_critical)}")

            if high_critical:
                print("\n  HIGH/CRITICAL Vulnerabilities:")
                for vuln in high_critical[:10]:  # Show first 10
                    package = vuln.get("name", "unknown")
                    severity = _severity(vuln) or "UNKNOWN"
                    print(f"    - {package}: {severity}")
                    if "versions" in vuln:
                        print(f"      Affected versions: {vuln['versions']}")

            print(f"\n  Full report: {txt_output}")
            print(f"  JSON report: {json_output}")

            return len(high_critical) == 0
        else:
            print("  Warning: pip-audit JSON output not found")
            # Surface the scanner's own error output, if any
            if result.stderr:
                print(f"  Error output: {result.stderr}")
            return False

    except FileNotFoundError:
        print("  Error: pip-audit not installed. Install with: pip install pip-audit")
        return False
    except Exception as e:
        print(f"  Error running pip-audit: {e}")
        # result is still None if subprocess.run itself raised; guarding
        # avoids a NameError that would mask the original exception.
        if result is not None and result.stderr:
            print(f"  Error output: {result.stderr}")
        return False
|
||||
|
||||
|
||||
def generate_sbom(output_dir: Path) -> bool:
    """
    Generate Software Bill of Materials (SBOM) using pip-audit.

    Args:
        output_dir: Directory to save SBOM

    Returns:
        True if SBOM generated successfully
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    sbom_output = output_dir / "sbom.json"

    print("\nGenerating SBOM (Software Bill of Materials)...")
    print("=" * 80)

    try:
        # Best-effort SBOM via pip-audit's JSON output; success is judged by
        # whether the output file appears, not by the exit code (check=False).
        # Note: pip-audit may need additional flags for SBOM generation.
        subprocess.run(
            [
                sys.executable, "-m", "pip_audit",
                "--format", "json",
                "--output", str(sbom_output),
            ],
            capture_output=True,
            text=True,
            check=False,
        )

        if sbom_output.exists():
            print(f"  SBOM generated: {sbom_output}")
            print("  Note: For CycloneDX format, consider using cyclonedx-bom or pip-tools")
            return True
        else:
            print("  Warning: SBOM generation may require additional tools")
            print("  Consider using: cyclonedx-py or pip-tools for full SBOM")
            return False

    except Exception as e:
        print(f"  Error generating SBOM: {e}")
        return False
|
||||
|
||||
|
||||
def main():
    """Run all security scans."""
    import argparse

    parser = argparse.ArgumentParser(description="Run security scans")
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("audit/security"),
        help="Directory for security scan results (default: audit/security)",
    )
    # The three --skip-* switches share a shape; register them in a loop.
    for flag, text in (
        ("--skip-bandit", "Skip Bandit scan"),
        ("--skip-pip-audit", "Skip pip-audit scan"),
        ("--skip-sbom", "Skip SBOM generation"),
    ):
        parser.add_argument(flag, action="store_true", help=text)
    args = parser.parse_args()

    print("Security Scanning")
    print("=" * 80)
    print(f"Output directory: {args.output_dir}")
    print()

    results = {}

    # (result key, skip flag, message when skipped, scanner to run)
    scan_plan = [
        ("bandit", args.skip_bandit, "Skipping Bandit scan", run_bandit),
        ("pip_audit", args.skip_pip_audit, "Skipping pip-audit scan", run_pip_audit),
        ("sbom", args.skip_sbom, "Skipping SBOM generation", generate_sbom),
    ]
    for key, skipped, skip_msg, scanner in scan_plan:
        if skipped:
            print(skip_msg)
        else:
            results[key] = scanner(args.output_dir)

    # Summary of each tool's pass/fail status
    print("\n" + "=" * 80)
    print("Summary")
    print("=" * 80)

    for tool, ok in results.items():
        print(f"  {tool}: {'✓ PASSED' if ok else '✗ FAILED'}")

    if all(results.values()):
        print("\n✓ All security scans passed!")
        return 0
    print("\n✗ Some security issues found. Please review reports.")
    return 1
|
||||
|
||||
|
||||
# Script entry point: propagate main()'s status code to the shell.
if __name__ == "__main__":
    sys.exit(main())
|
||||
|
||||
Reference in New Issue
Block a user