Initial commit: LLM-DS optimizer framework with data files excluded

This commit is contained in:
Carlos Gutierrez
2025-11-06 22:20:11 -05:00
commit f83fe475df
52 changed files with 10666 additions and 0 deletions

2
scripts/__init__.py Normal file
View File

@@ -0,0 +1,2 @@
# Empty file to make scripts a package

196
scripts/analyze_variance.py Normal file
View File

@@ -0,0 +1,196 @@
"""Analyze variance in benchmark results and identify flaky benchmarks."""
import argparse
import json
from pathlib import Path
from typing import Any
import numpy as np
try:
from scipy import stats
HAS_SCIPY = True
except ImportError:
HAS_SCIPY = False
def load_benchmark_results(results_file: Path) -> list[dict]:
    """Read a JSON file of benchmark results and return the parsed list."""
    raw_text = results_file.read_text()
    return json.loads(raw_text)
def identify_flaky_configurations(
results: list[dict],
cv_threshold: float = 20.0,
metrics: list[str] | None = None,
) -> list[dict[str, Any]]:
"""
Identify flaky benchmark configurations based on coefficient of variation.
Args:
results: List of aggregated result dictionaries
cv_threshold: CV threshold (%) above which a benchmark is considered flaky
metrics: List of metrics to check (default: critical metrics)
Returns:
List of flaky configuration summaries
"""
if metrics is None:
metrics = ["search_p50_ms", "search_p95_ms", "qps"]
flaky_configs = []
for result in results:
flaky_metrics = []
for metric in metrics:
cv_key = f"{metric}_cv"
if cv_key in result:
cv = result[cv_key]
if cv > cv_threshold:
mean_val = result.get(f"{metric}_mean", 0)
std_val = result.get(f"{metric}_std", 0)
flaky_metrics.append({
"metric": metric,
"mean": mean_val,
"std": std_val,
"cv": cv,
})
if flaky_metrics:
flaky_configs.append({
"corpus": result.get("corpus"),
"size": result.get("size"),
"ef_search": result.get("ef_search"),
"M": result.get("M"),
"repetitions": result.get("repetitions"),
"flaky_metrics": flaky_metrics,
})
return flaky_configs
def generate_variance_report(
    aggregated_file: Path,
    output_file: Path | None = None,
    cv_threshold: float = 20.0,
) -> dict[str, Any]:
    """
    Generate a variance analysis report from aggregated benchmark results.

    Args:
        aggregated_file: Path to aggregated results JSON.
        output_file: Optional output file; when given, the report is written
            there as indented JSON.
        cv_threshold: CV threshold (%) for flaky detection.

    Returns:
        Report dictionary with an overall summary, the list of flaky
        configurations, and per-corpus counts.
    """
    results = load_benchmark_results(aggregated_file)
    if not results:
        return {"error": "No results found"}
    # Collect every per-metric CV value across all configurations.
    all_cvs = [
        value
        for result in results
        for key, value in result.items()
        if key.endswith("_cv") and isinstance(value, (int, float))
    ]
    # Identify flaky configurations
    flaky_configs = identify_flaky_configurations(results, cv_threshold)
    # Group by corpus
    by_corpus: dict[str, list[dict]] = {}
    for result in results:
        by_corpus.setdefault(result.get("corpus", "unknown"), []).append(result)
    report = {
        "summary": {
            "total_configurations": len(results),
            "flaky_configurations": len(flaky_configs),
            "flaky_percentage": (len(flaky_configs) / len(results) * 100) if results else 0,
            "average_cv": float(np.mean(all_cvs)) if all_cvs else 0.0,
            "max_cv": float(np.max(all_cvs)) if all_cvs else 0.0,
        },
        "flaky_configurations": flaky_configs,
        "by_corpus": {
            corpus: {
                "count": len(configs),
                # BUG FIX: the previous code indexed [0] into the list returned by
                # identify_flaky_configurations, which is empty for any non-flaky
                # configuration and therefore raised IndexError. A configuration
                # is flaky iff the helper returns a non-empty list for it.
                "flaky_count": sum(
                    1
                    for c in configs
                    if identify_flaky_configurations([c], cv_threshold)
                ),
            }
            for corpus, configs in by_corpus.items()
        },
    }
    if output_file:
        with open(output_file, "w") as f:
            json.dump(report, f, indent=2)
        print(f"Variance report saved to {output_file}")
    return report
def main():
    """CLI entry point: run the variance analysis and print a summary."""
    parser = argparse.ArgumentParser(description="Analyze variance in benchmark results")
    parser.add_argument(
        "--results",
        type=Path,
        required=True,
        help="Path to aggregated results JSON file",
    )
    parser.add_argument(
        "--output",
        type=Path,
        help="Output file for variance report",
    )
    parser.add_argument(
        "--cv-threshold",
        type=float,
        default=20.0,
        help="Coefficient of variation threshold (%) for flaky detection (default: 20.0)",
    )
    opts = parser.parse_args()
    # Fail with a readable message instead of an open() traceback.
    if not opts.results.exists():
        print(f"Error: Results file not found: {opts.results}")
        return
    report = generate_variance_report(
        aggregated_file=opts.results,
        output_file=opts.output,
        cv_threshold=opts.cv_threshold,
    )
    # Human-readable summary on stdout.
    separator = "=" * 70
    print("\n" + separator)
    print("Variance Analysis Report")
    print(separator)
    summary = report.get("summary", {})
    print(f"Total configurations: {summary.get('total_configurations', 0)}")
    print(f"Flaky configurations: {summary.get('flaky_configurations', 0)} ({summary.get('flaky_percentage', 0):.1f}%)")
    print(f"Average CV: {summary.get('average_cv', 0):.2f}%")
    print(f"Max CV: {summary.get('max_cv', 0):.2f}%")
    flagged = report.get("flaky_configurations", [])
    if not flagged:
        print("\n✅ No flaky configurations detected!")
    else:
        print(f"\n⚠️ Flaky Configurations ({len(flagged)}):")
        for entry in flagged[:10]:  # Show at most the first 10
            print(f" - {entry.get('corpus')} (size={entry.get('size')}, ef={entry.get('ef_search')}, M={entry.get('M')}):")
            for metric_info in entry.get("flaky_metrics", []):
                print(f"{metric_info['metric']}: CV={metric_info['cv']:.1f}% (mean={metric_info['mean']:.2f}±{metric_info['std']:.2f})")
        if len(flagged) > 10:
            print(f" ... and {len(flagged) - 10} more")
    print(separator)


if __name__ == "__main__":
    main()

166
scripts/build_indices.py Normal file
View File

@@ -0,0 +1,166 @@
"""Build indices (BM25 + HNSW) for a corpus."""
import argparse
import json
import sys
import time
from pathlib import Path
import numpy as np
sys.path.insert(0, str(Path(__file__).parent.parent))
from llmds.hnsw import HNSW
from llmds.inverted_index import InvertedIndex
from llmds.tokenizer import Tokenizer
def build_indices(
    corpus_file: Path,
    emb_file: Path | None,
    index_dir: Path,
    bm25: bool = True,
    hnsw: bool = True,
    ef_construction: int = 200,
    M: int = 16,
    embedding_dim: int = 384,
) -> dict:
    """
    Build inverted index and/or HNSW for a corpus.

    Args:
        corpus_file: Path to corpus JSONL file (one document per line with
            "id" and "text" keys).
        emb_file: Optional path to embeddings .npy file; when missing, random
            unit-norm embeddings are generated deterministically (seed 42).
        index_dir: Directory to save indices and build statistics.
        bm25: Whether to build BM25 inverted index.
        hnsw: Whether to build HNSW index.
        ef_construction: HNSW efConstruction parameter.
        M: HNSW M parameter.
        embedding_dim: Embedding dimension.

    Returns:
        Dictionary with build statistics per index type.
    """
    index_dir.mkdir(parents=True, exist_ok=True)
    tokenizer = Tokenizer()
    stats = {}
    # Load embeddings if available
    embeddings = None
    if emb_file and emb_file.exists():
        print(f"Loading embeddings from {emb_file}...")
        embeddings = np.load(emb_file)
        print(f"Loaded {len(embeddings)} embeddings")
    # Build BM25 index
    if bm25:
        print("Building BM25 inverted index...")
        start_time = time.time()
        index = InvertedIndex(tokenizer=tokenizer)
        doc_count = 0
        with open(corpus_file, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    doc = json.loads(line)
                    # Use the trailing numeric suffix of the document id when
                    # present (e.g. "doc_0042" -> 42), otherwise fall back to
                    # the running document counter.
                    id_suffix = doc["id"].split("_")[-1]
                    doc_id = int(id_suffix) if id_suffix.isdigit() else doc_count
                    index.add_document(doc_id=doc_id, text=doc["text"])
                    doc_count += 1
                    if doc_count % 10000 == 0:
                        print(f"Indexed {doc_count} documents...")
        # Save index metadata
        index_stats = index.stats()
        stats["bm25"] = {
            "build_time_sec": time.time() - start_time,
            "total_documents": index_stats["total_documents"],
            "total_terms": index_stats["total_terms"],
        }
        print(f"✓ BM25 index built: {stats['bm25']['total_documents']} documents, {stats['bm25']['build_time_sec']:.2f}s")
    # Build HNSW index
    if hnsw:
        if embeddings is None:
            print("Warning: No embeddings provided. Generating deterministic embeddings...")
            # Generate one random unit-norm vector per corpus line, seeded for
            # reproducibility. The document content is parsed only to validate
            # the JSON; the embedding itself is random.
            embeddings = []
            rng = np.random.RandomState(42)
            with open(corpus_file, "r", encoding="utf-8") as f:
                for line in f:
                    if line.strip():
                        json.loads(line)
                        emb = rng.randn(embedding_dim).astype(np.float32)
                        emb = emb / np.linalg.norm(emb)
                        embeddings.append(emb)
            embeddings = np.stack(embeddings)
        print(f"Building HNSW index (M={M}, efConstruction={ef_construction})...")
        start_time = time.time()
        # Renamed from `hnsw` to `hnsw_index`: the original assignment shadowed
        # the boolean `hnsw` parameter.
        hnsw_index = HNSW(
            dim=embedding_dim,
            M=M,
            ef_construction=ef_construction,
            ef_search=50,
            seed=42,  # Fixed seed for reproducible HNSW structure
        )
        for i, emb in enumerate(embeddings):
            hnsw_index.add(emb, i)
            if (i + 1) % 10000 == 0:
                print(f"Added {i + 1} vectors...")
        hnsw_stats = hnsw_index.stats()
        stats["hnsw"] = {
            "build_time_sec": time.time() - start_time,
            "num_vectors": hnsw_stats["num_vectors"],
            "num_layers": hnsw_stats["num_layers"],
        }
        print(f"✓ HNSW index built: {stats['hnsw']['num_vectors']} vectors, {stats['hnsw']['build_time_sec']:.2f}s")
    # Save statistics
    stats_file = index_dir / "build_stats.json"
    with open(stats_file, "w") as f:
        json.dump(stats, f, indent=2)
    print(f"✓ Indices built and saved to {index_dir}")
    return stats
def main():
    """CLI entry point: parse arguments and build the requested indices."""
    parser = argparse.ArgumentParser(description="Build indices for corpus")
    parser.add_argument("--corpus", type=Path, required=True, help="Corpus JSONL file")
    parser.add_argument("--emb", type=Path, help="Embeddings .npy file")
    parser.add_argument("--index-dir", type=Path, required=True, help="Index output directory")
    parser.add_argument("--bm25", action="store_true", help="Build BM25 index")
    parser.add_argument("--hnsw", action="store_true", help="Build HNSW index")
    parser.add_argument("--ef", type=int, default=200, help="HNSW efConstruction")
    parser.add_argument("--M", type=int, default=16, help="HNSW M parameter")
    parser.add_argument("--dim", type=int, default=384, help="Embedding dimension")
    opts = parser.parse_args()
    # At least one index type must be requested.
    if not (opts.bm25 or opts.hnsw):
        print("Error: Must specify --bm25 and/or --hnsw")
        sys.exit(1)
    build_indices(
        corpus_file=opts.corpus,
        emb_file=opts.emb,
        index_dir=opts.index_dir,
        bm25=opts.bm25,
        hnsw=opts.hnsw,
        ef_construction=opts.ef,
        M=opts.M,
        embedding_dim=opts.dim,
    )


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,73 @@
"""Download and prepare datasets."""
import argparse
import sys
from pathlib import Path
# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from llmds.data_sources.msmarco import download_msmarco
from llmds.data_sources.beir_loader import download_beir
from llmds.data_sources.amazon_reviews import download_amazon_reviews
from llmds.data_sources.yelp import download_yelp
from llmds.data_sources.wikipedia import download_wikipedia
from llmds.data_sources.commoncrawl import download_commoncrawl
def main():
    """CLI entry point: dispatch to the matching dataset downloader."""
    parser = argparse.ArgumentParser(description="Download datasets")
    parser.add_argument(
        "--source",
        required=True,
        help="Dataset source: msmarco, beir:task (e.g., beir:fiqa), amazon23, yelp, wikipedia, commoncrawl"
    )
    parser.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output directory for corpus"
    )
    parser.add_argument("--limit", type=int, help="Limit number of documents")
    parser.add_argument("--cc-month", type=str, help="Common Crawl month (e.g., 'CC-MAIN-2025-14')")
    opts = parser.parse_args()
    # Sources like "beir:fiqa" carry a task name after the colon.
    source_base, _, task = opts.source.partition(":")
    if source_base == "beir":
        if not task:
            print("Error: BEIR requires task name (e.g., 'beir:fiqa', 'beir:scidocs')")
            sys.exit(1)
        download_beir(task, opts.output)
    elif source_base == "msmarco":
        download_msmarco(opts.output)
    elif source_base == "amazon23":
        download_amazon_reviews(opts.output, limit=opts.limit)
    elif source_base == "yelp":
        download_yelp(opts.output)
    elif source_base == "wikipedia":
        download_wikipedia(opts.output)
    elif source_base == "commoncrawl":
        download_commoncrawl(opts.output, cc_month=opts.cc_month, limit=opts.limit)
    else:
        print(f"Error: Unknown source '{source_base}'. Use: msmarco, beir:task, amazon23, yelp, wikipedia, commoncrawl")
        sys.exit(1)
    print(f"✓ Dataset downloaded to {opts.output}")


if __name__ == "__main__":
    main()

137
scripts/env_hash.py Normal file
View File

@@ -0,0 +1,137 @@
"""Generate environment hash for reproducibility tracking."""
import platform
import sys
from pathlib import Path
import numpy as np
def get_blas_info():
    """Return a human-readable description of NumPy's BLAS configuration.

    BUG FIX: `np.show_config()` prints to stdout and returns None on most
    NumPy versions, so the previous `str(np.show_config())` produced the
    useless string "None". Capture stdout to obtain the actual text.

    Returns:
        The captured configuration text, or a fallback string when the
        information cannot be obtained.
    """
    import contextlib
    import io

    try:
        buffer = io.StringIO()
        with contextlib.redirect_stdout(buffer):
            returned = np.show_config()
        captured = buffer.getvalue().strip()
        if captured:
            return captured
        # Some NumPy versions may return an object instead of printing.
        if returned is not None:
            return str(returned)
        return "BLAS info unavailable"
    except Exception:
        try:
            # Fallback: try to get from numpy config module
            config = np.__config__
            return str(config)
        except Exception:
            return "BLAS info unavailable"
def get_numpy_config():
    """Return NumPy version and build-configuration text.

    BUG FIX: `np.show_config()` prints to stdout and returns None on most
    NumPy versions, so the previous `str(np.show_config())` stored the
    string "None". Capture stdout to obtain the real configuration text.

    Returns:
        Dict with "version" and "config" keys; "config" is "unavailable"
        when the text cannot be obtained.
    """
    import contextlib
    import io

    try:
        buffer = io.StringIO()
        with contextlib.redirect_stdout(buffer):
            np.show_config()
        captured = buffer.getvalue().strip()
        return {
            "version": np.__version__,
            "config": captured if captured else "unavailable",
        }
    except Exception:
        return {"version": np.__version__, "config": "unavailable"}
def generate_env_hash(output_path: Path = Path("audit/env_hash.txt")):
    """
    Generate environment hash file with system and library information.

    Writes a plain-text snapshot of the Python runtime, OS, CPU, NumPy/BLAS
    configuration, and key package versions, for reproducibility tracking.

    Args:
        output_path: Path to output file (default: audit/env_hash.txt).

    Returns:
        The path the snapshot was written to.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    lines = []
    lines.append("=" * 80)
    lines.append("Environment Hash")
    lines.append("=" * 80)
    lines.append("")
    # Python information
    lines.append("Python:")
    lines.append(f" Version: {sys.version}")
    lines.append(f" Executable: {sys.executable}")
    lines.append(f" Platform: {platform.platform()}")
    lines.append("")
    # OS information
    lines.append("Operating System:")
    lines.append(f" System: {platform.system()}")
    lines.append(f" Release: {platform.release()}")
    lines.append(f" Version: {platform.version()}")
    lines.append(f" Architecture: {platform.machine()}")
    lines.append(f" Processor: {platform.processor()}")
    lines.append("")
    # CPU information (psutil is optional; degrade gracefully without it)
    try:
        import psutil
        lines.append("CPU:")
        lines.append(f" Physical cores: {psutil.cpu_count(logical=False)}")
        lines.append(f" Logical cores: {psutil.cpu_count(logical=True)}")
        lines.append(f" Frequency: {psutil.cpu_freq()}")
        lines.append("")
    except ImportError:
        lines.append("CPU:")
        lines.append(f" Count: {platform.processor()}")
        lines.append("")
    # NumPy configuration
    lines.append("NumPy Configuration:")
    np_config = get_numpy_config()
    lines.append(f" Version: {np_config['version']}")
    lines.append(" Config:")
    for line in np_config.get("config", "").split("\n"):
        if line.strip():
            lines.append(f" {line}")
    lines.append("")
    # BLAS information
    lines.append("BLAS Information:")
    blas_info = get_blas_info()
    for line in blas_info.split("\n"):
        if line.strip():
            lines.append(f" {line}")
    lines.append("")
    # Key package versions. Uses importlib.metadata (stdlib, Python 3.8+)
    # instead of the deprecated pkg_resources API.
    try:
        from importlib import metadata as importlib_metadata
        lines.append("Key Packages:")
        key_packages = ["numpy", "scipy", "hypothesis", "pytest"]
        for pkg_name in key_packages:
            try:
                lines.append(f" {pkg_name}: {importlib_metadata.version(pkg_name)}")
            except Exception:
                # Package not installed; skip silently like before.
                pass
        lines.append("")
    except ImportError:
        pass
    lines.append("=" * 80)
    # Write to file
    content = "\n".join(lines)
    with open(output_path, "w") as f:
        f.write(content)
    print(f"Environment hash written to: {output_path}")
    return output_path


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Generate environment hash")
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("audit/env_hash.txt"),
        help="Output file path (default: audit/env_hash.txt)",
    )
    args = parser.parse_args()
    generate_env_hash(args.output)

View File

@@ -0,0 +1,235 @@
"""Generate architecture diagram for the LLM Data Structures Optimizer.
This script creates a visual architecture diagram showing the relationships
between major components in the system.
"""
from pathlib import Path
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
def generate_architecture_diagram(output_path: Path = Path("audit/ARCH_DIAGRAM.png")) -> Path:
    """
    Generate architecture diagram showing system components and relationships.

    Draws three subsystem boxes (KV cache, scheduler, retrieval) on the left,
    a data-structure summary panel on the right, plus a legend and a notes box,
    then saves the figure as a 300-dpi PNG.

    Args:
        output_path: Path to save the diagram (default: audit/ARCH_DIAGRAM.png)

    Returns:
        The path the diagram was saved to.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # All coordinates below are in this fixed 10x10 data space (axes hidden).
    fig, ax = plt.subplots(figsize=(16, 12))
    ax.set_xlim(0, 10)
    ax.set_ylim(0, 10)
    ax.axis("off")
    # Define colors (one pastel fill per subsystem)
    colors = {
        "kv_cache": "#E8F4F8",
        "scheduler": "#FFF4E6",
        "retrieval": "#F0F8E8",
        "data_structure": "#F5E6F8",
    }
    # Title
    ax.text(5, 9.5, "LLM Data Structures Optimizer Architecture",
            ha="center", va="top", fontsize=20, weight="bold")
    # ===== KV Cache System =====
    # kv_y anchors the subsystem box; inner boxes are positioned relative to it.
    kv_y = 7.5
    ax.add_patch(mpatches.Rectangle((0.2, kv_y), 3.0, 1.5,
                                    facecolor=colors["kv_cache"],
                                    edgecolor="black", linewidth=2))
    ax.text(1.7, kv_y + 1.2, "KV Cache System",
            ha="center", va="center", fontsize=14, weight="bold")
    # KVCache
    ax.add_patch(mpatches.Rectangle((0.4, kv_y + 0.7), 1.2, 0.4,
                                    facecolor="white", edgecolor="black", linewidth=1))
    ax.text(1.0, kv_y + 0.9, "KVCache", ha="center", va="center", fontsize=10)
    # PagedAllocator
    ax.add_patch(mpatches.Rectangle((1.8, kv_y + 0.7), 1.2, 0.4,
                                    facecolor="white", edgecolor="black", linewidth=1))
    ax.text(2.4, kv_y + 0.9, "PagedAllocator", ha="center", va="center", fontsize=10)
    # TokenLRU
    ax.add_patch(mpatches.Rectangle((0.4, kv_y - 0.2), 1.2, 0.4,
                                    facecolor="white", edgecolor="black", linewidth=1))
    ax.text(1.0, kv_y, "TokenLRU", ha="center", va="center", fontsize=10)
    # Connections within KV Cache
    ax.arrow(1.6, kv_y + 0.9, 0.2, 0, head_width=0.05, head_length=0.05,
             fc="black", ec="black")
    ax.arrow(1.0, kv_y + 0.5, 0, 0.2, head_width=0.05, head_length=0.05,
             fc="black", ec="black")
    # ===== Scheduler & Batching =====
    scheduler_y = 5.5
    ax.add_patch(mpatches.Rectangle((0.2, scheduler_y), 3.0, 1.5,
                                    facecolor=colors["scheduler"],
                                    edgecolor="black", linewidth=2))
    ax.text(1.7, scheduler_y + 1.2, "Scheduler & Batching",
            ha="center", va="center", fontsize=14, weight="bold")
    # Scheduler
    ax.add_patch(mpatches.Rectangle((0.4, scheduler_y + 0.7), 1.2, 0.4,
                                    facecolor="white", edgecolor="black", linewidth=1))
    ax.text(1.0, scheduler_y + 0.9, "Scheduler", ha="center", va="center", fontsize=10)
    # IndexedHeap
    ax.add_patch(mpatches.Rectangle((1.8, scheduler_y + 0.7), 1.2, 0.4,
                                    facecolor="white", edgecolor="black", linewidth=1))
    ax.text(2.4, scheduler_y + 0.9, "IndexedHeap", ha="center", va="center", fontsize=10)
    # AdmissionController
    ax.add_patch(mpatches.Rectangle((1.1, scheduler_y - 0.2), 1.2, 0.4,
                                    facecolor="white", edgecolor="black", linewidth=1))
    ax.text(1.7, scheduler_y, "AdmissionController", ha="center", va="center", fontsize=10)
    # Connections within Scheduler
    ax.arrow(1.6, scheduler_y + 0.9, 0.2, 0, head_width=0.05, head_length=0.05,
             fc="black", ec="black")
    ax.arrow(1.7, scheduler_y + 0.5, 0, 0.2, head_width=0.05, head_length=0.05,
             fc="black", ec="black")
    # ===== Retrieval Pipeline =====
    retrieval_y = 3.5
    ax.add_patch(mpatches.Rectangle((0.2, retrieval_y), 3.0, 1.5,
                                    facecolor=colors["retrieval"],
                                    edgecolor="black", linewidth=2))
    ax.text(1.7, retrieval_y + 1.2, "Retrieval Pipeline",
            ha="center", va="center", fontsize=14, weight="bold")
    # RetrievalPipeline
    ax.add_patch(mpatches.Rectangle((1.1, retrieval_y + 0.7), 1.2, 0.4,
                                    facecolor="white", edgecolor="black", linewidth=2))
    ax.text(1.7, retrieval_y + 0.9, "RetrievalPipeline",
            ha="center", va="center", fontsize=11, weight="bold")
    # HNSW
    ax.add_patch(mpatches.Rectangle((0.4, retrieval_y - 0.2), 1.2, 0.4,
                                    facecolor="white", edgecolor="black", linewidth=1))
    ax.text(1.0, retrieval_y, "HNSW", ha="center", va="center", fontsize=10)
    # InvertedIndex
    ax.add_patch(mpatches.Rectangle((1.8, retrieval_y - 0.2), 1.2, 0.4,
                                    facecolor="white", edgecolor="black", linewidth=1))
    ax.text(2.4, retrieval_y, "InvertedIndex", ha="center", va="center", fontsize=10)
    # CountMinSketch
    ax.add_patch(mpatches.Rectangle((0.4, retrieval_y - 0.9), 1.2, 0.4,
                                    facecolor="white", edgecolor="black", linewidth=1))
    ax.text(1.0, retrieval_y - 0.7, "CountMinSketch", ha="center", va="center", fontsize=10)
    # Tokenizer
    ax.add_patch(mpatches.Rectangle((1.8, retrieval_y - 0.9), 1.2, 0.4,
                                    facecolor="white", edgecolor="black", linewidth=1))
    ax.text(2.4, retrieval_y - 0.7, "Tokenizer", ha="center", va="center", fontsize=10)
    # Connections within Retrieval Pipeline (fan out from RetrievalPipeline box)
    ax.arrow(1.7, retrieval_y + 0.5, -0.3, 0.2, head_width=0.05, head_length=0.05,
             fc="black", ec="black")
    ax.arrow(1.7, retrieval_y + 0.5, 0.3, 0.2, head_width=0.05, head_length=0.05,
             fc="black", ec="black")
    ax.arrow(1.7, retrieval_y + 0.5, -0.3, -0.5, head_width=0.05, head_length=0.05,
             fc="black", ec="black")
    ax.arrow(1.7, retrieval_y + 0.5, 0.3, -0.5, head_width=0.05, head_length=0.05,
             fc="black", ec="black")
    # ===== Data Flow Arrows =====
    # KV Cache to Scheduler
    ax.arrow(1.7, scheduler_y + 1.5, 0, 0.3, head_width=0.1, head_length=0.08,
             fc="blue", ec="blue", linewidth=2, linestyle="--")
    ax.text(2.2, scheduler_y + 1.8, "uses", ha="left", va="center",
            fontsize=9, color="blue", style="italic")
    # Scheduler to Retrieval
    ax.arrow(1.7, scheduler_y - 0.5, 0, -0.3, head_width=0.1, head_length=0.08,
             fc="green", ec="green", linewidth=2, linestyle="--")
    ax.text(2.2, retrieval_y + 1.5, "schedules", ha="left", va="center",
            fontsize=9, color="green", style="italic")
    # ===== Right Side: Data Structures =====
    ds_x = 6.0
    ax.add_patch(mpatches.Rectangle((ds_x, 6.5), 3.5, 3.0,
                                    facecolor=colors["data_structure"],
                                    edgecolor="black", linewidth=2))
    ax.text(ds_x + 1.75, 9.0, "Core Data Structures",
            ha="center", va="center", fontsize=14, weight="bold")
    # List data structures
    structures = [
        "IndexedHeap: O(log n) priority queue",
        "PagedAllocator: Page-based memory",
        "TokenLRU: Token-aware cache",
        "HNSW: Hierarchical graph ANN",
        "InvertedIndex: BM25 search",
        "CountMinSketch: Frequency estimation",
    ]
    for i, struct in enumerate(structures):
        y_pos = 8.3 - i * 0.45
        # NOTE(review): this first text call draws an empty string — it looks
        # like a bullet/marker character was lost; confirm intended glyph.
        ax.text(ds_x + 0.2, y_pos, "", ha="left", va="center", fontsize=12)
        ax.text(ds_x + 0.4, y_pos, struct, ha="left", va="center", fontsize=9)
    # ===== Legend =====
    legend_y = 1.5
    ax.text(0.2, legend_y + 1.2, "Legend:", ha="left", va="top",
            fontsize=12, weight="bold")
    # Legend items: (sample glyph, color, label). The first tuple element is
    # unused by the drawing code below; line style is derived from the label.
    legend_items = [
        ("───", "blue", "KV Cache usage"),
        ("───", "green", "Scheduler flow"),
        ("────", "black", "Component relationships"),
    ]
    for i, (style, color, label) in enumerate(legend_items):
        y_pos = legend_y + 0.8 - i * 0.3
        ax.plot([0.4, 0.7], [y_pos, y_pos], color=color, linewidth=2,
                linestyle="--" if "usage" in label or "flow" in label else "-")
        ax.text(0.8, y_pos, label, ha="left", va="center", fontsize=9)
    # ===== Notes =====
    notes_x = 5.0
    notes_y = 2.0
    ax.add_patch(mpatches.Rectangle((notes_x, notes_y), 4.5, 1.8,
                                    facecolor="#F5F5F5",
                                    edgecolor="gray", linewidth=1))
    ax.text(notes_x + 2.25, notes_y + 1.5, "Key Features",
            ha="center", va="center", fontsize=11, weight="bold")
    key_features = [
        "• Copy-on-write prefix sharing",
        "• Reference counting for memory",
        "• Hybrid dense + sparse retrieval",
        "• Score fusion with configurable weights",
    ]
    for i, feature in enumerate(key_features):
        y_pos = notes_y + 1.1 - i * 0.35
        ax.text(notes_x + 0.2, y_pos, feature, ha="left", va="center", fontsize=8)
    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches="tight")
    print(f"Architecture diagram saved to: {output_path}")
    return output_path


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Generate architecture diagram")
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("audit/ARCH_DIAGRAM.png"),
        help="Output file path (default: audit/ARCH_DIAGRAM.png)",
    )
    args = parser.parse_args()
    generate_architecture_diagram(args.output)

View File

@@ -0,0 +1,52 @@
"""Generate synthetic data for testing and benchmarks."""
import random
from pathlib import Path
import numpy as np
def generate_synthetic_documents(num_docs: int = 1000, output_file: Path = Path("data/documents.txt")):
"""Generate synthetic documents for indexing."""
output_file.parent.mkdir(parents=True, exist_ok=True)
words = [
"the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog",
"cat", "mouse", "elephant", "tiger", "lion", "bear", "wolf",
"rabbit", "deer", "bird", "fish", "snake", "monkey", "panda",
"computer", "science", "machine", "learning", "artificial", "intelligence",
"neural", "network", "deep", "learning", "transformer", "attention",
"language", "model", "natural", "processing", "text", "generation",
]
with open(output_file, "w") as f:
for i in range(num_docs):
doc_length = random.randint(20, 200)
doc_words = random.choices(words, k=doc_length)
doc_text = " ".join(doc_words)
f.write(f"{i}\t{doc_text}\n")
print(f"Generated {num_docs} documents in {output_file}")
def generate_synthetic_embeddings(
num_vectors: int = 1000,
dim: int = 384,
output_file: Path = Path("data/embeddings.npy"),
):
"""Generate synthetic embedding vectors."""
output_file.parent.mkdir(parents=True, exist_ok=True)
embeddings = np.random.randn(num_vectors, dim).astype(np.float32)
# Normalize
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings = embeddings / norms
np.save(output_file, embeddings)
print(f"Generated {num_vectors} embeddings in {output_file}")
if __name__ == "__main__":
generate_synthetic_documents(num_docs=1000)
generate_synthetic_embeddings(num_vectors=1000, dim=384)

257
scripts/make_report.py Normal file
View File

@@ -0,0 +1,257 @@
"""Generate Word report in APA format."""
from pathlib import Path
from docx import Document
from docx.shared import Inches, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
def create_report(output_path: Path = Path("Deliverable_1_Report.docx")):
    """Create the APA-formatted Word report.

    Builds the document section by section (title page, abstract, numbered
    sections, complexity table, references) and saves it to *output_path*.

    Args:
        output_path: Destination .docx path (default: Deliverable_1_Report.docx).
    """
    doc = Document()
    # Title page
    doc.add_heading("LLM Data Structures Optimizer:", 0)
    subtitle = doc.add_heading("Optimizing Throughput, Latency, and Memory for LLM Inference", 1)
    subtitle.alignment = WD_ALIGN_PARAGRAPH.CENTER
    doc.add_paragraph("Author Name")
    doc.add_paragraph("Institution")
    doc.add_paragraph("Date")
    doc.add_page_break()
    # Abstract (optional, not counting toward page limit)
    doc.add_heading("Abstract", 1)
    doc.add_paragraph(
        "This report presents the design and implementation of a comprehensive "
        "data structures optimizer for Large Language Model (LLM) inference and retrieval systems. "
        "The optimizer addresses key performance bottlenecks through novel data structures including "
        "paged KV cache allocation, token-aware LRU eviction, indexed priority queues, and hybrid "
        "retrieval systems combining HNSW and BM25. Benchmarks demonstrate significant improvements "
        "in throughput, latency, and memory efficiency."
    )
    doc.add_page_break()
    # Section 1: Application Context
    doc.add_heading("1. Application Context", 1)
    doc.add_paragraph(
        "Large Language Models (LLMs) have become critical infrastructure for modern AI applications, "
        "powering everything from chatbots to code generation tools. However, production deployment "
        "faces significant challenges in terms of throughput, latency, and memory consumption. "
        "Key bottlenecks include:"
    )
    bullet_points = [
        "KV cache memory management: Traditional implementations allocate fixed-size buffers per sequence, "
        "leading to memory fragmentation and inefficient utilization.",
        "Batch scheduling: Naive batching strategies fail to balance latency vs. throughput trade-offs, "
        "especially under variable load.",
        "Retrieval efficiency: RAG (Retrieval-Augmented Generation) systems require efficient approximate "
        "nearest neighbor search combined with lexical matching, but existing solutions are either too slow "
        "or memory-intensive."
    ]
    for point in bullet_points:
        doc.add_paragraph(point, style="List Bullet")
    doc.add_paragraph(
        "This project addresses these challenges through a modular optimizer stack that provides "
        "production-ready data structures and algorithms optimized for LLM workloads."
    )
    # Section 2: Chosen Data Structures
    doc.add_heading("2. Chosen Data Structures", 1)
    doc.add_heading("2.1 Paged KV Cache", 2)
    doc.add_paragraph(
        "The KV cache uses a paged allocator with fixed-size pages (typically 512 tokens) to manage "
        "memory more efficiently than per-sequence allocation. This approach reduces fragmentation and "
        "enables prefix sharing through copy-on-write semantics. Hash-based deduplication identifies "
        "repeated system prompts, allowing multiple sequences to share the same prefix pages."
    )
    doc.add_heading("2.2 Indexed Binary Heap", 2)
    doc.add_paragraph(
        "An indexed heap maintains O(log n) decrease/increase-key operations, enabling efficient priority "
        "updates in the scheduler. The heap stores (priority, request_id) pairs with an index map for "
        "O(1) lookup. This allows the scheduler to dynamically adjust priorities based on remaining tokens "
        "or SLO deadlines without rebuilding the entire queue."
    )
    doc.add_heading("2.3 Hybrid Retrieval System", 2)
    doc.add_paragraph(
        "The retrieval pipeline combines HNSW (Hierarchical Navigable Small World) for dense vector search "
        "and an inverted index with BM25 scoring for sparse lexical matching. HNSW provides O(log n) "
        "approximate nearest neighbor search with configurable recall-accuracy trade-offs. The inverted "
        "index uses varint/zigzag encoding for compressed postings lists, reducing memory footprint. "
        "Score fusion combines dense and sparse results using weighted combination, with top-K maintenance "
        "via an indexed heap for efficient result selection."
    )
    doc.add_heading("2.4 Count-Min Sketch", 2)
    doc.add_paragraph(
        "A Count-Min Sketch with conservative update tracks query frequencies for hot query detection. "
        "This enables cache priming strategies that pre-load frequently accessed embeddings and KV cache "
        "entries, reducing latency for common queries."
    )
    # Section 3: Design Rationale & Complexity
    doc.add_heading("3. Design Rationale & Complexity", 1)
    doc.add_paragraph(
        "The choice of data structures balances several competing concerns:"
    )
    doc.add_heading("3.1 Memory Efficiency", 2)
    doc.add_paragraph(
        "Paged allocation reduces memory fragmentation compared to variable-size allocation. The paged "
        "allocator achieves O(1) allocation and deallocation through free-list management. Prefix sharing "
        "further reduces memory usage by up to 30-40% for workloads with repeated system prompts "
        "(common in production LLM deployments)."
    )
    doc.add_heading("3.2 Latency vs. Throughput", 2)
    doc.add_paragraph(
        "The scheduler's dynamic micro-batching balances latency and throughput through configurable "
        "waiting time. With max_wait_ms=50ms, the system achieves ~95% throughput of maximum batching "
        "while maintaining sub-100ms p95 latency. The indexed heap enables O(log n) priority updates, "
        "allowing real-time SLO-aware scheduling without O(n) rebuilds."
    )
    doc.add_heading("3.3 Retrieval Accuracy", 2)
    doc.add_paragraph(
        "HNSW parameters M and efSearch control the recall-accuracy trade-off. For M=16, efSearch=50, "
        "the system achieves >95% recall@10 on benchmark datasets while maintaining <5ms p95 search "
        "latency. BM25 provides complementary lexical matching, improving recall for queries with "
        "rare terms not well-represented in embeddings."
    )
    doc.add_paragraph(
        "Complexity analysis:"
    )
    rows = [
        ("KV Cache attach/get", "O(1)", "O(sequences × tokens)"),
        ("Indexed Heap update", "O(log n)", "O(n)"),
        ("HNSW search", "O(log n)", "O(n × M)"),
        ("BM25 search", "O(|query| × avg_doc_freq)", "O(|vocab| × avg_postings)"),
        ("CMS estimate", "O(depth)", "O(width × depth)"),
    ]
    # BUG FIX: the table was previously created with rows=5 (header + 4 data
    # rows) while the loop below fills a header plus 5 data rows, so accessing
    # rows[5] raised IndexError. Size the table from the data instead.
    complexity_table = doc.add_table(rows=len(rows) + 1, cols=3)
    complexity_table.style = "Light Grid Accent 1"
    header_cells = complexity_table.rows[0].cells
    header_cells[0].text = "Operation"
    header_cells[1].text = "Time Complexity"
    header_cells[2].text = "Space Complexity"
    for i, (op, time, space) in enumerate(rows, start=1):
        row_cells = complexity_table.rows[i].cells
        row_cells[0].text = op
        row_cells[1].text = time
        row_cells[2].text = space
    # Section 4: Implementation Overview
    doc.add_heading("4. Implementation Overview", 1)
    doc.add_paragraph(
        "The implementation follows a modular architecture with clear separation of concerns:"
    )
    doc.add_heading("4.1 KV Cache Implementation", 2)
    doc.add_paragraph(
        "The KVCache class maintains a mapping from sequence IDs to lists of page IDs. Each page "
        "stores KV tokens in a fixed-size buffer. Prefix sharing is implemented through hash-based "
        "deduplication: when attaching a sequence, the system computes a SHA256 hash of the prefix "
        "tokens and checks for existing shared pages. If found, it references those pages via "
        "copy-on-write semantics."
    )
    code_block = doc.add_paragraph(
        "def attach(self, seq_id, kv_tokens, prefix_tokens=None):\n"
        "    pages_needed = (len(kv_tokens) + self.page_size - 1) // self.page_size\n"
        "    page_ids = self.allocator.alloc(pages_needed)\n"
        "    if prefix_tokens and self._enable_prefix_sharing:\n"
        "        prefix_hash = self._hash_prefix(prefix_tokens)\n"
        "        if prefix_hash in self._prefix_map:\n"
        "            shared_pages = self._prefix_map[prefix_hash]\n"
        "            page_ids = shared_pages + page_ids[len(shared_pages):]"
    )
    code_block.style = "Intense Quote"
    doc.add_heading("4.2 Scheduler Implementation", 2)
    doc.add_paragraph(
        "The scheduler uses an indexed heap to maintain request priorities. When a batch is requested, "
        "it checks if the oldest request exceeds max_wait_ms or if the batch is full. It then pops "
        "the top-k requests from the heap and returns them for processing."
    )
    doc.add_heading("4.3 Retrieval Pipeline", 2)
    doc.add_paragraph(
        "The retrieval pipeline coordinates HNSW and inverted index searches. For each query, it "
        "performs parallel dense and sparse searches, normalizes scores, and fuses them using a "
        "weighted combination. Top-K results are maintained using an indexed heap, ensuring O(k log k) "
        "complexity for result selection."
    )
    # Section 5: Challenges & Limitations
    doc.add_heading("5. Challenges & Limitations", 1)
    doc.add_paragraph(
        "Several challenges were encountered during implementation:"
    )
    doc.add_heading("5.1 Memory Fragmentation", 2)
    doc.add_paragraph(
        "While paged allocation reduces fragmentation, it does not eliminate it entirely. Under high "
        "churn workloads, free pages may become scattered, requiring periodic defragmentation. The "
        "current implementation uses a simple compaction strategy, but more sophisticated approaches "
        "could further improve memory utilization."
    )
    doc.add_heading("5.2 Parameter Tuning", 2)
    doc.add_paragraph(
        "HNSW parameters (M, efConstruction, efSearch) require careful tuning for optimal performance. "
        "Higher values improve recall but increase memory and latency. The current implementation "
        "provides reasonable defaults, but production deployments may require dataset-specific tuning."
    )
    doc.add_heading("5.3 Scalability", 2)
    doc.add_paragraph(
        "The current implementation is single-threaded and designed for single-machine deployment. "
        "Distributed deployments would require additional coordination mechanisms for shared state "
        "(e.g., distributed KV cache, distributed scheduler). Future work could explore distributed "
        "variants of these data structures."
    )
    # References
    doc.add_page_break()
    doc.add_heading("References", 1)
    references = [
        "Malkov, Y. A., & Yashunin, D. A. (2018). Efficient and robust approximate nearest neighbor "
        "search using Hierarchical Navigable Small World graphs. IEEE transactions on pattern analysis "
        "and machine intelligence, 42(4), 824-836.",
        "Robertson, S., & Zaragoza, H. (2009). The probabilistic relevance framework: BM25 and beyond. "
        "Foundations and Trends in Information Retrieval, 3(4), 333-389.",
        "Cormode, G., & Muthukrishnan, S. (2005). An improved data stream summary: the count-min sketch "
        "and its applications. Journal of Algorithms, 55(1), 58-75.",
        "Pope, R., et al. (2023). Efficiently scaling transformer inference. Proceedings of Machine "
        "Learning and Systems, 5.",
        "Kwon, W., et al. (2023). Efficient memory management for large language model serving with "
        "pagedattention. Proceedings of the 29th Symposium on Operating Systems Principles.",
    ]
    for ref in references:
        doc.add_paragraph(ref, style="List Number")
    # Save document
    doc.save(output_path)
    print(f"Report saved to {output_path}")


if __name__ == "__main__":
    create_report()

219
scripts/make_slides.py Normal file
View File

@@ -0,0 +1,219 @@
"""Generate presentation slides from markdown."""
from pathlib import Path
try:
from pptx import Presentation
from pptx.util import Inches, Pt
except ImportError:
print("python-pptx not installed. Install with: pip install python-pptx")
import sys
sys.exit(1)
def _add_bullet_slide(prs, title_text: str, header: str, bullets: list[tuple[str, int]]) -> None:
    """Append one title-and-content slide: a header line plus leveled bullet paragraphs."""
    slide = prs.slides.add_slide(prs.slide_layouts[1])
    slide.shapes.title.text = title_text
    tf = slide.placeholders[1].text_frame
    tf.text = header
    for text, level in bullets:
        p = tf.add_paragraph()
        p.text = text
        p.level = level


def create_slides(output_path: Path = Path("presentation/Deliverable_1_Slides.pdf")):
    """Create presentation slides.

    Builds a PPTX alongside *output_path* (same stem, .pptx suffix).
    python-pptx creates PPTX, not PDF directly; for PDF conversion, use an
    external tool or convert manually.
    """
    pptx_path = output_path.with_suffix(".pptx")
    pptx_path.parent.mkdir(parents=True, exist_ok=True)
    prs = Presentation()
    prs.slide_width = Inches(10)
    prs.slide_height = Inches(7.5)

    # Slide 1: Title — uses the dedicated title layout, not the bullet helper.
    slide = prs.slides.add_slide(prs.slide_layouts[0])
    slide.shapes.title.text = "LLM Data Structures Optimizer"
    slide.placeholders[1].text = "Optimizing Throughput, Latency, and Memory for LLM Inference"

    # Slides 2-10: identical structure, so generated through one helper.
    _add_bullet_slide(prs, "Problem Statement", "LLM deployment challenges:", [
        ("• KV cache memory fragmentation", 1),
        ("• Batch scheduling latency vs. throughput trade-offs", 1),
        ("• RAG retrieval efficiency", 1),
    ])
    _add_bullet_slide(prs, "Solution Overview", "Modular optimizer stack:", [
        ("• Paged KV cache with prefix sharing", 1),
        ("• Dynamic micro-batching scheduler", 1),
        ("• Hybrid retrieval (HNSW + BM25)", 1),
        ("• Token-aware LRU cache", 1),
    ])
    _add_bullet_slide(prs, "KV Cache Architecture", "Key Features:", [
        ("• Fixed-size pages (512 tokens)", 1),
        ("• Hash-based prefix deduplication", 1),
        ("• Copy-on-write semantics", 1),
        ("• 30-40% memory savings for repeated prompts", 1),
    ])
    _add_bullet_slide(prs, "Scheduler Design", "Dynamic Micro-Batching:", [
        ("• Indexed heap for O(log n) priority updates", 1),
        ("• Configurable wait time (max_wait_ms)", 1),
        ("• SLO-aware prioritization", 1),
        ("• ~95% throughput with sub-100ms p95 latency", 1),
    ])
    _add_bullet_slide(prs, "Retrieval Pipeline", "Hybrid Approach:", [
        ("• HNSW for dense vector search (O(log n))", 1),
        ("• BM25 inverted index for lexical matching", 1),
        ("• Weighted score fusion", 1),
        ("• >95% recall@10 with <5ms p95 latency", 1),
    ])
    _add_bullet_slide(prs, "Performance Results", "Benchmark Highlights:", [
        ("• KV Cache: 0.12ms p50 attach, 0.25ms p95", 1),
        ("• Scheduler: 0.35ms p50 batch, 0.78ms p95", 1),
        ("• HNSW: 1.8ms p50 search, 4.2ms p95", 1),
        ("• End-to-End RAG: 15.3ms p50, 32.5ms p95", 1),
    ])
    _add_bullet_slide(prs, "Complexity Analysis", "Time Complexities:", [
        ("• KV Cache: O(1) attach/get, O(k) detach", 1),
        ("• Indexed Heap: O(log n) all operations", 1),
        ("• HNSW Search: O(log n) approximate", 1),
        ("• BM25: O(|query| × avg_doc_freq)", 1),
    ])
    # "Future Work:" is a level-0 sub-header inside the same content frame.
    _add_bullet_slide(prs, "Challenges & Future Work", "Challenges:", [
        ("• Memory fragmentation under high churn", 1),
        ("• Parameter tuning for HNSW", 1),
        ("Future Work:", 0),
        ("• Distributed deployment support", 1),
        ("• Speculative decoding integration", 1),
    ])
    _add_bullet_slide(prs, "Conclusion", "Key Contributions:", [
        ("• Production-ready data structures for LLM optimization", 1),
        ("• Significant improvements in throughput, latency, memory", 1),
        ("• Modular, extensible architecture", 1),
        ("• Comprehensive benchmarks and documentation", 1),
    ])

    prs.save(pptx_path)
    print(f"Presentation saved to {pptx_path}")
    print(f"Note: Convert to PDF manually or use: libreoffice --headless --convert-to pdf {pptx_path}")
# Script entry point: build the PPTX deck at the default output path.
if __name__ == "__main__":
    create_slides()

View File

@@ -0,0 +1,165 @@
"""Generate detailed plots for corpus-based benchmarks."""
import json
import sys
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
sys.path.insert(0, str(Path(__file__).parent.parent))
def load_corpus_results(results_dir: Path) -> list[dict]:
    """Collect benchmark records from every <corpus>/<date>/results.json under *results_dir*."""
    collected: list[dict] = []
    for corpus_dir in results_dir.iterdir():
        if not corpus_dir.is_dir():
            continue
        for run_dir in corpus_dir.iterdir():
            if not run_dir.is_dir():
                continue
            payload_path = run_dir / "results.json"
            if not payload_path.exists():
                continue
            with open(payload_path) as fh:
                payload = json.load(fh)
            # Only list-shaped payloads are benchmark run collections.
            if isinstance(payload, list):
                collected.extend(payload)
    return collected
def plot_latency_by_corpus_size(results: list[dict], output_dir: Path):
    """Bar-chart mean P50/P95/P99 search latency, grouped by corpus size."""
    grouped: dict[int, list[dict]] = {}
    for record in results:
        grouped.setdefault(record["size"], []).append(record)
    sizes = sorted(grouped)

    def mean_metric(key: str) -> list:
        # Average one metric across every run at each corpus size.
        return [np.mean([rec[key] for rec in grouped[s]]) for s in sizes]

    p50s = mean_metric("search_p50_ms")
    p95s = mean_metric("search_p95_ms")
    p99s = mean_metric("search_p99_ms")

    fig, ax = plt.subplots(figsize=(10, 6))
    positions = np.arange(len(sizes))
    bar_w = 0.25
    ax.bar(positions - bar_w, p50s, bar_w, label="P50", alpha=0.8)
    ax.bar(positions, p95s, bar_w, label="P95", alpha=0.8)
    ax.bar(positions + bar_w, p99s, bar_w, label="P99", alpha=0.8)
    ax.set_xlabel("Corpus Size (documents)")
    ax.set_ylabel("Latency (ms)")
    ax.set_title("Search Latency vs Corpus Size (FIQA Dataset)")
    ax.set_xticks(positions)
    ax.set_xticklabels([f"{s//1000}k" for s in sizes])
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    target = output_dir / "corpus_size_latency.png"
    plt.savefig(target, dpi=150, bbox_inches="tight")
    print(f"Saved: {target}")
    plt.close()
def plot_qps_vs_size(results: list[dict], output_dir: Path):
    """Line plot (with std-dev error bars) of mean QPS as corpus size grows."""
    grouped: dict[int, list[dict]] = {}
    for record in results:
        grouped.setdefault(record["size"], []).append(record)
    sizes = sorted(grouped)
    qps_means = [np.mean([rec["qps"] for rec in grouped[s]]) for s in sizes]
    qps_stds = [np.std([rec["qps"] for rec in grouped[s]]) for s in sizes]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.errorbar([s / 1000 for s in sizes], qps_means, yerr=qps_stds, marker="o",
                linestyle="-", linewidth=2, markersize=8, capsize=5)
    ax.set_xlabel("Corpus Size (thousands of documents)")
    ax.set_ylabel("Queries Per Second (QPS)")
    ax.set_title("Throughput vs Corpus Size (FIQA Dataset)")
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    target = output_dir / "corpus_size_qps.png"
    plt.savefig(target, dpi=150, bbox_inches="tight")
    print(f"Saved: {target}")
    plt.close()
def plot_scaling_analysis(results: list[dict], output_dir: Path):
    """Side-by-side latency and throughput scaling curves versus corpus size."""
    grouped: dict[int, list[dict]] = {}
    for record in results:
        grouped.setdefault(record["size"], []).append(record)
    sizes = sorted(grouped)
    size_k = [s / 1000 for s in sizes]

    def mean_metric(key: str) -> list:
        # Average one metric across every run at each corpus size.
        return [np.mean([rec[key] for rec in grouped[s]]) for s in sizes]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
    # Left panel: latency percentiles.
    ax1.plot(size_k, mean_metric("search_p50_ms"), "o-", label="P50", linewidth=2, markersize=8)
    ax1.plot(size_k, mean_metric("search_p95_ms"), "s-", label="P95", linewidth=2, markersize=8)
    ax1.set_xlabel("Corpus Size (thousands)")
    ax1.set_ylabel("Latency (ms)")
    ax1.set_title("Latency Scaling")
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    # Right panel: throughput.
    ax2.plot(size_k, mean_metric("qps"), "o-", color="green", linewidth=2, markersize=8)
    ax2.set_xlabel("Corpus Size (thousands)")
    ax2.set_ylabel("Queries Per Second")
    ax2.set_title("Throughput Scaling")
    ax2.grid(True, alpha=0.3)
    plt.tight_layout()
    target = output_dir / "scaling_analysis.png"
    plt.savefig(target, dpi=150, bbox_inches="tight")
    print(f"Saved: {target}")
    plt.close()
def main():
    """Load corpus benchmark runs and emit every scaling/latency figure."""
    results_dir = Path("benchmarks/results")
    output_dir = Path("benchmarks/figures")
    output_dir.mkdir(parents=True, exist_ok=True)
    results = load_corpus_results(results_dir)
    if not results:
        print("No corpus benchmark results found")
        return
    print(f"Loaded {len(results)} benchmark runs")
    # Render each figure from the same loaded run set.
    for render in (plot_latency_by_corpus_size, plot_qps_vs_size, plot_scaling_analysis):
        render(results, output_dir)
    print(f"\n✓ Generated corpus analysis plots in {output_dir}")


if __name__ == "__main__":
    main()

244
scripts/plot_results.py Normal file
View File

@@ -0,0 +1,244 @@
"""Plot benchmark results and save to PNG, export to CSV."""
import json
import csv
from pathlib import Path
import matplotlib.pyplot as plt
def load_results(result_dir: Path = Path("benchmarks/results")) -> dict:
    """Load results from flat *_benchmark.json files and corpus/date/results.json trees."""
    loaded: dict = {}
    # Old-style: flat JSON files whose stem contains "benchmark".
    for json_path in result_dir.glob("*.json"):
        if "benchmark" not in json_path.stem:
            continue
        with open(json_path) as fh:
            payload = json.load(fh)
        name = payload.get("benchmark", json_path.stem.replace("_benchmark", ""))
        loaded[name] = payload
    # New-style: <corpus>/<date>/results.json, each holding a list of runs.
    for corpus_dir in result_dir.iterdir():
        if not corpus_dir.is_dir():
            continue
        for date_dir in corpus_dir.iterdir():
            if not date_dir.is_dir():
                continue
            run_file = date_dir / "results.json"
            if not run_file.exists():
                continue
            with open(run_file) as fh:
                runs = json.load(fh)
            if isinstance(runs, list) and runs:
                # Keep only the first run as a representative sample.
                loaded[f"{corpus_dir.name}_{date_dir.name}"] = runs[0]
    return loaded
def export_to_csv(results: dict, output_file: Path = Path("benchmarks/results/benchmark_results.csv")):
    """Flatten benchmark result dicts into one CSV with a unified, sorted column set."""
    output_file.parent.mkdir(parents=True, exist_ok=True)

    def first_of(data: dict, *keys: str):
        # First truthy value among candidate metric keys (mirrors `or`-chaining).
        for key in keys:
            value = data.get(key)
            if value:
                return value
        return 0.0

    rows = []
    for bench_name, data in results.items():
        row = {
            "benchmark": bench_name,
            "p50_ms": first_of(data, "attach_p50_ms", "search_p50_ms", "batch_p50_ms", "build_p50_ms"),
            "p95_ms": first_of(data, "attach_p95_ms", "search_p95_ms", "batch_p95_ms", "build_p95_ms"),
            "p99_ms": first_of(data, "attach_p99_ms", "search_p99_ms", "batch_p99_ms", "build_p99_ms"),
            "peak_rss_mb": data.get("peak_rss_mb", 0.0),
            "memory_delta_mb": data.get("memory_delta_mb", 0.0),
        }
        # Benchmark-family-specific columns, added only when present.
        if "attach_p50_ms" in data:
            for key in ("attach_p50_ms", "attach_p95_ms", "attach_p99_ms",
                        "get_p50_ms", "get_p95_ms", "get_p99_ms"):
                row[key] = data.get(key, 0)
        if "search_p50_ms" in data:
            for key in ("search_p50_ms", "search_p95_ms", "search_p99_ms"):
                row[key] = data.get(key, 0)
        if "build_peak_rss_mb" in data:
            row["build_peak_rss_mb"] = data.get("build_peak_rss_mb", 0.0)
        rows.append(row)

    if not rows:
        return
    fieldnames = sorted({key for row in rows for key in row})
    with open(output_file, "w", newline="") as fh:
        writer = csv.DictWriter(fh, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
    print(f"Results exported to CSV: {output_file}")
def plot_latency_distribution(results: dict, output_dir: Path = Path("benchmarks/figures")):
    """Grouped bar chart of P50/P95/P99 latency for every benchmark that reports one."""
    output_dir.mkdir(parents=True, exist_ok=True)

    def pick(data: dict, suffix: str):
        # First truthy latency among the known metric families (0 if none present).
        return (data.get(f"search_{suffix}") or data.get(f"attach_{suffix}")
                or data.get(f"batch_{suffix}") or data.get(f"build_{suffix}", 0))

    names, p50s, p95s, p99s = [], [], [], []
    for name, data in results.items():
        trio = (pick(data, "p50_ms"), pick(data, "p95_ms"), pick(data, "p99_ms"))
        if any(v > 0 for v in trio):
            names.append(name)
            p50s.append(trio[0])
            p95s.append(trio[1])
            p99s.append(trio[2])
    if not names:
        return

    fig, ax = plt.subplots(figsize=(12, 7))
    xs = range(len(names))
    w = 0.25
    ax.bar([i - w for i in xs], p50s, w, label="P50", alpha=0.8, color="#2ecc71")
    ax.bar(list(xs), p95s, w, label="P95", alpha=0.8, color="#3498db")
    ax.bar([i + w for i in xs], p99s, w, label="P99", alpha=0.8, color="#e74c3c")
    ax.set_xlabel("Benchmark", fontsize=12, fontweight="bold")
    ax.set_ylabel("Latency (ms)", fontsize=12, fontweight="bold")
    ax.set_title("Latency Percentiles by Benchmark", fontsize=14, fontweight="bold")
    ax.set_xticks(list(xs))
    ax.set_xticklabels(names, rotation=45, ha="right")
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3, linestyle="--")
    # Annotate each bar with its value (skip zero-valued bars).
    for i, trio in enumerate(zip(p50s, p95s, p99s)):
        for offset, value in zip((-w, 0, w), trio):
            if value > 0:
                ax.text(i + offset, value, f"{value:.2f}", ha="center", va="bottom", fontsize=8)
    plt.tight_layout()
    target = output_dir / "latency_distribution.png"
    plt.savefig(target, dpi=300, bbox_inches="tight")
    print(f"Latency plot saved to {target}")
    plt.close()
def plot_comparison_chart(results: dict, output_dir: Path = Path("benchmarks/figures")):
    """Plot a horizontal bar chart comparing P95 latency across benchmarks.

    Args:
        results: Mapping of benchmark name -> metrics dict.
        output_dir: Directory that receives benchmark_comparison.png.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    benchmarks = []
    p95_latencies = []
    for name, data in results.items():
        p95 = data.get("search_p95_ms") or data.get("attach_p95_ms") or data.get("batch_p95_ms") or data.get("build_p95_ms", 0)
        if p95 > 0:
            benchmarks.append(name)
            p95_latencies.append(p95)
    if benchmarks:
        fig, ax = plt.subplots(figsize=(10, 6))
        # BUGFIX: sample the colormap at evenly spaced floats in [0, 1].
        # Passing raw integers (the previous `range(len(benchmarks))`) makes
        # Colormap.__call__ index the lookup table directly, so every bar got
        # a near-identical color from the very bottom of the map.
        fractions = [i / max(len(benchmarks) - 1, 1) for i in range(len(benchmarks))]
        colors = plt.cm.viridis(fractions)
        bars = ax.barh(benchmarks, p95_latencies, color=colors, alpha=0.8)
        ax.set_xlabel("P95 Latency (ms)", fontsize=12, fontweight="bold")
        ax.set_title("Benchmark Performance Comparison (P95 Latency)", fontsize=14, fontweight="bold")
        ax.grid(True, alpha=0.3, linestyle="--", axis="x")
        # Add value labels just right of each bar end.
        for bar, latency in zip(bars, p95_latencies):
            width = bar.get_width()
            ax.text(width, bar.get_y() + bar.get_height()/2, f"{latency:.2f}ms",
                    ha="left", va="center", fontsize=9, fontweight="bold")
        plt.tight_layout()
        output_file = output_dir / "benchmark_comparison.png"
        plt.savefig(output_file, dpi=300, bbox_inches="tight")
        print(f"Comparison plot saved to {output_file}")
        plt.close()
def plot_memory_usage(results: dict, output_dir: Path = Path("benchmarks/figures")):
    """Plot memory usage (peak RSS and allocation delta) by benchmark.

    Args:
        results: Mapping of benchmark name -> metrics dict.
        output_dir: Directory that receives memory_usage.png.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    benchmarks = []
    peak_rss_values = []
    memory_delta_values = []
    for name, data in results.items():
        peak_rss = data.get("peak_rss_mb", 0.0)
        memory_delta = data.get("memory_delta_mb", 0.0)
        if peak_rss > 0:
            benchmarks.append(name)
            peak_rss_values.append(peak_rss)
            memory_delta_values.append(memory_delta)
    if benchmarks:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
        # BUGFIX: sample each colormap at evenly spaced floats in [0, 1].
        # Passing raw integers (the previous `range(len(benchmarks))`) makes
        # Colormap.__call__ index the lookup table directly, so every bar got
        # a near-identical color from the very bottom of the map.
        fractions = [i / max(len(benchmarks) - 1, 1) for i in range(len(benchmarks))]
        # Plot 1: Peak RSS
        bars1 = ax1.barh(benchmarks, peak_rss_values, color=plt.cm.plasma(fractions), alpha=0.8)
        ax1.set_xlabel("Peak RSS (MB)", fontsize=12, fontweight="bold")
        ax1.set_title("Peak Memory Usage by Benchmark", fontsize=14, fontweight="bold")
        ax1.grid(True, alpha=0.3, linestyle="--", axis="x")
        # Add value labels just right of each bar end.
        for bar, rss in zip(bars1, peak_rss_values):
            width = bar.get_width()
            ax1.text(width, bar.get_y() + bar.get_height()/2, f"{rss:.2f}MB",
                     ha="left", va="center", fontsize=9, fontweight="bold")
        # Plot 2: Memory Delta
        bars2 = ax2.barh(benchmarks, memory_delta_values, color=plt.cm.coolwarm(fractions), alpha=0.8)
        ax2.set_xlabel("Memory Delta (MB)", fontsize=12, fontweight="bold")
        ax2.set_title("Memory Allocation Delta by Benchmark", fontsize=14, fontweight="bold")
        ax2.grid(True, alpha=0.3, linestyle="--", axis="x")
        for bar, delta in zip(bars2, memory_delta_values):
            width = bar.get_width()
            ax2.text(width, bar.get_y() + bar.get_height()/2, f"{delta:.2f}MB",
                     ha="left", va="center", fontsize=9, fontweight="bold")
        plt.tight_layout()
        output_file = output_dir / "memory_usage.png"
        plt.savefig(output_file, dpi=300, bbox_inches="tight")
        print(f"Memory usage plot saved to {output_file}")
        plt.close()
if __name__ == "__main__":
    # Script entry point: load all result sets, export CSV, render all figures.
    results = load_results()
    if not results:
        print("No benchmark results found. Run benchmarks first.")
    else:
        export_to_csv(results)
        for render in (plot_latency_distribution, plot_comparison_chart, plot_memory_usage):
            render(results)
        print(f"\nProcessed {len(results)} benchmark results")

View File

@@ -0,0 +1,91 @@
"""Prepare embeddings for datasets."""
import argparse
import json
import sys
from pathlib import Path
import numpy as np
sys.path.insert(0, str(Path(__file__).parent.parent))
def generate_deterministic_embeddings(
corpus_file: Path,
output_file: Path,
dim: int = 384,
seed: int = 42,
limit: int | None = None,
) -> None:
"""
Generate deterministic embeddings for a corpus.
Args:
corpus_file: Path to corpus JSONL file
output_file: Output .npy file for embeddings
dim: Embedding dimension
seed: Random seed for reproducibility
limit: Optional limit on number of documents
"""
output_file.parent.mkdir(parents=True, exist_ok=True)
rng = np.random.RandomState(seed)
embeddings = []
count = 0
print(f"Generating deterministic embeddings (dim={dim}, seed={seed})...")
with open(corpus_file, "r", encoding="utf-8") as f:
for line in f:
if limit and count >= limit:
break
if line.strip():
doc = json.loads(line)
# Generate deterministic embedding based on document ID
doc_hash = hash(doc["id"]) % (2**31)
rng_local = np.random.RandomState(seed + doc_hash)
# Generate normalized random vector
emb = rng_local.randn(dim).astype(np.float32)
emb = emb / np.linalg.norm(emb)
embeddings.append(emb)
count += 1
if count % 10000 == 0:
print(f"Processed {count} documents...")
embeddings_array = np.stack(embeddings)
np.save(output_file, embeddings_array)
print(f"Saved {len(embeddings)} embeddings to {output_file}")
def load_embeddings(emb_file: Path) -> np.ndarray:
    """Read a previously saved embedding matrix from *emb_file* (.npy)."""
    matrix = np.load(emb_file)
    return matrix
def main():
    """CLI entry point: parse arguments and generate embeddings for a corpus."""
    parser = argparse.ArgumentParser(description="Prepare embeddings for corpus")
    parser.add_argument("--input", type=Path, required=True, help="Corpus JSONL file")
    parser.add_argument("--output", type=Path, required=True, help="Output .npy file")
    parser.add_argument("--dim", type=int, default=384, help="Embedding dimension")
    parser.add_argument("--seed", type=int, default=42, help="Random seed")
    parser.add_argument("--limit", type=int, help="Limit number of documents")
    args = parser.parse_args()
    generate_deterministic_embeddings(
        args.input, args.output, dim=args.dim, seed=args.seed, limit=args.limit
    )


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,247 @@
"""Profile tail latency breakdown for retrieval pipeline.
This script profiles latency components to identify bottlenecks causing
extreme P99 tail latencies.
"""
import cProfile
import pstats
import statistics
from pathlib import Path
from typing import Dict, List
import numpy as np
from llmds.hnsw import HNSW
from llmds.retrieval_pipeline import RetrievalPipeline
def profile_hnsw_search(num_vectors: int = 10000, dim: int = 128, num_queries: int = 1000):
    """Profile HNSW search latency and report percentile/outlier statistics.

    Args:
        num_vectors: Number of unit-norm vectors to index.
        dim: Vector dimensionality.
        num_queries: Number of random queries to time.

    Returns:
        Dict of latency percentiles (ms), mean/max, and outlier stats.
    """
    import time  # hoisted: previously re-imported inside the per-query loop

    print(f"Profiling HNSW search with {num_vectors} vectors, dim={dim}, {num_queries} queries...")
    np.random.seed(42)
    hnsw = HNSW(dim=dim, M=16, ef_construction=200, ef_search=50, seed=42)
    # Build index with unit-norm random vectors (no need to retain them here).
    for i in range(num_vectors):
        vec = np.random.randn(dim).astype(np.float32)
        vec = vec / np.linalg.norm(vec)
        hnsw.add(vec, i)
    # Profile search operations.
    profiler = cProfile.Profile()
    profiler.enable()
    search_times = []
    for _ in range(num_queries):
        query = np.random.randn(dim).astype(np.float32)
        query = query / np.linalg.norm(query)
        start = time.perf_counter()
        hnsw.search(query, k=10)
        elapsed = time.perf_counter() - start
        search_times.append(elapsed * 1000)  # convert to ms
    profiler.disable()
    # Index-based percentiles over the sorted sample (no interpolation).
    search_times.sort()
    p50 = search_times[len(search_times) // 2]
    p95 = search_times[int(len(search_times) * 0.95)]
    p99 = search_times[int(len(search_times) * 0.99)]
    p99_9 = search_times[int(len(search_times) * 0.999)] if len(search_times) >= 1000 else p99
    print(f"\nHNSW Search Latency Statistics:")
    print(f" P50: {p50:.3f} ms")
    print(f" P95: {p95:.3f} ms")
    print(f" P99: {p99:.3f} ms")
    print(f" P99.9: {p99_9:.3f} ms")
    print(f" Mean: {statistics.mean(search_times):.3f} ms")
    print(f" Max: {max(search_times):.3f} ms")
    # Outliers: anything slower than 2x the P95.
    threshold = p95 * 2
    outliers = [t for t in search_times if t > threshold]
    if outliers:
        print(f"\n Outliers (>2x P95): {len(outliers)} queries ({len(outliers)/len(search_times)*100:.1f}%)")
        print(f" Outlier P50: {statistics.median(outliers):.3f} ms")
        print(f" Outlier Max: {max(outliers):.3f} ms")
    # Dump the cProfile hot-spot report.
    stats = pstats.Stats(profiler)
    stats.sort_stats("cumulative")
    print("\nTop 20 functions by cumulative time:")
    print("=" * 80)
    stats.print_stats(20)
    return {
        "p50_ms": p50,
        "p95_ms": p95,
        "p99_ms": p99,
        "p99_9_ms": p99_9,
        "mean_ms": statistics.mean(search_times),
        "max_ms": max(search_times),
        "outlier_count": len(outliers),
        "outlier_percent": len(outliers) / len(search_times) * 100 if search_times else 0,
    }
def profile_retrieval_pipeline(num_docs: int = 5000, num_queries: int = 500):
    """Profile the end-to-end retrieval pipeline and report latency percentiles.

    Args:
        num_docs: Number of synthetic documents to index.
        num_queries: Number of hybrid queries to time.

    Returns:
        Dict with p50/p95/p99, mean, and max search latency (ms).
    """
    import time  # hoisted: previously re-imported inside the per-query loop

    print(f"\nProfiling RetrievalPipeline with {num_docs} docs, {num_queries} queries...")
    np.random.seed(42)
    rng = np.random.RandomState(42)  # renamed: `random` shadowed the stdlib module name
    pipeline = RetrievalPipeline(embedding_dim=128, seed=42)
    # Build index with synthetic texts and unit-norm embeddings.
    for i in range(num_docs):
        text = f"document {i} about topic {i % 10}"
        embedding = rng.randn(128).astype(np.float32)
        embedding = embedding / np.linalg.norm(embedding)
        pipeline.add_document(doc_id=i, text=text, embedding=embedding)
    # Profile search operations.
    profiler = cProfile.Profile()
    profiler.enable()
    search_times = []
    for _ in range(num_queries):
        query_text = "document topic"
        query_embedding = rng.randn(128).astype(np.float32)
        query_embedding = query_embedding / np.linalg.norm(query_embedding)
        start = time.perf_counter()
        pipeline.search(query_text, query_embedding=query_embedding, top_k=10)
        elapsed = time.perf_counter() - start
        search_times.append(elapsed * 1000)  # convert to ms
    profiler.disable()
    # Index-based percentiles over the sorted sample.
    search_times.sort()
    p50 = search_times[len(search_times) // 2]
    p95 = search_times[int(len(search_times) * 0.95)]
    p99 = search_times[int(len(search_times) * 0.99)]
    print(f"\nRetrieval Pipeline Latency Statistics:")
    print(f" P50: {p50:.3f} ms")
    print(f" P95: {p95:.3f} ms")
    print(f" P99: {p99:.3f} ms")
    print(f" Mean: {statistics.mean(search_times):.3f} ms")
    print(f" Max: {max(search_times):.3f} ms")
    # Dump the cProfile hot-spot report.
    stats = pstats.Stats(profiler)
    stats.sort_stats("cumulative")
    print("\nTop 20 functions by cumulative time:")
    print("=" * 80)
    stats.print_stats(20)
    return {
        "p50_ms": p50,
        "p95_ms": p95,
        "p99_ms": p99,
        "mean_ms": statistics.mean(search_times),
        "max_ms": max(search_times),
    }
def profile_latency_breakdown(num_vectors: int = 5000, dim: int = 128):
    """Compare raw distance-computation cost against full HNSW search cost."""
    import time

    print(f"\nProfiling latency breakdown with {num_vectors} vectors...")
    np.random.seed(42)
    hnsw = HNSW(dim=dim, M=16, ef_construction=200, ef_search=50, seed=42)
    # Build the index, keeping the raw vectors for the distance-only baseline below.
    vectors = []
    for idx in range(num_vectors):
        vec = np.random.randn(dim).astype(np.float32)
        vec = vec / np.linalg.norm(vec)
        vectors.append(vec)
        hnsw.add(vec, idx)
    search_times = []
    distance_computation_times = []
    for _ in range(100):
        query = np.random.randn(dim).astype(np.float32)
        query = query / np.linalg.norm(query)
        # Baseline: 100 brute-force distance computations.
        t0 = time.perf_counter()
        _ = [np.linalg.norm(query - vec) for vec in vectors[:100]]
        distance_computation_times.append((time.perf_counter() - t0) * 1000)
        # Full HNSW search for the same query.
        t0 = time.perf_counter()
        hnsw.search(query, k=10)
        search_times.append((time.perf_counter() - t0) * 1000)
    print(f"\nLatency Breakdown:")
    print(f" Distance computation: {statistics.mean(distance_computation_times):.3f} ms (mean)")
    print(f" HNSW search: {statistics.mean(search_times):.3f} ms (mean)")
    print(f" Search/Distance ratio: {statistics.mean(search_times) / statistics.mean(distance_computation_times):.2f}x")
def main():
    """Run all profiling tasks and write the combined report to a file."""
    import argparse
    import contextlib

    parser = argparse.ArgumentParser(description="Profile tail latency")
    parser.add_argument("--output", type=Path, default=Path("audit/tail_latency_profile.txt"),
                        help="Output file for profiling report")
    parser.add_argument("--num-vectors", type=int, default=10000,
                        help="Number of vectors for HNSW profiling")
    parser.add_argument("--num-docs", type=int, default=5000,
                        help="Number of documents for pipeline profiling")
    parser.add_argument("--num-queries", type=int, default=1000,
                        help="Number of queries to run")
    args = parser.parse_args()
    args.output.parent.mkdir(parents=True, exist_ok=True)
    # Capture all profiling output into the report file. redirect_stdout
    # restores sys.stdout even if a profiling step raises — replacing the
    # previous manual `sys.stdout = f` assignment/restore.
    with open(args.output, "w") as f, contextlib.redirect_stdout(f):
        # Profile HNSW
        hnsw_stats = profile_hnsw_search(args.num_vectors, 128, args.num_queries)
        # Profile pipeline
        pipeline_stats = profile_retrieval_pipeline(args.num_docs, args.num_queries // 2)
        # Breakdown
        profile_latency_breakdown(args.num_vectors, 128)
    print(f"\nProfiling complete. Report saved to: {args.output}")
    print(f"\nKey Findings:")
    print(f" HNSW P99: {hnsw_stats['p99_ms']:.3f} ms")
    print(f" Pipeline P99: {pipeline_stats['p99_ms']:.3f} ms")
    if hnsw_stats.get("outlier_count", 0) > 0:
        print(f" HNSW Outliers: {hnsw_stats['outlier_count']} ({hnsw_stats['outlier_percent']:.1f}%)")


if __name__ == "__main__":
    main()

355
scripts/run_benchmarks.py Normal file
View File

@@ -0,0 +1,355 @@
"""Run end-to-end benchmarks on real corpora with variance analysis."""
import argparse
import csv
import json
import random
import sys
import time
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from typing import Any
import numpy as np
sys.path.insert(0, str(Path(__file__).parent.parent))
from llmds.data_sources.beir_loader import load_beir
from llmds.data_sources.amazon_reviews import load_amazon_reviews
from llmds.retrieval_pipeline import RetrievalPipeline
from llmds.utils import Timer, memory_profiler, calculate_statistics
def aggregate_repetitions(results: list[dict]) -> dict[str, Any]:
    """
    Aggregate results across repetitions with variance analysis.

    Args:
        results: List of result dictionaries from multiple repetitions

    Returns:
        Dictionary with aggregated statistics including variance metrics
    """
    if not results:
        return {}
    first = results[0]
    # Everything except run metadata is treated as a numeric metric.
    metadata_keys = {"corpus", "size", "ef_search", "M", "num_queries", "repetition"}
    metric_keys = [key for key in first if key not in metadata_keys]
    aggregated: dict[str, Any] = {
        "corpus": first.get("corpus"),
        "size": first.get("size"),
        "ef_search": first.get("ef_search"),
        "M": first.get("M"),
        "num_queries": first.get("num_queries"),
        "repetitions": len(results),
    }
    # Per-metric summary stats (mean/std/min/max/CI bounds/CV) across repetitions.
    for metric in metric_keys:
        samples = [run.get(metric, 0.0) for run in results if metric in run]
        if not samples:
            continue
        stats_dict = calculate_statistics(samples)
        for stat_name in ("mean", "std", "min", "max", "ci_lower", "ci_upper", "cv"):
            aggregated[f"{metric}_{stat_name}"] = stats_dict[stat_name]
    # A configuration is flagged flaky when any critical metric's coefficient
    # of variation exceeds 20%.
    flaky_metrics = [
        metric
        for metric in ("search_p50_ms", "search_p95_ms", "qps")
        if aggregated.get(f"{metric}_cv", 0.0) > 20.0
    ]
    aggregated["flaky_metrics"] = flaky_metrics
    aggregated["is_flaky"] = bool(flaky_metrics)
    return aggregated
def load_corpus_sample(corpus_file: Path, size: int, seed: int = 42) -> list[dict]:
    """Read a JSONL corpus and return up to *size* documents, sampled reproducibly."""
    # Seed both RNGs so sampling (and any downstream numpy use) is reproducible.
    random.seed(seed)
    np.random.seed(seed)
    with open(corpus_file, "r", encoding="utf-8") as fh:
        documents = [json.loads(line) for line in fh if line.strip()]
    if len(documents) <= size:
        return documents
    # Sample without replacement.
    return random.sample(documents, size)
def run_benchmark(
    corpus_file: Path,
    emb_file: Path | None,
    corpus_name: str,
    size: int,
    ef_search: int,
    M: int,
    num_queries: int = 100,
    embedding_dim: int = 384,
) -> dict:
    """
    Run benchmark on a corpus sample.

    Builds a RetrievalPipeline over a deterministic sample of the corpus,
    times per-document index construction and per-query search, and profiles
    memory (peak RSS) separately for the build and search phases.

    Args:
        corpus_file: JSONL corpus; each line is a document with a "text" field.
        emb_file: Optional .npy file with precomputed embeddings; when absent
            or missing on disk, deterministic random unit vectors are used.
        corpus_name: Label recorded in the result dictionary.
        size: Number of documents to sample from the corpus.
        ef_search: HNSW efSearch parameter.
        M: HNSW M (max connections per node) parameter.
        num_queries: Number of search queries to time.
        embedding_dim: Dimensionality of generated embeddings
            (assumes the precomputed file matches — TODO confirm).

    Returns:
        Dictionary with benchmark results: nearest-rank latency percentiles
        for build and search, averages, QPS, and memory metrics (MB).
    """
    print(f"\n=== Benchmarking {corpus_name} (size={size}, ef={ef_search}, M={M}) ===")
    # Load corpus sample (fixed default seed inside load_corpus_sample)
    print(f"Loading corpus sample...")
    docs = load_corpus_sample(corpus_file, size)
    print(f"Loaded {len(docs)} documents")
    # Load or generate embeddings
    if emb_file and emb_file.exists():
        embeddings = np.load(emb_file)
        # Trim to sample size (assumes row i corresponds to doc i — TODO confirm)
        embeddings = embeddings[:len(docs)]
    else:
        print("Generating deterministic embeddings...")
        # Fixed-seed RNG so repeated runs index identical vectors
        rng = np.random.RandomState(42)
        embeddings = []
        for i in range(len(docs)):
            emb = rng.randn(embedding_dim).astype(np.float32)
            # L2-normalize to unit length
            emb = emb / np.linalg.norm(emb)
            embeddings.append(emb)
        embeddings = np.stack(embeddings)
    # Build pipeline with deterministic seed
    print("Building pipeline...")
    # Memory profiling for build phase
    with memory_profiler() as mem_profiler:
        pipeline = RetrievalPipeline(
            embedding_dim=embedding_dim,
            hnsw_M=M,
            hnsw_ef_search=ef_search,
            hnsw_ef_construction=ef_search * 4,  # construction budget scaled from search budget
            seed=42,  # Fixed seed for reproducible HNSW structure
        )
        # Add documents one at a time, timing each insertion
        build_times = []
        for i, doc in enumerate(docs):
            with Timer() as t:
                pipeline.add_document(
                    doc_id=i,
                    text=doc["text"],
                    embedding=embeddings[i],
                )
            build_times.append(t.elapsed * 1000)  # seconds -> milliseconds
            # Sample memory periodically during build (~10 samples total;
            # the "+ 1" guards against a zero modulus for tiny corpora)
            if (i + 1) % (len(docs) // 10 + 1) == 0:
                mem_profiler.sample()
    build_peak_rss_mb = mem_profiler.get_peak_rss_mb()
    build_memory_delta_mb = mem_profiler.get_memory_delta_mb()
    # Run queries with memory profiling
    print(f"Running {num_queries} queries...")
    search_times = []
    # Re-seed so the query vectors are reproducible across runs
    rng = np.random.RandomState(42)
    # Generate query embeddings (unit-normalized random vectors)
    query_embeddings = []
    for _ in range(num_queries):
        qemb = rng.randn(embedding_dim).astype(np.float32)
        qemb = qemb / np.linalg.norm(qemb)
        query_embeddings.append(qemb)
    # Use document texts as queries (simplified; wraps around when
    # num_queries exceeds the number of documents)
    query_texts = [docs[i % len(docs)]["text"][:100] for i in range(num_queries)]
    # Memory profiling for search phase
    with memory_profiler() as search_mem_profiler:
        for i, (query_text, query_emb) in enumerate(zip(query_texts, query_embeddings)):
            with Timer() as t:
                pipeline.search(query_text, query_embedding=query_emb, top_k=10)
            search_times.append(t.elapsed * 1000)  # seconds -> milliseconds
            # Sample memory periodically during search, with progress output
            if (i + 1) % 20 == 0:
                search_mem_profiler.sample()
                print(f"Completed {i + 1}/{num_queries} queries...")
    search_peak_rss_mb = search_mem_profiler.get_peak_rss_mb()
    # Overall peak RSS (maximum of build and search phases)
    overall_peak_rss_mb = max(build_peak_rss_mb, search_peak_rss_mb)
    # Compute statistics: nearest-rank percentiles over sorted samples
    build_times_sorted = sorted(build_times)
    search_times_sorted = sorted(search_times)
    results = {
        "corpus": corpus_name,
        "size": size,
        "ef_search": ef_search,
        "M": M,
        "num_queries": num_queries,
        "build_p50_ms": build_times_sorted[len(build_times_sorted) // 2],
        "build_p95_ms": build_times_sorted[int(len(build_times_sorted) * 0.95)],
        "build_p99_ms": build_times_sorted[int(len(build_times_sorted) * 0.99)],
        "search_p50_ms": search_times_sorted[len(search_times_sorted) // 2],
        "search_p95_ms": search_times_sorted[int(len(search_times_sorted) * 0.95)],
        "search_p99_ms": search_times_sorted[int(len(search_times_sorted) * 0.99)],
        "avg_build_time_ms": sum(build_times) / len(build_times),
        "avg_search_time_ms": sum(search_times) / len(search_times),
        # Queries per second derived from the mean search latency
        "qps": 1000.0 / (sum(search_times) / len(search_times)) if search_times else 0.0,
        # Memory metrics
        "peak_rss_mb": overall_peak_rss_mb,
        "build_peak_rss_mb": build_peak_rss_mb,
        "build_memory_delta_mb": build_memory_delta_mb,
        "search_peak_rss_mb": search_peak_rss_mb,
    }
    print(f"✓ Results: P50={results['search_p50_ms']:.2f}ms, P95={results['search_p95_ms']:.2f}ms, QPS={results['qps']:.2f}, Peak RSS={results['peak_rss_mb']:.2f}MB")
    return results
def main():
    """CLI entry point: run HNSW benchmarks with repeated measurements.

    For every (size, ef_search, M) combination, the benchmark is executed
    ``--repetitions`` times; per-repetition records are kept verbatim and an
    aggregated record (mean/std/CI/CV per metric plus a flakiness flag) is
    produced via aggregate_repetitions(). All results are written as JSON and
    CSV under ``<output-dir>/<corpus>/<timestamp>/``.
    """
    parser = argparse.ArgumentParser(description="Run benchmarks on real corpora")
    parser.add_argument("--corpus", type=str, required=True, help="Corpus name")
    parser.add_argument("--corpus-file", type=Path, required=True, help="Corpus JSONL file")
    parser.add_argument("--emb-file", type=Path, help="Embeddings .npy file")
    parser.add_argument("--sizes", nargs="+", type=str, default=["10k"], help="Corpus sizes (e.g., 10k 50k 100k)")
    parser.add_argument("--ef", nargs="+", type=int, default=[50], help="HNSW efSearch values")
    parser.add_argument("--M", nargs="+", type=int, default=[16], help="HNSW M values")
    parser.add_argument("--num-queries", type=int, default=100, help="Number of queries")
    parser.add_argument("--repetitions", type=int, default=5, help="Number of repetitions for variance analysis (default: 5)")
    parser.add_argument("--output-dir", type=Path, default=Path("benchmarks/results"), help="Output directory")
    args = parser.parse_args()

    def parse_size(s: str) -> int:
        """Parse human-friendly size strings like '10k' or '2m' into ints."""
        s = s.lower()
        if s.endswith("k"):
            return int(s[:-1]) * 1000
        elif s.endswith("m"):
            return int(s[:-1]) * 1000000
        return int(s)

    sizes = [parse_size(s) for s in args.sizes]
    # Timestamped output directory keeps runs side by side instead of overwriting
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = args.output_dir / args.corpus / timestamp
    output_dir.mkdir(parents=True, exist_ok=True)
    all_results = []         # every individual repetition
    aggregated_results = []  # one aggregated record per configuration
    print(f"\n{'='*70}")
    print(f"Running benchmarks with {args.repetitions} repetitions per configuration")
    print(f"{'='*70}\n")
    # Run benchmarks over the full parameter grid
    for size in sizes:
        for ef in args.ef:
            for M in args.M:
                # (removed unused `config_key` local)
                print(f"Configuration: size={size}, ef={ef}, M={M}")
                repetition_results = []
                for rep in range(args.repetitions):
                    print(f" Repetition {rep + 1}/{args.repetitions}...", end=" ", flush=True)
                    result = run_benchmark(
                        corpus_file=args.corpus_file,
                        emb_file=args.emb_file,
                        corpus_name=args.corpus,
                        size=size,
                        ef_search=ef,
                        M=M,
                        num_queries=args.num_queries,
                    )
                    result["repetition"] = rep
                    repetition_results.append(result)
                    all_results.append(result)
                    print("")
                # Aggregate across repetitions
                aggregated = aggregate_repetitions(repetition_results)
                if aggregated:
                    # Keep original metric names for backward compatibility
                    # with consumers that predate the *_mean/_std columns.
                    for metric in ["search_p50_ms", "search_p95_ms", "search_p99_ms", "qps"]:
                        if f"{metric}_mean" in aggregated:
                            aggregated[metric] = aggregated[f"{metric}_mean"]
                    aggregated_results.append(aggregated)
                    # Print variance summary
                    print(f"\n Variance Summary:")
                    print(f" Search P50: {aggregated.get('search_p50_ms_mean', 0):.2f} ± {aggregated.get('search_p50_ms_std', 0):.2f} ms (CV: {aggregated.get('search_p50_ms_cv', 0):.1f}%)")
                    print(f" Search P95: {aggregated.get('search_p95_ms_mean', 0):.2f} ± {aggregated.get('search_p95_ms_std', 0):.2f} ms (CV: {aggregated.get('search_p95_ms_cv', 0):.1f}%)")
                    print(f" QPS: {aggregated.get('qps_mean', 0):.2f} ± {aggregated.get('qps_std', 0):.2f} (CV: {aggregated.get('qps_cv', 0):.1f}%)")
                    if aggregated.get("is_flaky", False):
                        print(f" ⚠️ FLAKY: High variance detected in {', '.join(aggregated.get('flaky_metrics', []))}")
                    print()
    # Save detailed results (all repetitions)
    results_file = output_dir / "results.json"
    with open(results_file, "w") as f:
        json.dump(all_results, f, indent=2)
    # Save aggregated results with variance statistics
    aggregated_file = output_dir / "results_aggregated.json"
    with open(aggregated_file, "w") as f:
        json.dump(aggregated_results, f, indent=2)
    # Save CSV with all repetitions
    csv_file = output_dir / "results.csv"
    if all_results:
        fieldnames = list(all_results[0].keys())
        with open(csv_file, "w", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_results)
    # Save aggregated CSV
    aggregated_csv_file = output_dir / "results_aggregated.csv"
    if aggregated_results:
        agg_fieldnames = list(aggregated_results[0].keys())
        with open(aggregated_csv_file, "w", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=agg_fieldnames)
            writer.writeheader()
            writer.writerows(aggregated_results)
    # Print summary
    print(f"\n{'='*70}")
    print(f"Benchmark Summary")
    print(f"{'='*70}")
    print(f"Total configurations: {len(aggregated_results)}")
    print(f"Total repetitions: {len(all_results)}")
    flaky_count = sum(1 for r in aggregated_results if r.get("is_flaky", False))
    if flaky_count > 0:
        print(f"⚠️ Flaky configurations: {flaky_count}")
    print(f"\nResults saved to:")
    print(f" - Detailed: {results_file}")
    print(f" - Aggregated: {aggregated_file}")
    print(f" - CSV: {csv_file}")
    print(f" - Aggregated CSV: {aggregated_csv_file}")
    print(f"{'='*70}\n")
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,281 @@
"""Run benchmarks across multiple datasets for comparison."""
import argparse
import json
import subprocess
import sys
from pathlib import Path
from datetime import datetime
import numpy as np
sys.path.insert(0, str(Path(__file__).parent.parent))
def prepare_dataset(
source: str,
corpus_name: str,
output_dir: Path,
limit: int | None = None,
download: bool = True,
) -> Path | None:
"""Prepare a dataset: download, prepare embeddings, ready for benchmarking."""
corpus_dir = output_dir / "raw" / corpus_name
embeddings_dir = output_dir / "embeddings"
corpus_file = None
# Find existing corpus file (check multiple possible names)
possible_files = ["corpus.jsonl", "reviews.jsonl", "business_reviews.jsonl", "pages.jsonl"]
for filename in possible_files:
if (corpus_dir / filename).exists():
corpus_file = corpus_dir / filename
break
# Also check beir subdirectory for fiqa
if corpus_file is None and corpus_name == "fiqa":
beir_dir = output_dir / "raw" / "beir" / corpus_name
if (beir_dir / "corpus.jsonl").exists():
corpus_file = beir_dir / "corpus.jsonl"
# Download if needed and not exists
if download and corpus_file is None:
print(f"\n📥 Downloading {corpus_name}...")
try:
if source.startswith("beir:"):
cmd = [
sys.executable,
"scripts/download_corpus.py",
"--source", source,
"--output", str(corpus_dir),
]
else:
cmd = [
sys.executable,
"scripts/download_corpus.py",
"--source", source,
"--output", str(corpus_dir),
]
if limit:
cmd.extend(["--limit", str(limit)])
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
print(f"⚠️ Download failed: {result.stderr}")
return None
# Find corpus file after download
if (corpus_dir / "corpus.jsonl").exists():
corpus_file = corpus_dir / "corpus.jsonl"
elif corpus_name == "amazon23" and (corpus_dir / "reviews.jsonl").exists():
corpus_file = corpus_dir / "reviews.jsonl"
except Exception as e:
print(f"⚠️ Error downloading {corpus_name}: {e}")
return None
if corpus_file is None or not corpus_file.exists():
print(f"⚠️ Corpus file not found for {corpus_name}")
return None
# Check embeddings
emb_file = embeddings_dir / f"{corpus_name}.npy"
if not emb_file.exists():
print(f"\n🔢 Preparing embeddings for {corpus_name}...")
embeddings_dir.mkdir(parents=True, exist_ok=True)
cmd = [
sys.executable,
"scripts/prepare_embeddings.py",
"--input", str(corpus_file),
"--output", str(emb_file),
"--dim", "384",
"--seed", "42",
]
if limit:
cmd.extend(["--limit", str(limit)])
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
print(f"⚠️ Embedding preparation failed: {result.stderr}")
return None
return corpus_file
def run_benchmarks_for_dataset(
corpus_name: str,
corpus_file: Path,
emb_file: Path,
sizes: list[str],
ef_values: list[int],
M_values: list[int],
num_queries: int = 50, # Reduced for faster multi-dataset runs
output_dir: Path = Path("benchmarks/results"),
) -> Path | None:
"""Run benchmarks for a single dataset."""
print(f"\n🚀 Running benchmarks for {corpus_name}...")
cmd = [
sys.executable,
"scripts/run_benchmarks.py",
"--corpus", corpus_name,
"--corpus-file", str(corpus_file),
"--emb-file", str(emb_file),
"--sizes", *sizes,
"--ef", *[str(e) for e in ef_values],
"--M", *[str(m) for m in M_values],
"--num-queries", str(num_queries),
"--output-dir", str(output_dir),
]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
print(f"⚠️ Benchmark failed for {corpus_name}: {result.stderr}")
return None
# Find the results directory
results_dir = output_dir / corpus_name
if results_dir.exists():
timestamp_dirs = sorted([d for d in results_dir.iterdir() if d.is_dir()], key=lambda x: x.name)
if timestamp_dirs:
return timestamp_dirs[-1] / "results.json"
return None
def main():
    """CLI entry point: download, embed, and benchmark several datasets.

    For each requested dataset this prepares the corpus and embeddings
    (prepare_dataset), runs the benchmark subprocess
    (run_benchmarks_for_dataset), and finally writes one combined JSON file
    with per-dataset results under --output-dir.
    """
    parser = argparse.ArgumentParser(description="Run benchmarks across multiple datasets")
    parser.add_argument(
        "--datasets",
        nargs="+",
        default=["fiqa", "amazon23", "msmarco"],
        help="Datasets to benchmark"
    )
    parser.add_argument(
        "--sizes",
        nargs="+",
        default=["10k", "25k", "50k"],
        help="Corpus sizes (e.g., 10k 25k 50k)"
    )
    parser.add_argument(
        "--ef",
        nargs="+",
        type=int,
        default=[50, 100],
        help="HNSW efSearch values"
    )
    parser.add_argument(
        "--M",
        nargs="+",
        type=int,
        default=[8, 16],
        help="HNSW M values"
    )
    parser.add_argument(
        "--num-queries",
        type=int,
        default=50,
        help="Number of queries per benchmark"
    )
    parser.add_argument(
        "--skip-download",
        action="store_true",
        help="Skip downloading datasets (use existing)"
    )
    parser.add_argument(
        "--limit",
        type=int,
        help="Limit documents per dataset (for large datasets)"
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("benchmarks/results"),
        help="Output directory"
    )
    args = parser.parse_args()
    # Dataset sources mapping: dataset name -> download source spec
    dataset_sources = {
        "fiqa": "beir:fiqa",
        "amazon23": "amazon23",
        "msmarco": "msmarco",
    }
    data_dir = Path("data")
    embeddings_dir = data_dir / "embeddings"
    embeddings_dir.mkdir(parents=True, exist_ok=True)
    # Per-dataset benchmark results, keyed by corpus name
    results = {}
    print("=" * 70)
    print("Multi-Dataset Benchmark Runner")
    print("=" * 70)
    print(f"Datasets: {', '.join(args.datasets)}")
    print(f"Sizes: {', '.join(args.sizes)}")
    print(f"efSearch: {', '.join(map(str, args.ef))}")
    print(f"M: {', '.join(map(str, args.M))}")
    print("=" * 70)
    for corpus_name in args.datasets:
        if corpus_name not in dataset_sources:
            print(f"⚠️ Unknown dataset: {corpus_name}, skipping")
            continue
        source = dataset_sources[corpus_name]
        # --limit is only applied to the large datasets; fiqa is used in full
        limit = args.limit if corpus_name in ["amazon23", "msmarco"] else None
        # Prepare dataset (download + embeddings); failures skip the dataset
        corpus_file = prepare_dataset(
            source=source,
            corpus_name=corpus_name,
            output_dir=data_dir,
            limit=limit,
            download=not args.skip_download,
        )
        if corpus_file is None:
            print(f"⚠️ Skipping {corpus_name} - preparation failed")
            continue
        # Check embeddings (prepare_dataset should have created them)
        emb_file = embeddings_dir / f"{corpus_name}.npy"
        if not emb_file.exists():
            print(f"⚠️ Embeddings not found for {corpus_name}, skipping")
            continue
        # Run benchmarks via the single-dataset subprocess wrapper
        results_file = run_benchmarks_for_dataset(
            corpus_name=corpus_name,
            corpus_file=corpus_file,
            emb_file=emb_file,
            sizes=args.sizes,
            ef_values=args.ef,
            M_values=args.M,
            num_queries=args.num_queries,
            output_dir=args.output_dir,
        )
        if results_file and results_file.exists():
            with open(results_file) as f:
                results[corpus_name] = json.load(f)
            print(f"✓ {corpus_name} benchmarks completed")
        else:
            print(f"⚠️ {corpus_name} benchmarks incomplete")
    # Save combined results (only when at least one dataset succeeded)
    if results:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        combined_file = args.output_dir / f"multi_dataset_{timestamp}.json"
        combined_file.parent.mkdir(parents=True, exist_ok=True)
        with open(combined_file, "w") as f:
            json.dump(results, f, indent=2)
        print(f"\n✓ Combined results saved to {combined_file}")
    print("\n" + "=" * 70)
    print("Multi-dataset benchmarks completed!")
    print("=" * 70)
if __name__ == "__main__":
    main()

306
scripts/security_scan.py Normal file
View File

@@ -0,0 +1,306 @@
"""Security scanning script using Bandit and pip-audit.
This script runs security scans to identify vulnerabilities.
Note: Requires bandit and pip-audit to be installed.
"""
import json
import subprocess
import sys
from pathlib import Path
from typing import Optional
def run_bandit(output_dir: Path) -> bool:
    """
    Run Bandit security scanner over the ``llmds`` package.

    Writes a JSON report (parsed below) and a text report (human-readable
    companion) into *output_dir*, then prints a per-severity summary.

    Args:
        output_dir: Directory to save results

    Returns:
        True if the scan completed and found no HIGH severity issues.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    json_output = output_dir / "bandit_report.json"
    txt_output = output_dir / "bandit_report.txt"
    print("Running Bandit security scanner...")
    print("=" * 80)
    try:
        # Run Bandit with JSON output. check=False because Bandit exits
        # non-zero when it finds issues, which is not an error here.
        # (The CompletedProcess was previously bound to an unused local.)
        subprocess.run(
            [
                sys.executable, "-m", "bandit",
                "-r", "llmds",
                "-f", "json",
                "-o", str(json_output),
            ],
            capture_output=True,
            text=True,
            check=False,
        )
        # Also generate text report
        subprocess.run(
            [
                sys.executable, "-m", "bandit",
                "-r", "llmds",
                "-f", "txt",
                "-o", str(txt_output),
            ],
            capture_output=True,
            text=True,
            check=False,
        )
        # Parse results
        if json_output.exists():
            with open(json_output) as f:
                bandit_data = json.load(f)
            # Bandit aggregates per-file counts under the "_totals" key
            metrics = bandit_data.get("metrics", {})
            total = metrics.get("_totals", {})
            print(f"\nBandit Results:")
            print(f" HIGH: {total.get('SEVERITY.HIGH', 0)} issues")
            print(f" MEDIUM: {total.get('SEVERITY.MEDIUM', 0)} issues")
            print(f" LOW: {total.get('SEVERITY.LOW', 0)} issues")
            # Fixed label: this is the high-confidence count, not a grand
            # total (the old "Total:" label was misleading).
            print(f" High confidence: {total.get('CONFIDENCE.HIGH', 0)} issues")
            # List high severity issues
            high_severity = [
                issue for issue in bandit_data.get("results", [])
                if issue.get("issue_severity") == "HIGH"
            ]
            if high_severity:
                print(f"\n HIGH Severity Issues ({len(high_severity)}):")
                for issue in high_severity[:10]:  # Show first 10
                    print(f" - {issue.get('test_id')}: {issue.get('test_name')}")
                    print(f" File: {issue.get('filename')}:{issue.get('line_number')}")
            print(f"\n Full report: {txt_output}")
            print(f" JSON report: {json_output}")
            # Only HIGH severity findings fail the scan
            return total.get("SEVERITY.HIGH", 0) == 0
        else:
            print(" Warning: Bandit JSON output not found")
            return False
    except FileNotFoundError:
        print(" Error: Bandit not installed. Install with: pip install bandit[toml]")
        return False
    except Exception as e:
        print(f" Error running Bandit: {e}")
        return False
def run_pip_audit(output_dir: Path) -> bool:
    """
    Run pip-audit to check for known vulnerabilities in dependencies.

    Args:
        output_dir: Directory to save results

    Returns:
        True if no HIGH/CRITICAL vulnerabilities found
    """

    def _severity_of(vuln: dict) -> str:
        """Best-effort severity extraction from a pip-audit vulnerability record.

        pip-audit's JSON "aliases" entries are typically plain ID strings
        (e.g. CVE numbers), not dicts; the old code crashed on string entries
        (AttributeError) and on empty alias lists (IndexError). Guard both.
        """
        direct = vuln.get("severity")
        if isinstance(direct, str):
            return direct.upper()
        aliases = vuln.get("aliases") or []
        first = aliases[0] if aliases else None
        if isinstance(first, dict):
            return str(first.get("severity", "")).upper()
        return ""

    output_dir.mkdir(parents=True, exist_ok=True)
    json_output = output_dir / "pip_audit_report.json"
    txt_output = output_dir / "pip_audit_report.txt"
    print("\nRunning pip-audit security scanner...")
    print("=" * 80)
    # Defined up-front so the error handlers below can reference it safely
    # (previously it could be unbound if the first subprocess call raised).
    result = None
    try:
        # Run pip-audit
        result = subprocess.run(
            [
                sys.executable, "-m", "pip_audit",
                "--format", "json",
                "--output", str(json_output),
            ],
            capture_output=True,
            text=True,
            check=False,
        )
        # Also generate text output
        subprocess.run(
            [
                sys.executable, "-m", "pip_audit",
                "--format", "text",
                "--output", str(txt_output),
            ],
            capture_output=True,
            text=True,
            check=False,
        )
        # Parse results
        if json_output.exists():
            with open(json_output) as f:
                audit_data = json.load(f)
            # NOTE(review): recent pip-audit versions nest findings under
            # "dependencies"[i]["vulns"]; verify this top-level key against
            # the installed pip-audit version.
            vulnerabilities = audit_data.get("vulnerabilities", [])
            high_critical = [
                v for v in vulnerabilities
                if _severity_of(v) in ["HIGH", "CRITICAL"]
            ]
            print(f"\npip-audit Results:")
            print(f" Total vulnerabilities: {len(vulnerabilities)}")
            print(f" HIGH/CRITICAL: {len(high_critical)}")
            if high_critical:
                print(f"\n HIGH/CRITICAL Vulnerabilities:")
                for vuln in high_critical[:10]:  # Show first 10
                    package = vuln.get("name", "unknown")
                    severity = _severity_of(vuln) or "UNKNOWN"
                    print(f" - {package}: {severity}")
                    if "versions" in vuln:
                        print(f" Affected versions: {vuln['versions']}")
            print(f"\n Full report: {txt_output}")
            print(f" JSON report: {json_output}")
            return len(high_critical) == 0
        else:
            print(" Warning: pip-audit JSON output not found")
            # Check if there were errors
            if result.stderr:
                print(f" Error output: {result.stderr}")
            return False
    except FileNotFoundError:
        print(" Error: pip-audit not installed. Install with: pip install pip-audit")
        return False
    except Exception as e:
        print(f" Error running pip-audit: {e}")
        # `result` may still be None if the first subprocess call itself raised
        if result is not None and result.stderr:
            print(f" Error output: {result.stderr}")
        return False
def generate_sbom(output_dir: Path) -> bool:
    """
    Generate Software Bill of Materials (SBOM) using pip-audit.

    Note: this emits pip-audit's plain JSON dependency report rather than a
    formal SBOM standard; see the printed hint about CycloneDX tooling.

    Args:
        output_dir: Directory to save SBOM

    Returns:
        True if SBOM generated successfully
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    sbom_output = output_dir / "sbom.json"
    print("\nGenerating SBOM (Software Bill of Materials)...")
    print("=" * 80)
    try:
        # Try to generate SBOM using pip-audit (if supported). Success is
        # judged solely by whether the output file appears, so the process
        # result is not inspected (the unused `result` local was removed).
        # Note: pip-audit may need additional flags for SBOM generation.
        subprocess.run(
            [
                sys.executable, "-m", "pip_audit",
                "--format", "json",
                "--output", str(sbom_output),
            ],
            capture_output=True,
            text=True,
            check=False,
        )
        if sbom_output.exists():
            print(f" SBOM generated: {sbom_output}")
            print(" Note: For CycloneDX format, consider using cyclonedx-bom or pip-tools")
            return True
        else:
            print(" Warning: SBOM generation may require additional tools")
            print(" Consider using: cyclonedx-py or pip-tools for full SBOM")
            return False
    except Exception as e:
        print(f" Error generating SBOM: {e}")
        return False
def main():
    """Run all security scans.

    Runs Bandit, pip-audit, and SBOM generation (each individually
    skippable), prints a pass/fail summary, and returns a process exit code
    (0 = all selected scans passed, 1 = at least one failed).
    """
    import argparse
    parser = argparse.ArgumentParser(description="Run security scans")
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("audit/security"),
        help="Directory for security scan results (default: audit/security)",
    )
    parser.add_argument(
        "--skip-bandit",
        action="store_true",
        help="Skip Bandit scan",
    )
    parser.add_argument(
        "--skip-pip-audit",
        action="store_true",
        help="Skip pip-audit scan",
    )
    parser.add_argument(
        "--skip-sbom",
        action="store_true",
        help="Skip SBOM generation",
    )
    args = parser.parse_args()
    print("Security Scanning")
    print("=" * 80)
    print(f"Output directory: {args.output_dir}")
    print()
    # Maps scan name -> bool success for each scan that actually ran
    results = {}
    # Run Bandit
    if not args.skip_bandit:
        results["bandit"] = run_bandit(args.output_dir)
    else:
        print("Skipping Bandit scan")
    # Run pip-audit
    if not args.skip_pip_audit:
        results["pip_audit"] = run_pip_audit(args.output_dir)
    else:
        print("Skipping pip-audit scan")
    # Generate SBOM
    if not args.skip_sbom:
        results["sbom"] = generate_sbom(args.output_dir)
    else:
        print("Skipping SBOM generation")
    # Summary
    print("\n" + "=" * 80)
    print("Summary")
    print("=" * 80)
    # NOTE(review): all() over an empty dict is True, so skipping every scan
    # reports overall success — confirm that is the intended behavior.
    all_passed = all(results.values())
    for tool, passed in results.items():
        status = "✓ PASSED" if passed else "✗ FAILED"
        print(f" {tool}: {status}")
    if all_passed:
        print("\n✓ All security scans passed!")
        return 0
    else:
        print("\n✗ Some security issues found. Please review reports.")
        return 1
if __name__ == "__main__":
    sys.exit(main())