Initial commit: LLM-DS optimizer framework with data files excluded

2025-11-06 22:20:11 -05:00
commit f83fe475df
52 changed files with 10666 additions and 0 deletions
--- a/scripts/profile_tail_latency.py
+++ b/scripts/profile_tail_latency.py
@@ -0,0 +1,247 @@
+"""Profile tail latency breakdown for retrieval pipeline.
+
+This script profiles latency components to identify bottlenecks causing
+extreme P99 tail latencies.
+"""
+
+import cProfile
+import pstats
+import statistics
+from pathlib import Path
+from typing import Dict, List
+
+import numpy as np
+
+from llmds.hnsw import HNSW
+from llmds.retrieval_pipeline import RetrievalPipeline
+
+
+def profile_hnsw_search(num_vectors: int = 10000, dim: int = 128, num_queries: int = 1000):
+    """Profile HNSW search operations."""
+    print(f"Profiling HNSW search with {num_vectors} vectors, dim={dim}, {num_queries} queries...")
+    
+    np.random.seed(42)
+    hnsw = HNSW(dim=dim, M=16, ef_construction=200, ef_search=50, seed=42)
+    
+    # Build index
+    vectors = []
+    for i in range(num_vectors):
+        vec = np.random.randn(dim).astype(np.float32)
+        vec = vec / np.linalg.norm(vec)
+        vectors.append(vec)
+        hnsw.add(vec, i)
+    
+    # Profile search operations
+    profiler = cProfile.Profile()
+    profiler.enable()
+    
+    search_times = []
+    for _ in range(num_queries):
+        query = np.random.randn(dim).astype(np.float32)
+        query = query / np.linalg.norm(query)
+        
+        import time
+        start = time.perf_counter()
+        results = hnsw.search(query, k=10)
+        elapsed = time.perf_counter() - start
+        search_times.append(elapsed * 1000)  # Convert to ms
+    
+    profiler.disable()
+    
+    # Compute latency statistics
+    search_times.sort()
+    p50 = search_times[len(search_times) // 2]
+    p95 = search_times[int(len(search_times) * 0.95)]
+    p99 = search_times[int(len(search_times) * 0.99)]
+    p99_9 = search_times[int(len(search_times) * 0.999)] if len(search_times) >= 1000 else p99
+    
+    print(f"\nHNSW Search Latency Statistics:")
+    print(f"  P50:  {p50:.3f} ms")
+    print(f"  P95:  {p95:.3f} ms")
+    print(f"  P99:  {p99:.3f} ms")
+    print(f"  P99.9: {p99_9:.3f} ms")
+    print(f"  Mean: {statistics.mean(search_times):.3f} ms")
+    print(f"  Max:  {max(search_times):.3f} ms")
+    
+    # Analyze P99 outliers
+    threshold = p95 * 2  # Outliers are 2x P95
+    outliers = [t for t in search_times if t > threshold]
+    if outliers:
+        print(f"\n  Outliers (>2x P95): {len(outliers)} queries ({len(outliers)/len(search_times)*100:.1f}%)")
+        print(f"    Outlier P50: {statistics.median(outliers):.3f} ms")
+        print(f"    Outlier Max: {max(outliers):.3f} ms")
+    
+    # Generate profiling report
+    stats = pstats.Stats(profiler)
+    stats.sort_stats("cumulative")
+    
+    print("\nTop 20 functions by cumulative time:")
+    print("=" * 80)
+    stats.print_stats(20)
+    
+    return {
+        "p50_ms": p50,
+        "p95_ms": p95,
+        "p99_ms": p99,
+        "p99_9_ms": p99_9,
+        "mean_ms": statistics.mean(search_times),
+        "max_ms": max(search_times),
+        "outlier_count": len(outliers),
+        "outlier_percent": len(outliers) / len(search_times) * 100 if search_times else 0,
+    }
+
+
+def profile_retrieval_pipeline(num_docs: int = 5000, num_queries: int = 500):
+    """Profile complete retrieval pipeline."""
+    print(f"\nProfiling RetrievalPipeline with {num_docs} docs, {num_queries} queries...")
+    
+    np.random.seed(42)
+    random = np.random.RandomState(42)
+    
+    pipeline = RetrievalPipeline(embedding_dim=128, seed=42)
+    
+    # Build index
+    for i in range(num_docs):
+        text = f"document {i} about topic {i % 10}"
+        embedding = random.randn(128).astype(np.float32)
+        embedding = embedding / np.linalg.norm(embedding)
+        pipeline.add_document(doc_id=i, text=text, embedding=embedding)
+    
+    # Profile search operations
+    profiler = cProfile.Profile()
+    profiler.enable()
+    
+    search_times = []
+    for _ in range(num_queries):
+        query_text = "document topic"
+        query_embedding = random.randn(128).astype(np.float32)
+        query_embedding = query_embedding / np.linalg.norm(query_embedding)
+        
+        import time
+        start = time.perf_counter()
+        results = pipeline.search(
+            query_text, query_embedding=query_embedding, top_k=10
+        )
+        elapsed = time.perf_counter() - start
+        search_times.append(elapsed * 1000)  # Convert to ms
+    
+    profiler.disable()
+    
+    # Compute latency statistics
+    search_times.sort()
+    p50 = search_times[len(search_times) // 2]
+    p95 = search_times[int(len(search_times) * 0.95)]
+    p99 = search_times[int(len(search_times) * 0.99)]
+    
+    print(f"\nRetrieval Pipeline Latency Statistics:")
+    print(f"  P50:  {p50:.3f} ms")
+    print(f"  P95:  {p95:.3f} ms")
+    print(f"  P99:  {p99:.3f} ms")
+    print(f"  Mean: {statistics.mean(search_times):.3f} ms")
+    print(f"  Max:  {max(search_times):.3f} ms")
+    
+    # Generate profiling report
+    stats = pstats.Stats(profiler)
+    stats.sort_stats("cumulative")
+    
+    print("\nTop 20 functions by cumulative time:")
+    print("=" * 80)
+    stats.print_stats(20)
+    
+    return {
+        "p50_ms": p50,
+        "p95_ms": p95,
+        "p99_ms": p99,
+        "mean_ms": statistics.mean(search_times),
+        "max_ms": max(search_times),
+    }
+
+
+def profile_latency_breakdown(num_vectors: int = 5000, dim: int = 128):
+    """Profile latency breakdown by component."""
+    print(f"\nProfiling latency breakdown with {num_vectors} vectors...")
+    
+    np.random.seed(42)
+    hnsw = HNSW(dim=dim, M=16, ef_construction=200, ef_search=50, seed=42)
+    
+    # Build index
+    vectors = []
+    for i in range(num_vectors):
+        vec = np.random.randn(dim).astype(np.float32)
+        vec = vec / np.linalg.norm(vec)
+        vectors.append(vec)
+        hnsw.add(vec, i)
+    
+    # Profile individual operations
+    import time
+    
+    search_times = []
+    distance_computation_times = []
+    
+    for _ in range(100):
+        query = np.random.randn(dim).astype(np.float32)
+        query = query / np.linalg.norm(query)
+        
+        # Profile distance computations
+        dist_start = time.perf_counter()
+        distances = [np.linalg.norm(query - vec) for vec in vectors[:100]]
+        dist_time = (time.perf_counter() - dist_start) * 1000
+        distance_computation_times.append(dist_time)
+        
+        # Profile search
+        search_start = time.perf_counter()
+        results = hnsw.search(query, k=10)
+        search_time = (time.perf_counter() - search_start) * 1000
+        search_times.append(search_time)
+    
+    print(f"\nLatency Breakdown:")
+    print(f"  Distance computation: {statistics.mean(distance_computation_times):.3f} ms (mean)")
+    print(f"  HNSW search: {statistics.mean(search_times):.3f} ms (mean)")
+    print(f"  Search/Distance ratio: {statistics.mean(search_times) / statistics.mean(distance_computation_times):.2f}x")
+
+
+def main():
+    """Run all profiling tasks."""
+    import argparse
+    
+    parser = argparse.ArgumentParser(description="Profile tail latency")
+    parser.add_argument("--output", type=Path, default=Path("audit/tail_latency_profile.txt"),
+                       help="Output file for profiling report")
+    parser.add_argument("--num-vectors", type=int, default=10000,
+                       help="Number of vectors for HNSW profiling")
+    parser.add_argument("--num-docs", type=int, default=5000,
+                       help="Number of documents for pipeline profiling")
+    parser.add_argument("--num-queries", type=int, default=1000,
+                       help="Number of queries to run")
+    args = parser.parse_args()
+    
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+    
+    # Redirect output to file
+    import sys
+    with open(args.output, "w") as f:
+        sys.stdout = f
+        try:
+            # Profile HNSW
+            hnsw_stats = profile_hnsw_search(args.num_vectors, 128, args.num_queries)
+            
+            # Profile pipeline
+            pipeline_stats = profile_retrieval_pipeline(args.num_docs, args.num_queries // 2)
+            
+            # Breakdown
+            profile_latency_breakdown(args.num_vectors, 128)
+        finally:
+            sys.stdout = sys.__stdout__
+    
+    print(f"\nProfiling complete. Report saved to: {args.output}")
+    print(f"\nKey Findings:")
+    print(f"  HNSW P99: {hnsw_stats['p99_ms']:.3f} ms")
+    print(f"  Pipeline P99: {pipeline_stats['p99_ms']:.3f} ms")
+    
+    if hnsw_stats.get("outlier_count", 0) > 0:
+        print(f"  HNSW Outliers: {hnsw_stats['outlier_count']} ({hnsw_stats['outlier_percent']:.1f}%)")
+
+
+if __name__ == "__main__":
+    main()
+