# Usage Guide

## Basic Examples

### KV Cache

```python
from llmds import KVCache

# Create cache with prefix sharing (enabled by default)
cache = KVCache(page_size=512, max_pages=10000, enable_prefix_sharing=True)

# Attach KV tokens for a sequence with prefix sharing
prefix = [1, 2, 3]  # Shared system prompt
kv_tokens = prefix + [4, 5, 6] * 100  # 303 tokens
cache.attach(seq_id=1, kv_tokens=kv_tokens, prefix_tokens=prefix)

# Second sequence with same prefix - will share pages
kv_tokens2 = prefix + [7, 8, 9] * 100
cache.attach(seq_id=2, kv_tokens=kv_tokens2, prefix_tokens=prefix)

# Retrieve (returns deep copy to prevent corruption)
cached = cache.get(seq_id=1)

# Copy-on-write: if you modify shared pages, they are automatically copied
# Shared pages are read-only until modified, then lazily copied

# Detach when done (reference counting handles shared pages)
cache.detach(seq_id=1)
cache.detach(seq_id=2)
```

**Copy-on-Write Behavior:**

- Shared pages (from prefix sharing) are read-only by default
- Writing different data to a shared page triggers lazy copying
- Each sequence gets its own copy of modified pages
- Original shared pages remain unchanged for other sequences
- `get()` always returns deep copies to prevent external corruption

### Scheduler

```python
from llmds import Scheduler

# Create scheduler
scheduler = Scheduler(max_batch_size=32, max_wait_ms=50.0)

# Submit requests
req_id1 = scheduler.submit(tokens=100)
req_id2 = scheduler.submit(tokens=200, slo_ms=100.0)  # SLO deadline

# Get batch (waits for max_wait_ms or until batch is full)
batch = scheduler.get_batch(force=False)

# Process batch...
# scheduler.complete_batch(batch)
```

### Admission Control

```python
from llmds import AdmissionController

# Create controller
controller = AdmissionController(qps_target=10.0, token_rate_limit=10000)

# Check admission
should_admit, reason = controller.should_admit(estimated_tokens=100)
if should_admit:
    # Process request
    controller.record_request(tokens=100)
else:
    # Reject request
    print(f"Rejected: {reason}")
```

### Retrieval Pipeline

```python
from llmds import RetrievalPipeline
import numpy as np

# Create pipeline with reproducible HNSW structure
pipeline = RetrievalPipeline(embedding_dim=384, seed=42)

# Add documents
for i in range(100):
    text = f"Document {i} content"
    embedding = np.random.randn(384).astype(np.float32)
    embedding = embedding / np.linalg.norm(embedding)
    pipeline.add_document(doc_id=i, text=text, embedding=embedding)

# Search
query = "example query"
query_embedding = np.random.randn(384).astype(np.float32)
query_embedding = query_embedding / np.linalg.norm(query_embedding)

results = pipeline.search(query, query_embedding=query_embedding, top_k=10)
for doc_id, score in results:
    print(f"Doc {doc_id}: {score:.4f}")
```

## Advanced Usage

### Custom Priority Function

```python
from llmds import Scheduler

def custom_priority_fn(req):
    # Prioritize by inverse token count
    return 1.0 / (req.tokens + 1.0)

scheduler = Scheduler(
    max_batch_size=32,
    max_wait_ms=50.0,
    priority_fn=custom_priority_fn
)
```

### Token Budget Management

```python
from llmds import TokenLRU

def token_counter(value):
    return len(str(value))

cache = TokenLRU(token_budget=1000, token_of=token_counter)

# Add items (evicts LRU if budget exceeded)
cache.put("key1", "value with many tokens")
cache.put("key2", "another value")

# Evict until target budget
evicted = cache.evict_until_budget(target_budget=500)
```

### HNSW Parameter Tuning

```python
from llmds import HNSW
import numpy as np

# Tune for better recall (higher memory)
hnsw_high_recall = HNSW(
    dim=384,
    M=32,                 # More connections
    ef_construction=400,  # More candidates during build
    ef_search=100,        # More candidates during search
    seed=42               # Reproducible graph structure
)

# Tune for faster search (lower memory)
hnsw_fast = HNSW(
    dim=384,
    M=8,                  # Fewer connections
    ef_construction=100,
    ef_search=20,         # Fewer candidates
    seed=42               # Reproducible graph structure
)

# Reproducible benchmarks
hnsw_bench = HNSW(dim=128, M=16, ef_construction=200, ef_search=50, seed=42)
# Same seed ensures identical graph structure across runs
```

## Benchmarking

### Running Benchmarks

```python
from benchmarks.bench_kv_cache import benchmark_kv_cache

results = benchmark_kv_cache(
    num_sequences=1000,
    tokens_per_seq=1000,
    page_size=512
)
print(f"P95 latency: {results['attach_p95_ms']:.2f} ms")
```

### Custom Benchmarks

```python
from llmds.utils import Timer, compute_percentiles, calculate_statistics

latencies = []

for i in range(100):
    with Timer() as t:
        # Your operation here
        pass
    latencies.append(t.elapsed * 1000)  # Convert to milliseconds

# Compute percentiles
percentiles = compute_percentiles(latencies)
print(f"P50: {percentiles['p50']:.2f} ms")
print(f"P95: {percentiles['p95']:.2f} ms")
print(f"P99: {percentiles['p99']:.2f} ms")

# Or compute comprehensive statistics
stats = calculate_statistics(latencies)
print(f"Mean: {stats['mean']:.2f} ± {stats['std']:.2f} ms")
print(f"95% CI: [{stats['ci_lower']:.2f}, {stats['ci_upper']:.2f}] ms")
print(f"CV: {stats['cv']:.2f}%")
```

## Memory Profiling

All benchmarks automatically measure peak RSS (Resident Set Size) using `psutil`:

```python
from llmds.utils import memory_profiler
import numpy as np

# Memory profiling in your benchmarks
with memory_profiler() as profiler:
    # Allocate memory
    data = np.random.randn(1000, 1000).astype(np.float32)
    profiler.sample()  # Optional: sample at specific points

    # More operations
    result = process_data(data)

peak_rss_mb = profiler.get_peak_rss_mb()
memory_delta_mb = profiler.get_memory_delta_mb()

print(f"Peak memory: {peak_rss_mb:.2f} MB")
print(f"Memory allocated: {memory_delta_mb:.2f} MB")
```

**Benchmark Results Include:**

- `peak_rss_mb`: Peak memory usage during benchmark
- `memory_delta_mb`: Memory allocated during execution (peak - initial)
- `build_peak_rss_mb`: Peak memory during build/indexing phase (where applicable)

All benchmark scripts automatically include memory profiling - no additional configuration needed.

## Integration Examples

### RAG Pipeline

```python
from llmds import RetrievalPipeline
import numpy as np

# Initialize
pipeline = RetrievalPipeline(embedding_dim=384)

# Index documents
documents = ["doc1", "doc2", "doc3"]
embeddings = [np.random.randn(384) for _ in documents]
for doc_id, (text, emb) in enumerate(zip(documents, embeddings)):
    emb = emb / np.linalg.norm(emb)
    pipeline.add_document(doc_id=doc_id, text=text, embedding=emb)

# Query
query_emb = np.random.randn(384)
query_emb = query_emb / np.linalg.norm(query_emb)
results = pipeline.search("query", query_embedding=query_emb, top_k=5)
```

### LLM Inference with KV Cache

```python
from llmds import KVCache, Scheduler, TokenLRU

# Setup
kv_cache = KVCache()
scheduler = Scheduler()
token_cache = TokenLRU(token_budget=100000, token_of=lambda x: len(str(x)))

# Process request
seq_id = 1
prompt_tokens = [1, 2, 3, 4, 5]
kv_tokens = generate_kv_cache(prompt_tokens)  # Your function

kv_cache.attach(seq_id=seq_id, kv_tokens=kv_tokens, prefix_tokens=prompt_tokens)

# Use cached KV for generation
cached_kv = kv_cache.get(seq_id)
# ... generate tokens using cached KV ...

# Cleanup
kv_cache.detach(seq_id)
```