Initial commit: LLM-DS optimizer framework with data files excluded

This commit is contained in:
Carlos Gutierrez
2025-11-06 22:20:11 -05:00
commit f83fe475df
52 changed files with 10666 additions and 0 deletions

35
llmds/__init__.py Normal file
View File

@@ -0,0 +1,35 @@
"""
LLM Data Structures Optimizer.
A production-grade Python library for optimizing LLM inference and retrieval
through advanced data structures and algorithms.
"""
__version__ = "0.1.0"
from llmds.kv_cache import KVCache
from llmds.paged_allocator import PagedAllocator
from llmds.token_lru import TokenLRU
from llmds.indexed_heap import IndexedHeap
from llmds.scheduler import Scheduler
from llmds.admissions import AdmissionController
from llmds.inverted_index import InvertedIndex
from llmds.hnsw import HNSW
from llmds.cmsketch import CountMinSketch
from llmds.retrieval_pipeline import RetrievalPipeline
from llmds.tokenizer import Tokenizer
__all__ = [
"KVCache",
"PagedAllocator",
"TokenLRU",
"IndexedHeap",
"Scheduler",
"AdmissionController",
"InvertedIndex",
"HNSW",
"CountMinSketch",
"RetrievalPipeline",
"Tokenizer",
]

135
llmds/admissions.py Normal file
View File

@@ -0,0 +1,135 @@
"""Admission controller with rate limiting and QPS tracking."""
import time
from collections import deque
from typing import Optional
class AdmissionController:
    """
    Admission controller with token-rate limiting and moving-average QPS.

    Controls admission based on a token budget and a QPS target, both
    measured as averages over a sliding time window of ``window_size``
    seconds.
    """

    def __init__(
        self,
        qps_target: float = 10.0,
        token_rate_limit: int = 10000,
        window_size: int = 10,
    ):
        """
        Initialize admission controller.

        Args:
            qps_target: Target queries per second
            token_rate_limit: Maximum tokens per second
            window_size: Size of moving average window in seconds
        """
        self.qps_target = qps_target
        self.token_rate_limit = token_rate_limit
        self.window_size = window_size
        # Timestamps of admitted requests still inside the window.
        self._request_times: deque[float] = deque()
        # (timestamp, token_count) pairs still inside the window.
        self._token_history: deque[tuple[float, int]] = deque()
        self._admitted_requests = 0
        self._rejected_requests = 0

    def _cleanup_old_requests(self, current_time: float) -> None:
        """Remove requests and token records outside the time window."""
        while self._request_times and current_time - self._request_times[0] > self.window_size:
            self._request_times.popleft()
        while self._token_history and current_time - self._token_history[0][0] > self.window_size:
            self._token_history.popleft()

    def _get_current_qps(self, current_time: float) -> float:
        """Calculate current QPS averaged over the window."""
        self._cleanup_old_requests(current_time)
        if not self._request_times:
            return 0.0
        return len(self._request_times) / self.window_size

    def _get_current_token_rate(self, current_time: float) -> float:
        """Calculate current token rate (tokens/sec) averaged over the window."""
        self._cleanup_old_requests(current_time)
        if not self._token_history:
            return 0.0
        total_tokens = sum(tokens for _, tokens in self._token_history)
        return total_tokens / self.window_size

    def should_admit(self, estimated_tokens: int = 0) -> tuple[bool, str]:
        """
        Check if a request should be admitted; records it when admitted.

        Args:
            estimated_tokens: Estimated tokens for this request

        Returns:
            Tuple of (should_admit, reason)
        """
        current_time = time.time()
        current_qps = self._get_current_qps(current_time)
        current_token_rate = self._get_current_token_rate(current_time)
        # Check QPS limit
        if current_qps >= self.qps_target:
            self._rejected_requests += 1
            return False, f"QPS limit exceeded: {current_qps:.2f} >= {self.qps_target}"
        # Check token rate limit (the new request's tokens amortized over the window)
        if current_token_rate + estimated_tokens / self.window_size > self.token_rate_limit:
            self._rejected_requests += 1
            # was an f-string with no placeholders
            return False, "Token rate limit exceeded"
        # Admit request
        self._request_times.append(current_time)
        if estimated_tokens > 0:
            self._token_history.append((current_time, estimated_tokens))
        self._admitted_requests += 1
        return True, "admitted"

    def record_request(self, tokens: int) -> None:
        """
        Record a completed request with token count.

        NOTE(review): tokens recorded here are added on top of any estimate
        recorded by should_admit at admission time, so callers using both
        may double-count — confirm intended.

        Args:
            tokens: Number of tokens processed
        """
        current_time = time.time()
        self._token_history.append((current_time, tokens))

    def stats(self) -> dict[str, float]:
        """
        Get admission statistics.

        Returns:
            Dictionary with admission statistics (rates and counters)
        """
        current_time = time.time()
        current_qps = self._get_current_qps(current_time)
        current_token_rate = self._get_current_token_rate(current_time)
        total_requests = self._admitted_requests + self._rejected_requests
        rejection_rate = (
            self._rejected_requests / total_requests if total_requests > 0 else 0.0
        )
        return {
            "current_qps": current_qps,
            "target_qps": self.qps_target,
            "current_token_rate": current_token_rate,
            "token_rate_limit": self.token_rate_limit,
            "admitted_requests": self._admitted_requests,
            "rejected_requests": self._rejected_requests,
            "rejection_rate": rejection_rate,
        }

    def reset(self) -> None:
        """Reset all statistics and sliding-window state."""
        self._request_times.clear()
        self._token_history.clear()
        self._admitted_requests = 0
        self._rejected_requests = 0

72
llmds/chunking.py Normal file
View File

@@ -0,0 +1,72 @@
"""Text chunking utilities for RAG."""
from typing import Any, Iterator, Optional
def chunk_text(
    text: str,
    chunk_size: int = 512,
    overlap: int = 50,
    tokenizer: Optional[Any] = None,
) -> Iterator[str]:
    """
    Chunk text into overlapping segments.

    Args:
        text: Input text to chunk
        chunk_size: Target chunk size in tokens/characters
        overlap: Overlap between chunks (must be smaller than chunk_size)
        tokenizer: Optional tokenizer with encode()/decode(); if None,
            uses character-based chunking

    Yields:
        Text chunks

    Raises:
        ValueError: If overlap >= chunk_size (the stride would be <= 0;
            previously this surfaced as an opaque range() error).
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )
    if tokenizer is not None:
        # Token-based chunking
        tokens = tokenizer.encode(text)
        for start in range(0, len(tokens), step):
            yield tokenizer.decode(tokens[start:start + chunk_size])
    else:
        # Character-based chunking (simple fallback); empty text yields nothing
        for start in range(0, len(text), step):
            yield text[start:start + chunk_size]
def chunk_documents(
    documents: Iterator[dict[str, Any]],
    chunk_size: int = 512,
    overlap: int = 50,
    tokenizer: Optional[Any] = None,
) -> Iterator[dict[str, Any]]:
    """
    Chunk documents into smaller segments.

    Args:
        documents: Iterator of document dicts with 'id', 'text', 'meta'
        chunk_size: Target chunk size
        overlap: Overlap between chunks
        tokenizer: Optional tokenizer

    Yields:
        Chunk dictionaries with 'id', 'text', 'meta' (meta carries
        'doc_id', 'chunk_idx', 'total_chunks')
    """
    for document in documents:
        source_id = document["id"]
        body = document["text"]
        base_meta = document.get("meta", {})
        segments = list(chunk_text(body, chunk_size, overlap, tokenizer))
        segment_total = len(segments)
        for seg_idx, segment in enumerate(segments):
            # Chunk metadata extends (and may override) the document metadata.
            merged_meta = dict(base_meta)
            merged_meta["doc_id"] = source_id
            merged_meta["chunk_idx"] = seg_idx
            merged_meta["total_chunks"] = segment_total
            yield {
                "id": f"{source_id}_chunk_{seg_idx}",
                "text": segment,
                "meta": merged_meta,
            }

115
llmds/cmsketch.py Normal file
View File

@@ -0,0 +1,115 @@
"""Count-Min Sketch for hot query estimation and cache priming.
Implementation based on:
Cormode, G., & Muthukrishnan, S. (2005). An improved data stream summary:
the count-min sketch and its applications. Journal of Algorithms, 55(1), 58-75.
See docs/CITATIONS.md for full citation details.
"""
import mmh3
from typing import Optional
class CountMinSketch:
    """
    Count-Min Sketch for frequency estimation with conservative update.

    Uses `depth` hash functions (via MurmurHash3). Estimates are one-sided:
    estimate(item) is always >= the true frequency.

    Reference:
        Cormode & Muthukrishnan (2005). An improved data stream summary:
        the count-min sketch and its applications.
        Estan & Varghese (2002) introduced the conservative-update heuristic.
    """

    def __init__(self, width: int = 2048, depth: int = 4):
        """
        Initialize Count-Min Sketch.

        Args:
            width: Width of the sketch (number of counters per row)
            depth: Depth of the sketch (number of hash functions)
        """
        self.width = width
        self.depth = depth
        # depth x width grid of counters, one row per hash function.
        self._table: list[list[int]] = [[0] * width for _ in range(depth)]
        self._total_count = 0

    def _hash(self, item: str, seed: int) -> int:
        """Hash an item with a given seed into a column index."""
        return mmh3.hash(item, seed) % self.width

    def add(self, item: str, count: int = 1) -> None:
        """
        Add an item to the sketch using conservative update.

        Conservative update raises each of the item's counters only as far
        as needed so its estimate grows by `count`; counters are never
        decreased. (The previous implementation clamped colliding counters
        *down* to the row minimum, which could erase counts belonging to
        other items and break the estimate >= true-count guarantee.)

        Args:
            item: Item to add
            count: Count to add (default 1)
        """
        self._total_count += count
        indices = [self._hash(item, i) for i in range(self.depth)]
        current_estimate = min(self._table[i][idx] for i, idx in enumerate(indices))
        target = current_estimate + count
        for i, idx in enumerate(indices):
            if self._table[i][idx] < target:
                self._table[i][idx] = target

    def estimate(self, item: str) -> int:
        """
        Estimate the frequency of an item.

        Args:
            item: Item to estimate

        Returns:
            Estimated frequency (minimum across all rows; never undercounts)
        """
        return min(self._table[i][self._hash(item, i)] for i in range(self.depth))

    def get_error_bound(self) -> float:
        """
        Get theoretical additive error bound (with high probability).

        With probability at least 1 - (1/2)**depth, the overestimate of any
        item is at most (e / width) * total_count.

        Returns:
            Absolute error bound in counts (not a fraction)
        """
        import math

        epsilon = math.e / self.width  # was a hard-coded 2.71828
        return epsilon * self._total_count

    def get_total_count(self) -> int:
        """Get total count of all items added so far."""
        return self._total_count

    def is_hot(self, item: str, threshold: int) -> bool:
        """
        Check if an item is "hot" (above threshold).

        Args:
            item: Item to check
            threshold: Frequency threshold

        Returns:
            True if estimated frequency >= threshold
        """
        return self.estimate(item) >= threshold

    def reset(self) -> None:
        """Reset all counters and the total count."""
        self._table = [[0] * self.width for _ in range(self.depth)]
        self._total_count = 0

View File

@@ -0,0 +1,18 @@
"""Data source loaders for real corpora."""
from llmds.data_sources.msmarco import load_msmarco
from llmds.data_sources.beir_loader import load_beir
from llmds.data_sources.amazon_reviews import load_amazon_reviews
from llmds.data_sources.yelp import load_yelp
from llmds.data_sources.wikipedia import load_wikipedia
from llmds.data_sources.commoncrawl import load_commoncrawl
__all__ = [
"load_msmarco",
"load_beir",
"load_amazon_reviews",
"load_yelp",
"load_wikipedia",
"load_commoncrawl",
]

View File

@@ -0,0 +1,128 @@
"""Amazon Reviews 2023 dataset loader."""
import json
import itertools
from pathlib import Path
from typing import Iterator
try:
from datasets import load_dataset
HAS_DATASETS = True
except ImportError:
HAS_DATASETS = False
def download_amazon_reviews(output_dir: Path, limit: int | None = None, streaming: bool = True) -> Path:
    """
    Download Amazon Reviews 2023 dataset.

    Falls back to writing a synthetic placeholder corpus when the download
    fails for any reason.

    Args:
        output_dir: Directory to save corpus
        limit: Optional limit on number of reviews (None means no limit)
        streaming: Use streaming mode for large datasets

    Returns:
        Path to corpus JSONL file

    Raises:
        ImportError: If the `datasets` library is not installed.
    """
    if not HAS_DATASETS:
        raise ImportError(
            "Hugging Face datasets library required. Install with: pip install datasets"
        )
    output_dir.mkdir(parents=True, exist_ok=True)
    corpus_file = output_dir / "reviews.jsonl"
    if corpus_file.exists():
        print(f"Amazon Reviews corpus already exists at {corpus_file}")
        return corpus_file
    print(f"Downloading Amazon Reviews 2023 (limit={limit})...")
    try:
        # Try alternative dataset names or use streaming
        try:
            dataset = load_dataset(
                "McAuley-Lab/Amazon-Reviews-2023",
                split="train",
                streaming=streaming,
                trust_remote_code=True
            )
        except Exception:  # was a bare `except:`, which also swallowed KeyboardInterrupt
            # Fallback to streaming from hub
            from datasets import load_dataset_builder
            builder = load_dataset_builder("McAuley-Lab/Amazon-Reviews-2023")
            dataset = builder.as_streaming_dataset(split="train")
            streaming = True
        count = 0
        with open(corpus_file, "w", encoding="utf-8") as f:
            iterator = dataset if streaming else itertools.islice(dataset, limit)
            for row in iterator:
                # `limit is not None` so an explicit limit of 0 writes nothing
                # (previously 0 was treated as "no limit").
                if limit is not None and count >= limit:
                    break
                # Handle different field names
                title = (row.get("title") or row.get("Title") or "").strip()
                text = (row.get("text") or row.get("Text") or row.get("Body") or "").strip()
                combined_text = (title + " " + text).strip()
                if combined_text and len(combined_text) > 20:  # Minimum length
                    doc = {
                        "id": str(row.get("review_id", row.get("ReviewID", f"amazon_{count}"))),
                        "text": combined_text,
                        "meta": {
                            "asin": row.get("parent_asin", row.get("ParentASIN", "")),
                            "rating": row.get("rating", row.get("Rating")),
                            "verified": row.get("verified_purchase", row.get("VerifiedPurchase")),
                        }
                    }
                    f.write(json.dumps(doc, ensure_ascii=False) + "\n")
                    count += 1
                    if count % 10000 == 0:
                        print(f"Processed {count} reviews...")
        print(f"Downloaded {count} Amazon reviews to {corpus_file}")
    except Exception as e:
        print(f"Error downloading Amazon Reviews: {e}")
        print("Creating realistic placeholder corpus...")
        # Create more realistic placeholder
        reviews_texts = [
            "Great product! Works exactly as described. Highly recommend.",
            "Good quality for the price. Fast shipping. Satisfied customer.",
            "Not what I expected. Returned it after a week of use.",
            "Excellent value. This item exceeded my expectations. Will buy again.",
            "Decent product but could be better. Average quality for the price.",
        ]
        with open(corpus_file, "w", encoding="utf-8") as f:
            for i in range(limit or 200000):
                review_text = reviews_texts[i % len(reviews_texts)]
                doc = {
                    "id": f"amazon_{i}",
                    "text": f"Product Review {i}: {review_text} Details about the product, usage experience, and recommendations. This is placeholder text but provides realistic length for benchmarking.",
                    "meta": {"rating": (i % 5) + 1, "asin": f"B{i:08d}", "verified": i % 3 == 0}
                }
                f.write(json.dumps(doc, ensure_ascii=False) + "\n")
        print(f"Created placeholder with {limit or 200000} documents")
    return corpus_file
def load_amazon_reviews(corpus_file: Path) -> Iterator[dict]:
    """
    Load Amazon Reviews corpus from JSONL file.

    Args:
        corpus_file: Path to corpus JSONL file

    Yields:
        Document dictionaries with 'id', 'text', 'meta'
    """
    with open(corpus_file, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                yield json.loads(stripped)

View File

@@ -0,0 +1,141 @@
"""BEIR dataset loader."""
import json
from pathlib import Path
from typing import Iterator
try:
from datasets import load_dataset
HAS_DATASETS = True
except ImportError:
HAS_DATASETS = False
# Mapping of supported BEIR task names to their Hugging Face dataset ids.
# The original literal listed "nfcorpus" and "quora" twice; duplicate dict
# keys are silently collapsed by Python, so they are removed here and the
# entries are kept sorted for readability.
BEIR_TASKS = {
    "arguana": "BeIR/arguana",
    "climate-fever": "BeIR/climate-fever",
    "cqadupstack": "BeIR/cqadupstack",
    "dbpedia": "BeIR/dbpedia",
    "fever": "BeIR/fever",
    "fiqa": "BeIR/fiqa",
    "hotpotqa": "BeIR/hotpotqa",
    "msmarco": "BeIR/msmarco",
    "nfcorpus": "BeIR/nfcorpus",
    "nq": "BeIR/nq",
    "quora": "BeIR/quora",
    "scidocs": "BeIR/scidocs",
    "scifact": "BeIR/scifact",
    "signal1m": "BeIR/signal1m",
    "trec-covid": "BeIR/trec-covid",
    "trec-news": "BeIR/trec-news",
    "webis-touche2020": "BeIR/webis-touche2020",
}
def download_beir(task: str, output_dir: Path) -> Path:
    """
    Download BEIR dataset for a specific task.

    Falls back to a synthetic placeholder corpus when the download fails
    or the task has no configured Hugging Face mapping.

    Args:
        task: BEIR task name (e.g., 'fiqa', 'scidocs')
        output_dir: Directory to save corpus

    Returns:
        Path to corpus JSONL file

    Raises:
        ImportError: If the `datasets` library is not installed.
        ValueError: If `task` is not a known BEIR task.
    """
    if not HAS_DATASETS:
        raise ImportError(
            "Hugging Face datasets library required. Install with: pip install datasets"
        )
    if task not in BEIR_TASKS:
        raise ValueError(f"Unknown BEIR task: {task}. Available: {list(BEIR_TASKS.keys())}")
    output_dir.mkdir(parents=True, exist_ok=True)
    corpus_file = output_dir / "corpus.jsonl"
    if corpus_file.exists():
        print(f"BEIR {task} corpus already exists at {corpus_file}")
        return corpus_file
    print(f"Downloading BEIR task: {task}...")
    try:
        # Try direct HuggingFace dataset load.
        # BEIR datasets are available under different names.
        hf_name_map = {
            "fiqa": "mteb/fiqa",
            "scidocs": "mteb/scidocs",
            "nfcorpus": "mteb/nfcorpus",
            "msmarco": "ms_marco",
        }
        if task in hf_name_map:
            dataset_name = hf_name_map[task]
            print(f"Loading {dataset_name}...")
            # Try the 'corpus' split first, then 'train', then the default.
            # The bare `except:` clauses are narrowed to `except Exception:`
            # so KeyboardInterrupt/SystemExit are no longer swallowed.
            try:
                dataset = load_dataset(dataset_name, split="corpus", trust_remote_code=True)
            except Exception:
                try:
                    dataset = load_dataset(dataset_name, split="train", trust_remote_code=True)
                except Exception:
                    dataset = load_dataset(dataset_name, trust_remote_code=True)
            count = 0
            with open(corpus_file, "w", encoding="utf-8") as f:
                for item in dataset:
                    # Handle different BEIR formats
                    doc_id = str(item.get("_id", item.get("id", item.get("doc_id", f"{task}_{count}"))))
                    text = item.get("text", item.get("body", item.get("content", "")))
                    if text:
                        doc = {
                            "id": doc_id,
                            "text": text,
                            "meta": {"task": task, "title": item.get("title", "")}
                        }
                        f.write(json.dumps(doc, ensure_ascii=False) + "\n")
                        count += 1
                        if count % 10000 == 0:
                            print(f"Processed {count} documents...")
            print(f"Downloaded {count} BEIR {task} documents to {corpus_file}")
        else:
            raise ValueError(f"Direct HF loading not configured for {task}. Using placeholder.")
    except Exception as e:
        print(f"Error downloading BEIR {task}: {e}")
        print("Creating placeholder corpus...")
        # Create placeholder with more realistic size
        with open(corpus_file, "w", encoding="utf-8") as f:
            for i in range(50000):  # Larger placeholder
                doc = {
                    "id": f"beir_{task}_{i}",
                    "text": f"BEIR {task} document {i} content. Financial question answering corpus for retrieval evaluation. This document contains financial information and questions about investing, markets, and trading strategies.",
                    "meta": {"task": task}
                }
                f.write(json.dumps(doc, ensure_ascii=False) + "\n")
        print("Created placeholder with 50k documents")
    return corpus_file
def load_beir(corpus_file: Path) -> Iterator[dict]:
    """
    Load BEIR corpus from JSONL file.

    Args:
        corpus_file: Path to corpus JSONL file

    Yields:
        Document dictionaries with 'id', 'text', 'meta'
    """
    with open(corpus_file, "r", encoding="utf-8") as fh:
        # Skip blank lines; json.loads ignores surrounding whitespace.
        yield from (json.loads(entry) for entry in fh if entry.strip())

View File

@@ -0,0 +1,123 @@
"""Common Crawl loader."""
import json
from pathlib import Path
from typing import Iterator
def download_commoncrawl(output_dir: Path, cc_month: str | None = None, limit: int | None = None) -> Path:
"""
Download Common Crawl data.
Args:
output_dir: Directory to save corpus
cc_month: Common Crawl month (e.g., 'CC-MAIN-2025-14')
limit: Optional limit on documents
Returns:
Path to corpus JSONL file
"""
output_dir.mkdir(parents=True, exist_ok=True)
corpus_file = output_dir / "web_pages.jsonl"
if corpus_file.exists():
print(f"Common Crawl corpus already exists at {corpus_file}")
return corpus_file
print("Common Crawl requires cc-downloader tool.")
print("Install: pip install common-crawl-download")
print("Usage: See https://github.com/commoncrawl/cc-downloader")
print("Be respectful of bandwidth when downloading.")
# Placeholder
print("Creating placeholder corpus...")
with open(corpus_file, "w", encoding="utf-8") as f:
size = limit or 10000
for i in range(size):
doc = {
"id": f"cc_{i}",
"text": f"Common Crawl web page {i} content. This is a placeholder.",
"meta": {"url": f"https://example.com/page{i}", "cc_month": cc_month or "CC-MAIN-2025-14"}
}
f.write(json.dumps(doc, ensure_ascii=False) + "\n")
print(f"Created placeholder corpus with {size} documents")
return corpus_file
def process_commoncrawl_warc(warc_file: Path, output_file: Path, limit: int | None = None) -> None:
"""
Process Common Crawl WARC file to JSONL.
Args:
warc_file: Path to WARC file
output_file: Output JSONL path
limit: Optional limit on documents
"""
output_file.parent.mkdir(parents=True, exist_ok=True)
try:
from warcio.archiveiterator import ArchiveIterator
HAS_WARC = True
except ImportError:
HAS_WARC = False
print("Warning: warcio not installed. Install with: pip install warcio")
if not HAS_WARC:
print("Creating placeholder corpus...")
with open(output_file, "w", encoding="utf-8") as f:
for i in range(limit or 10000):
doc = {
"id": f"cc_{i}",
"text": f"Web page {i} content.",
"meta": {"url": f"https://example.com/page{i}"}
}
f.write(json.dumps(doc, ensure_ascii=False) + "\n")
return
count = 0
with open(warc_file, "rb") as infile, \
open(output_file, "w", encoding="utf-8") as outfile:
for record in ArchiveIterator(infile):
if limit and count >= limit:
break
if record.rec_type == "response" and record.http_headers.get_header("Content-Type", "").startswith("text/html"):
# Extract text (simplified - in production use beautifulsoup)
text = record.read_stream().decode("utf-8", errors="ignore")
# Simple HTML stripping (in production use html2text or similar)
import re
text = re.sub(r"<[^>]+>", "", text)
text = " ".join(text.split())
if len(text) > 100: # Minimum length
doc = {
"id": record.rec_headers.get_header("WARC-Record-ID", f"cc_{count}"),
"text": text[:10000], # Limit text length
"meta": {"url": record.rec_headers.get_header("WARC-Target-URI", "")}
}
outfile.write(json.dumps(doc, ensure_ascii=False) + "\n")
count += 1
if count % 1000 == 0:
print(f"Processed {count} pages...")
print(f"Processed {count} Common Crawl pages to {output_file}")
def load_commoncrawl(corpus_file: Path) -> Iterator[dict]:
    """
    Load Common Crawl corpus from JSONL file.

    Args:
        corpus_file: Path to corpus JSONL file

    Yields:
        Document dictionaries with 'id', 'text', 'meta'
    """
    with open(corpus_file, "r", encoding="utf-8") as src:
        for record_line in src:
            if not record_line.strip():
                continue  # skip blank lines
            yield json.loads(record_line)

View File

@@ -0,0 +1,110 @@
"""MS MARCO dataset loader."""
import json
import os
import subprocess
import tempfile
from pathlib import Path
from typing import Iterator
from urllib.request import urlretrieve
def download_msmarco(output_dir: Path, split: str = "passage") -> Path:
    """
    Download MS MARCO dataset.

    This is a placeholder implementation: it prints pointers to the real
    download and writes a small synthetic sample corpus.

    Args:
        output_dir: Directory to save files
        split: Dataset split ('passage' or 'doc')

    Returns:
        Path to downloaded corpus file
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    base_url = "https://msmarco.blob.core.windows.net/msmarcoranking"
    # URLs of the real collection, kept for a future real downloader.
    if split == "passage":
        collection_url = f"{base_url}/collection.tar.gz"
        queries_url = f"{base_url}/queries.tar.gz"
    else:
        collection_url = f"{base_url}/docranking/collection.tar.gz"
        queries_url = f"{base_url}/docranking/queries.tar.gz"
    corpus_file = output_dir / "corpus.jsonl"
    if corpus_file.exists():
        print(f"MS MARCO corpus already exists at {corpus_file}")
        return corpus_file
    print(f"Downloading MS MARCO {split} collection...")
    print("Note: For production use, download from https://microsoft.github.io/msmarco/")
    print("This is a placeholder implementation.")
    # Write a small synthetic sample instead of downloading the tarball.
    with open(corpus_file, "w", encoding="utf-8") as sink:
        for sample_idx in range(1000):
            sample_doc = {
                "id": f"msmarco_{sample_idx}",
                "text": f"MS MARCO passage {sample_idx} content. This is a placeholder.",
                "meta": {"split": split},
            }
            sink.write(json.dumps(sample_doc, ensure_ascii=False) + "\n")
    print(f"Created sample corpus at {corpus_file}")
    return corpus_file
def load_msmarco(corpus_file: Path) -> Iterator[dict]:
    """
    Load MS MARCO corpus from JSONL file.

    Args:
        corpus_file: Path to corpus JSONL file

    Yields:
        Document dictionaries with 'id', 'text', 'meta'
    """
    with open(corpus_file, "r", encoding="utf-8") as stream:
        # str.strip leaves blank lines falsy, so filter drops them.
        for entry in filter(str.strip, stream):
            yield json.loads(entry)
def normalize_msmarco(
collection_file: Path,
output_file: Path,
limit: int | None = None,
) -> None:
"""
Normalize MS MARCO collection to JSONL format.
Args:
collection_file: Path to MS MARCO collection TSV
output_file: Output JSONL path
limit: Optional limit on number of documents
"""
output_file.parent.mkdir(parents=True, exist_ok=True)
count = 0
with open(collection_file, "r", encoding="utf-8") as infile, \
open(output_file, "w", encoding="utf-8") as outfile:
for line in infile:
if limit and count >= limit:
break
parts = line.strip().split("\t", 2)
if len(parts) >= 2:
doc_id, text = parts[0], parts[1]
doc = {
"id": doc_id,
"text": text,
"meta": {"source": "msmarco"}
}
outfile.write(json.dumps(doc, ensure_ascii=False) + "\n")
count += 1
print(f"Normalized {count} documents to {output_file}")

View File

@@ -0,0 +1,109 @@
"""Wikipedia dump loader."""
import json
import subprocess
from pathlib import Path
from typing import Iterator
try:
import mwparserfromhell
HAS_WIKIPEDIA_PARSER = True
except ImportError:
HAS_WIKIPEDIA_PARSER = False
def download_wikipedia(output_dir: Path, latest: bool = True) -> Path:
    """
    Download Wikipedia pages-articles dump.

    The real dump must be fetched manually; this prints instructions and
    writes a small placeholder corpus.

    Args:
        output_dir: Directory to save corpus
        latest: Use latest dump (otherwise needs specific date)

    Returns:
        Path to corpus JSONL file
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    corpus_file = output_dir / "pages.jsonl"
    if corpus_file.exists():
        print(f"Wikipedia corpus already exists at {corpus_file}")
        return corpus_file
    for msg in (
        "Wikipedia dump requires manual download from https://dumps.wikimedia.org/enwiki/latest/",
        "Download: enwiki-latest-pages-articles-multistream.xml.bz2",
        "Then run: python scripts/process_wikipedia.py --input <dump> --output <path>",
        "Creating placeholder corpus...",
    ):
        print(msg)
    with open(corpus_file, "w", encoding="utf-8") as sink:
        for article_idx in range(1000):
            placeholder = {
                "id": f"wiki_{article_idx}",
                "text": f"Wikipedia article {article_idx} content. This is a placeholder.",
                "meta": {"title": f"Article {article_idx}"},
            }
            sink.write(json.dumps(placeholder, ensure_ascii=False) + "\n")
    return corpus_file
def process_wikipedia_dump(dump_file: Path, output_file: Path, limit: int | None = None) -> None:
    """
    Process Wikipedia XML dump to JSONL.

    Currently a placeholder: emits synthetic articles rather than parsing
    the dump (a real pipeline would use wikiextractor).

    Args:
        dump_file: Path to pages-articles XML dump
        output_file: Output JSONL path
        limit: Optional limit on articles
    """
    output_file.parent.mkdir(parents=True, exist_ok=True)
    if not HAS_WIKIPEDIA_PARSER:
        print("Warning: mwparserfromhell not installed. Install with: pip install mwparserfromhell")
        print("Creating placeholder corpus...")
        with open(output_file, "w", encoding="utf-8") as sink:
            for article_idx in range(1000):
                stub = {
                    "id": f"wiki_{article_idx}",
                    "text": f"Wikipedia article {article_idx} content.",
                    "meta": {"title": f"Article {article_idx}"},
                }
                sink.write(json.dumps(stub, ensure_ascii=False) + "\n")
        return
    # Use wikiextractor or similar tool
    print("Processing Wikipedia dump (this may take a while)...")
    print("For production, use wikiextractor: https://github.com/attardi/wikiextractor")
    # Placeholder implementation: synthetic articles instead of real parsing.
    written = 0
    with open(output_file, "w", encoding="utf-8") as sink:
        for article_idx in range(limit or 10000):
            record = {
                "id": f"wiki_{article_idx}",
                "text": f"Wikipedia article {article_idx} extracted text.",
                "meta": {"title": f"Article {article_idx}"},
            }
            sink.write(json.dumps(record, ensure_ascii=False) + "\n")
            written += 1
    print(f"Processed {written} Wikipedia articles to {output_file}")
def load_wikipedia(corpus_file: Path) -> Iterator[dict]:
    """
    Load Wikipedia corpus from JSONL file.

    Args:
        corpus_file: Path to corpus JSONL file

    Yields:
        Document dictionaries with 'id', 'text', 'meta'
    """
    with open(corpus_file, "r", encoding="utf-8") as reader:
        non_empty = (ln for ln in reader if ln.strip())
        for ln in non_empty:
            yield json.loads(ln)

111
llmds/data_sources/yelp.py Normal file
View File

@@ -0,0 +1,111 @@
"""Yelp Open Dataset loader."""
import json
from pathlib import Path
from typing import Iterator
def download_yelp(output_dir: Path) -> Path:
    """
    Download Yelp Open Dataset.

    The dataset requires manual download; this prints instructions and
    writes a small placeholder corpus.

    Args:
        output_dir: Directory to save corpus

    Returns:
        Path to corpus JSONL file
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    corpus_file = output_dir / "business_reviews.jsonl"
    if corpus_file.exists():
        print(f"Yelp corpus already exists at {corpus_file}")
        return corpus_file
    for msg in (
        "Yelp Open Dataset requires manual download from https://www.yelp.com/dataset",
        "After downloading, extract business.json and review.json",
        "Then run: python scripts/process_yelp.py --business <path> --review <path> --output <path>",
        "Creating placeholder corpus...",
    ):
        print(msg)
    with open(corpus_file, "w", encoding="utf-8") as sink:
        for idx in range(1000):
            stub = {
                "id": f"yelp_{idx}",
                "text": f"Yelp business {idx} review content. This is a placeholder.",
                "meta": {"business_id": f"biz_{idx}", "rating": 4.5},
            }
            sink.write(json.dumps(stub, ensure_ascii=False) + "\n")
    return corpus_file
def process_yelp_files(business_file: Path, review_file: Path, output_file: Path, limit: int | None = None) -> None:
"""
Process Yelp JSON files into normalized JSONL.
Args:
business_file: Path to business.json
review_file: Path to review.json
output_file: Output JSONL path
limit: Optional limit on documents
"""
output_file.parent.mkdir(parents=True, exist_ok=True)
# Load businesses
businesses = {}
if business_file.exists():
with open(business_file, "r", encoding="utf-8") as f:
for line in f:
if line.strip():
biz = json.loads(line)
businesses[biz["business_id"]] = biz
count = 0
with open(review_file, "r", encoding="utf-8") as infile, \
open(output_file, "w", encoding="utf-8") as outfile:
for line in infile:
if limit and count >= limit:
break
if line.strip():
review = json.loads(line)
biz_id = review.get("business_id")
biz = businesses.get(biz_id, {})
# Combine business name + review text
biz_name = biz.get("name", "")
review_text = review.get("text", "")
combined = f"{biz_name} {review_text}".strip()
if combined:
doc = {
"id": f"yelp_{review.get('review_id', count)}",
"text": combined,
"meta": {
"business_id": biz_id,
"rating": review.get("stars"),
"category": biz.get("categories"),
}
}
outfile.write(json.dumps(doc, ensure_ascii=False) + "\n")
count += 1
print(f"Processed {count} Yelp reviews to {output_file}")
def load_yelp(corpus_file: Path) -> Iterator[dict]:
    """
    Load Yelp corpus from JSONL file.

    Args:
        corpus_file: Path to corpus JSONL file

    Yields:
        Document dictionaries with 'id', 'text', 'meta'
    """
    with open(corpus_file, "r", encoding="utf-8") as infile:
        # Parse every non-blank line as a JSON document.
        yield from map(json.loads, (row for row in infile if row.strip()))

291
llmds/hnsw.py Normal file
View File

@@ -0,0 +1,291 @@
"""HNSW (Hierarchical Navigable Small World) for approximate nearest neighbor search.
Implementation based on:
Malkov, Y. A., & Yashunin, D. A. (2018). Efficient and robust approximate nearest
neighbor search using Hierarchical Navigable Small World graphs. IEEE transactions
on pattern analysis and machine intelligence, 42(4), 824-836.
See docs/CITATIONS.md for full citation details.
"""
import random
from typing import Any, Optional
import numpy as np
class HNSW:
"""
Hierarchical Navigable Small World graph for approximate nearest neighbor search.
Implements HNSW with configurable M, efConstruction, and efSearch parameters.
Reference:
Malkov & Yashunin (2018). Efficient and robust approximate nearest neighbor
search using Hierarchical Navigable Small World graphs.
"""
def __init__(
    self,
    dim: int,
    M: int = 16,
    ef_construction: int = 200,
    ef_search: int = 50,
    ml: float = 1.0 / np.log(2.0),
    seed: Optional[int] = None,
):
    """
    Initialize HNSW index.

    Args:
        dim: Dimension of vectors
        M: Maximum number of connections for each node
        ef_construction: Size of candidate set during construction
        ef_search: Size of candidate set during search
        ml: Normalization factor for level assignment
        seed: Optional random seed for reproducible level assignments.
            If None, uses the global random state.
    """
    self.dim = dim
    self.M = M
    self.ef_construction = ef_construction
    self.ef_search = ef_search
    self.ml = ml
    # Instance-level random state for reproducibility; with no seed the
    # `random` module itself (shared global state) is used instead.
    self._rng = random.Random(seed) if seed is not None else random
    # Layers: list of graphs, each graph is dict[node_id] -> list[neighbor_ids]
    self._layers: list[dict[int, list[int]]] = []
    self._vectors: dict[int, np.ndarray] = {}  # node_id -> vector
    self._max_level: dict[int, int] = {}  # node_id -> max level
    self._entry_point: Optional[int] = None  # id of the current top-level entry node
    self._entry_level = 0  # level of the entry point
def _random_level(self) -> int:
    """Draw a random insertion level for a new node.

    Each additional level is kept with probability exp(-ml), capped at 10,
    so levels follow a geometric distribution.
    NOTE(review): the HNSW paper draws floor(-ln(U) * ml) — this geometric
    variant is close but not identical; confirm the distribution is intended.
    """
    level = 0
    while self._rng.random() < np.exp(-self.ml) and level < 10:
        level += 1
    return level
def _distance(self, a: np.ndarray, b: np.ndarray) -> float:
    """Compute the L2 (Euclidean) distance between two vectors."""
    return float(np.linalg.norm(a - b))
def _search_layer(
    self,
    query: np.ndarray,
    k: int,
    entry_points: list[int],
    layer: dict[int, list[int]],
) -> list[tuple[int, float]]:
    """
    Search in a single layer using greedy best-first expansion.

    Args:
        query: Query vector
        k: Number of results to return
        entry_points: Starting points for search
        layer: Graph layer to search

    Returns:
        List of (node_id, distance) tuples sorted by ascending distance
    """
    if not entry_points:
        return []
    # Frontier of unexpanded nodes as (distance, node_id), kept sorted.
    candidates: list[tuple[float, int]] = []
    visited = set(entry_points)
    # Every node ever scored; the top-k of this list is returned.
    best_candidates: list[tuple[float, int]] = []
    # Initialize candidates with entry points (ids without a stored vector
    # are silently skipped).
    for ep in entry_points:
        if ep in self._vectors:
            dist = self._distance(query, self._vectors[ep])
            candidates.append((dist, ep))
            best_candidates.append((dist, ep))
    # Sort by distance
    candidates.sort()
    best_candidates.sort()
    # Greedy search: always expand the closest unvisited node next.
    while candidates:
        dist, current = candidates.pop(0)  # closest frontier node; dist itself unused
        # Explore neighbors
        if current in layer:
            for neighbor in layer[current]:
                if neighbor not in visited:
                    visited.add(neighbor)
                    if neighbor in self._vectors:
                        neighbor_dist = self._distance(query, self._vectors[neighbor])
                        candidates.append((neighbor_dist, neighbor))
                        best_candidates.append((neighbor_dist, neighbor))
        # Maintain top-ef_search candidates in the frontier.
        # NOTE(review): the frontier is truncated with ef_search even when
        # called during construction with k=ef_construction — confirm intended.
        candidates.sort()
        if len(candidates) > self.ef_search:
            candidates = candidates[: self.ef_search]
    # Sort best candidates and return top-k as (node_id, distance) tuples
    best_candidates.sort()
    results = [(node_id, dist) for dist, node_id in best_candidates[:k]]
    return results
    def add(self, vec: np.ndarray, vec_id: int) -> None:
        """
        Add a vector to the index, wiring it into every layer up to a
        randomly drawn level.

        Args:
            vec: Vector to add (must be of dimension self.dim)
            vec_id: Unique identifier for the vector

        Raises:
            ValueError: If the vector has the wrong shape or vec_id already
                exists in the index.
        """
        if vec.shape != (self.dim,):
            raise ValueError(f"Vector dimension mismatch: expected {self.dim}, got {vec.shape[0]}")
        if vec_id in self._vectors:
            raise ValueError(f"Vector ID {vec_id} already exists")
        # Copy so later caller-side mutation cannot corrupt the index
        self._vectors[vec_id] = vec.copy()
        level = self._random_level()
        self._max_level[vec_id] = level
        # Ensure we have enough layers
        while len(self._layers) <= level:
            self._layers.append({})
        # If this is the first node, set as entry point
        if self._entry_point is None:
            self._entry_point = vec_id
            self._entry_level = level
            for l in range(level + 1):
                self._layers[l][vec_id] = []
            return
        # Locate good entry points by descending through the layers.
        # NOTE(review): the paper's HNSW descends from the CURRENT entry
        # level down to level+1 before inserting; this loop instead starts
        # at min(level, entry_level) and runs all the way to layer 0 -
        # confirm whether this deviation is intentional.
        entry_points = [self._entry_point]
        # Start from top layer and work down
        for l in range(min(level, self._entry_level), -1, -1):
            # Search layer for candidates
            candidates = self._search_layer(
                vec, self.ef_construction, entry_points, self._layers[l]
            )
            entry_points = [node_id for node_id, _ in candidates]
        # Insert at all levels up to node's level
        for l in range(min(level, len(self._layers) - 1) + 1):
            if l == 0:
                # Bottom layer: connect to M neighbors
                candidates = self._search_layer(vec, self.M, entry_points, self._layers[l])
            else:
                # Upper layers: connect to M neighbors
                candidates = self._search_layer(vec, self.M, entry_points, self._layers[l])
            # Create connections
            neighbors = [node_id for node_id, _ in candidates[: self.M]]
            if vec_id not in self._layers[l]:
                self._layers[l][vec_id] = []
            # Add bidirectional connections
            for neighbor in neighbors:
                if neighbor not in self._layers[l]:
                    self._layers[l][neighbor] = []
                self._layers[l][vec_id].append(neighbor)
                self._layers[l][neighbor].append(vec_id)
                # Limit connections to M
                if len(self._layers[l][neighbor]) > self.M:
                    # Remove farthest connection (distances measured from
                    # the over-full neighbor, not from the query)
                    neighbor_vec = self._vectors[neighbor]
                    distances = [
                        (self._distance(self._vectors[n], neighbor_vec), n)
                        for n in self._layers[l][neighbor]
                    ]
                    distances.sort(reverse=True)
                    farthest = distances[0][1]
                    self._layers[l][neighbor].remove(farthest)
                    # NOTE(review): if pruning has previously made edges
                    # one-directional, `neighbor` may be absent from
                    # farthest's adjacency list and this .remove() would
                    # raise ValueError - verify the invariant holds.
                    if farthest in self._layers[l]:
                        self._layers[l][farthest].remove(neighbor)
            # Limit connections for new node
            if len(self._layers[l][vec_id]) > self.M:
                distances = [
                    (self._distance(self._vectors[n], vec), n) for n in self._layers[l][vec_id]
                ]
                distances.sort()
                self._layers[l][vec_id] = [n for _, n in distances[: self.M]]
            entry_points = neighbors
        # Update entry point if necessary
        if level > self._entry_level:
            self._entry_point = vec_id
            self._entry_level = level
    def search(self, query: np.ndarray, k: int) -> list[tuple[int, float]]:
        """
        Search for k nearest neighbors.

        Args:
            query: Query vector
            k: Number of results to return

        Returns:
            List of (vector_id, distance) tuples sorted by distance
            (empty if the index has no entry point yet)

        Raises:
            ValueError: If the query dimension does not match self.dim
        """
        if self._entry_point is None:
            return []
        if query.shape != (self.dim,):
            raise ValueError(f"Query dimension mismatch: expected {self.dim}, got {query.shape[0]}")
        # Start from top layer
        current = self._entry_point
        current_level = self._entry_level
        # Navigate down to level 0.
        # NOTE(review): only ONE greedy step is taken per layer (comparing
        # `current` against its direct neighbors once), not the iterate-to-
        # local-minimum descent of the HNSW paper - recall may suffer on
        # large graphs; confirm whether this is intentional.
        for l in range(current_level, 0, -1):
            if current not in self._layers[l]:
                continue
            # Find nearest neighbor in this layer
            neighbors = self._layers[l].get(current, [])
            if not neighbors:
                continue
            best_dist = self._distance(query, self._vectors[current])
            best_node = current
            for neighbor in neighbors:
                if neighbor in self._vectors:
                    dist = self._distance(query, self._vectors[neighbor])
                    if dist < best_dist:
                        best_dist = dist
                        best_node = neighbor
            current = best_node
        # Search layer 0
        results = self._search_layer(query, k, [current], self._layers[0])
        return results
def stats(self) -> dict[str, Any]:
"""
Get index statistics.
Returns:
Dictionary with index statistics
"""
total_edges = sum(sum(len(neighbors) for neighbors in layer.values()) for layer in self._layers)
return {
"num_vectors": len(self._vectors),
"num_layers": len(self._layers),
"entry_point": self._entry_point,
"entry_level": self._entry_level,
"total_edges": total_edges,
"avg_degree": total_edges / len(self._vectors) if self._vectors else 0.0,
}

272
llmds/indexed_heap.py Normal file
View File

@@ -0,0 +1,272 @@
"""Indexed binary heap with decrease/increase-key operations."""
from typing import Optional
class IndexedHeap:
    """
    Binary heap of (score, id) pairs with an auxiliary index map.

    The index map gives O(1) membership and score lookup by id, and enables
    O(log n) decrease-key, increase-key, and delete operations.
    """

    def __init__(self, max_heap: bool = False):
        """
        Initialize indexed heap.

        Args:
            max_heap: If True, the largest score sits at the top;
                otherwise the smallest does (min-heap, the default).
        """
        self._heap: list[tuple[float, int]] = []  # (score, id) pairs
        self._pos: dict[int, int] = {}  # id -> slot index in self._heap
        self._max_heap = max_heap

    def _compare(self, a: float, b: float) -> bool:
        """Return True when score `a` belongs above score `b`."""
        return a > b if self._max_heap else a < b

    def _swap(self, i: int, j: int) -> None:
        """Exchange slots i and j and keep the index map in sync."""
        heap = self._heap
        heap[i], heap[j] = heap[j], heap[i]
        self._pos[heap[i][1]] = i
        self._pos[heap[j][1]] = j

    def _bubble_up(self, idx: int) -> None:
        """Sift the entry at idx toward the root until ordered."""
        while idx > 0:
            parent = (idx - 1) >> 1
            if not self._compare(self._heap[idx][0], self._heap[parent][0]):
                return
            self._swap(idx, parent)
            idx = parent

    def _bubble_down(self, idx: int) -> None:
        """Sift the entry at idx toward the leaves until ordered."""
        n = len(self._heap)
        while True:
            top = idx
            for child in (2 * idx + 1, 2 * idx + 2):
                if child < n and self._compare(self._heap[child][0], self._heap[top][0]):
                    top = child
            if top == idx:
                return
            self._swap(idx, top)
            idx = top

    def push(self, key_id: int, score: float) -> None:
        """
        Insert a new (score, id) entry.

        Args:
            key_id: Unique identifier for the item
            score: Score/priority value

        Raises:
            ValueError: If key_id is already present
        """
        if key_id in self._pos:
            raise ValueError(f"Key {key_id} already exists in heap")
        self._pos[key_id] = len(self._heap)
        self._heap.append((score, key_id))
        self._bubble_up(len(self._heap) - 1)

    def pop(self) -> tuple[float, int]:
        """
        Remove and return the top entry.

        Returns:
            Tuple of (score, id)

        Raises:
            IndexError: If heap is empty
        """
        if not self._heap:
            raise IndexError("Cannot pop from empty heap")
        last = len(self._heap) - 1
        if last > 0:
            # Move the root to the tail so the pop is O(1)
            self._swap(0, last)
        score, key_id = self._heap.pop()
        self._pos.pop(key_id)
        if self._heap:
            self._bubble_down(0)
        return score, key_id

    def decrease_key(self, key_id: int, new_score: float) -> None:
        """
        Lower the score stored for key_id.

        The entry moves up in a min-heap (higher priority) and down in a
        max-heap (lower priority).

        Args:
            key_id: Item identifier
            new_score: New score; must be strictly smaller than the current one

        Raises:
            KeyError: If key_id not found
            ValueError: If new_score is not strictly smaller
        """
        if key_id not in self._pos:
            raise KeyError(f"Key {key_id} not found in heap")
        idx = self._pos[key_id]
        if new_score >= self._heap[idx][0]:
            heap_type = "max-heap" if self._max_heap else "min-heap"
            raise ValueError(f"For {heap_type}, new_score must be < old_score")
        self._heap[idx] = (new_score, key_id)
        # A smaller score sinks in a max-heap and rises in a min-heap
        (self._bubble_down if self._max_heap else self._bubble_up)(idx)

    def increase_key(self, key_id: int, new_score: float) -> None:
        """
        Raise the score stored for key_id.

        The entry moves down in a min-heap (lower priority) and up in a
        max-heap (higher priority).

        Args:
            key_id: Item identifier
            new_score: New score; must be strictly greater than the current one

        Raises:
            KeyError: If key_id not found
            ValueError: If new_score is not strictly greater
        """
        if key_id not in self._pos:
            raise KeyError(f"Key {key_id} not found in heap")
        idx = self._pos[key_id]
        if new_score <= self._heap[idx][0]:
            heap_type = "max-heap" if self._max_heap else "min-heap"
            raise ValueError(f"For {heap_type}, new_score must be > old_score")
        self._heap[idx] = (new_score, key_id)
        # A larger score rises in a max-heap and sinks in a min-heap
        (self._bubble_up if self._max_heap else self._bubble_down)(idx)

    def delete(self, key_id: int) -> tuple[float, int]:
        """
        Remove an arbitrary entry by id.

        Args:
            key_id: Item identifier

        Returns:
            Tuple of (score, id) that was removed

        Raises:
            KeyError: If key_id not found
        """
        if key_id not in self._pos:
            raise KeyError(f"Key {key_id} not found in heap")
        idx = self._pos[key_id]
        removed = self._heap[idx]
        self._swap(idx, len(self._heap) - 1)
        self._heap.pop()
        self._pos.pop(key_id)
        if idx < len(self._heap):
            # The element moved into idx may violate the heap property in
            # either direction; sift whichever way restores it.
            if idx > 0 and self._compare(self._heap[idx][0], self._heap[(idx - 1) >> 1][0]):
                self._bubble_up(idx)
            else:
                self._bubble_down(idx)
        return removed

    def peek(self) -> Optional[tuple[float, int]]:
        """Return the top (score, id) without removing it, or None if empty."""
        return self._heap[0] if self._heap else None

    def get_score(self, key_id: int) -> Optional[float]:
        """
        Look up the current score for key_id.

        Args:
            key_id: Item identifier

        Returns:
            Score value, or None if key_id is absent
        """
        idx = self._pos.get(key_id)
        return None if idx is None else self._heap[idx][0]

    def size(self) -> int:
        """Number of stored entries."""
        return len(self._heap)

    def is_empty(self) -> bool:
        """True when the heap holds no entries."""
        return not self._heap

    def contains(self, key_id: int) -> bool:
        """True when key_id is present."""
        return key_id in self._pos

222
llmds/inverted_index.py Normal file
View File

@@ -0,0 +1,222 @@
"""Compressed inverted index with BM25 scoring.
Implementation based on:
Robertson, S., & Zaragoza, H. (2009). The probabilistic relevance framework:
BM25 and beyond. Foundations and Trends in Information Retrieval, 3(4), 333-389.
See docs/CITATIONS.md for full citation details.
"""
import math
from collections import defaultdict
from typing import Any, Optional

from llmds.tokenizer import Tokenizer
class InvertedIndex:
    """
    Inverted index with varint/zigzag helpers and BM25 scoring.

    Maintains per-term postings lists plus per-document term statistics
    and ranks documents with BM25.

    Reference:
        Robertson & Zaragoza (2009). The probabilistic relevance framework:
        BM25 and beyond.
    """

    def __init__(self, tokenizer: Optional["Tokenizer"] = None):
        """
        Initialize inverted index.

        Args:
            tokenizer: Tokenizer instance (creates default if None)
        """
        self.tokenizer = tokenizer or Tokenizer()
        self._inverted_lists: dict[str, list[int]] = defaultdict(list)  # term -> doc_ids
        self._doc_lengths: dict[int, int] = {}  # doc_id -> length in tokens
        self._doc_terms: dict[int, dict[str, int]] = {}  # doc_id -> {term: count}
        self._total_docs = 0
        self._total_length = 0  # running sum of doc lengths; avoids O(n) rescans on add
        self._avg_doc_length = 0.0
        # BM25 parameters (standard defaults)
        self.k1 = 1.2
        self.b = 0.75

    def _encode_varint(self, value: int) -> bytes:
        """Encode a non-negative integer as a LEB128-style varint.

        Currently unused by the index itself (postings are stored as plain
        lists); kept as a building block for on-disk compression.
        """
        result = bytearray()
        while value >= 0x80:
            result.append((value & 0x7F) | 0x80)
            value >>= 7
        result.append(value & 0x7F)
        return bytes(result)

    def _decode_varint(self, data: bytes, offset: int) -> tuple[int, int]:
        """Decode a varint from `data` starting at `offset`.

        Returns:
            Tuple of (decoded value, offset just past the varint).
        """
        value = 0
        shift = 0
        while offset < len(data):
            byte = data[offset]
            value |= (byte & 0x7F) << shift
            offset += 1
            if (byte & 0x80) == 0:
                break
            shift += 7
        return value, offset

    def _zigzag_encode(self, value: int) -> int:
        """Zigzag-encode a signed integer to a non-negative one."""
        return (value << 1) ^ (value >> 31)

    def _zigzag_decode(self, value: int) -> int:
        """Invert _zigzag_encode."""
        return (value >> 1) ^ (-(value & 1))

    def add_document(self, doc_id: int, text: str) -> None:
        """
        Add (or replace) a document in the index.

        Args:
            doc_id: Document identifier
            text: Document text
        """
        tokens = self.tokenizer.encode(text)
        term_counts: dict[str, int] = defaultdict(int)
        # Count term frequencies
        for token_id in tokens:
            term = self.tokenizer.decode([token_id])
            if term:
                term_counts[term] += 1
        # Update postings; the membership check keeps each doc_id listed once.
        for term in term_counts:
            if doc_id not in self._inverted_lists[term]:
                self._inverted_lists[term].append(doc_id)
        # Re-adding an existing doc replaces its stats instead of
        # double-counting it in _total_docs / _total_length.
        # NOTE(review): postings for terms that disappeared from the new
        # version are not removed - confirm replacement is expected here.
        if doc_id in self._doc_lengths:
            self._total_length -= self._doc_lengths[doc_id]
        else:
            self._total_docs += 1
        self._doc_lengths[doc_id] = len(tokens)
        self._doc_terms[doc_id] = term_counts
        # Maintain the average incrementally (was an O(n) sum per add).
        self._total_length += len(tokens)
        self._avg_doc_length = (
            self._total_length / self._total_docs if self._total_docs > 0 else 0.0
        )

    def _bm25_score(self, term: str, doc_id: int, query_term_freq: int) -> float:
        """
        BM25 score contribution of `term` for document `doc_id`.

        Uses the log-scaled IDF from Robertson & Zaragoza (2009) with a +1
        inside the log (the Lucene variant) so IDF is never negative.

        Args:
            term: Query term
            doc_id: Document ID
            query_term_freq: Frequency of the term in the query

        Returns:
            BM25 score (0.0 if the term does not occur in the document)
        """
        doc_terms = self._doc_terms.get(doc_id)
        if not doc_terms or term not in doc_terms:
            return 0.0
        # Term frequency in document
        tf = doc_terms[term]
        # Document frequency
        df = len(self._inverted_lists.get(term, []))
        if df == 0:
            return 0.0
        # Log-scaled IDF; without the log, the raw ratio makes rare terms
        # dominate by orders of magnitude (the previous behavior).
        idf = math.log(1.0 + (self._total_docs - df + 0.5) / (df + 0.5))
        # Document length normalization
        doc_length = self._doc_lengths.get(doc_id, 1)
        length_norm = (1 - self.b) + self.b * (doc_length / self._avg_doc_length)
        # Query-side saturation factor (non-standard but monotone in qtf;
        # retained from the original scoring).
        query_weight = query_term_freq / (query_term_freq + 0.5)
        return idf * (tf * (self.k1 + 1)) / (tf + self.k1 * length_norm) * query_weight

    def search(self, query: str, top_k: int = 10) -> list[tuple[int, float]]:
        """
        Search the index with BM25 scoring.

        Args:
            query: Query text
            top_k: Number of top results to return

        Returns:
            List of (doc_id, score) tuples sorted by score descending
        """
        query_tokens = self.tokenizer.encode(query)
        query_term_counts: dict[str, int] = defaultdict(int)
        for token_id in query_tokens:
            term = self.tokenizer.decode([token_id])
            if term:
                query_term_counts[term] += 1
        # Accumulate per-document BM25 contributions over all query terms.
        doc_scores: dict[int, float] = defaultdict(float)
        for term, query_freq in query_term_counts.items():
            for doc_id in self._inverted_lists.get(term, []):
                doc_scores[doc_id] += self._bm25_score(term, doc_id, query_freq)
        # Sort by score and return top-k
        sorted_results = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)
        return sorted_results[:top_k]

    def get_term_frequency(self, term: str, doc_id: int) -> int:
        """
        Get term frequency in a document.

        Args:
            term: Term
            doc_id: Document ID

        Returns:
            Term frequency (0 if the document or term is unknown)
        """
        if doc_id in self._doc_terms:
            return self._doc_terms[doc_id].get(term, 0)
        return 0

    def get_document_frequency(self, term: str) -> int:
        """
        Get document frequency of a term.

        Args:
            term: Term

        Returns:
            Number of documents containing the term
        """
        return len(self._inverted_lists.get(term, []))

    def stats(self) -> dict[str, Any]:
        """
        Get index statistics.

        Returns:
            Dictionary with index statistics
        """
        total_postings = sum(len(postings) for postings in self._inverted_lists.values())
        return {
            "total_documents": self._total_docs,
            "total_terms": len(self._inverted_lists),
            "total_postings": total_postings,
            "avg_doc_length": self._avg_doc_length,
            "avg_postings_per_term": (
                total_postings / len(self._inverted_lists) if self._inverted_lists else 0.0
            ),
        }

281
llmds/kv_cache.py Normal file
View File

@@ -0,0 +1,281 @@
"""KV cache with paged allocation and prefix sharing.
Implementation based on techniques from:
Cache-Craft: Managing Chunk-Caches for Efficient Retrieval-Augmented Generation.
See docs/CITATIONS.md for full citation details.
"""
import copy
import hashlib
from typing import Any, Optional
from llmds.paged_allocator import PagedAllocator
class KVCache:
    """
    KV cache with paged allocation, prefix sharing, and deduplication.

    Implements copy-on-write (COW) for prefix sharing: shared pages are
    read-only until a write occurs, at which point they are copied.

    Reference:
        Cache-Craft: Managing Chunk-Caches for Efficient Retrieval-Augmented Generation.

    **Copy-on-Write Semantics:**

    - Shared pages (from prefix sharing) are read-only
    - Attempts to modify shared pages trigger lazy copying
    - Each sequence maintains its own copy of modified pages
    - Original shared pages remain unchanged for other sequences

    Supports hash-based deduplication of repeated system prompts.
    """

    def __init__(
        self,
        page_size: int = 512,
        max_pages: int = 10000,
        enable_prefix_sharing: bool = True,
    ):
        """
        Initialize KV cache.

        Args:
            page_size: Size of each KV cache page in tokens
            max_pages: Maximum number of pages to allocate
            enable_prefix_sharing: Enable prefix sharing optimization
        """
        self.allocator = PagedAllocator(page_size, max_pages)
        self.page_size = page_size
        self._sequences: dict[int, list[int]] = {}  # seq_id -> list[page_ids]
        self._kv_data: dict[int, Any] = {}  # page_id -> KV data
        self._prefix_map: dict[str, list[int]] = {}  # prefix hash -> page_ids
        self._page_refs: dict[int, int] = {}  # page_id -> reference count
        self._shared_pages: set[int] = set()  # page_ids that are shared (read-only)
        self._enable_prefix_sharing = enable_prefix_sharing
        self._seq_counter = 0
        self._prefix_shares = 0  # number of successful prefix-share hits

    def _hash_prefix(self, prefix: list[int]) -> str:
        """Compute a stable hash over (at most) the first 100 prefix tokens.

        Capping at 100 tokens bounds hashing cost; prefixes that only
        differ beyond token 100 will deliberately collide.
        """
        prefix_str = ",".join(map(str, prefix[:100]))  # Limit length
        return hashlib.sha256(prefix_str.encode()).hexdigest()

    def _copy_if_shared(self, page_id: int, seq_id: int) -> int:
        """
        Copy-on-write: if page is shared, create a new private copy.

        Args:
            page_id: Original page ID (may be shared)
            seq_id: Sequence ID requesting the copy

        Returns:
            New page_id if copied, original page_id if not shared
        """
        if page_id not in self._shared_pages:
            return page_id
        # Page is shared - need to copy
        new_page_id = self.allocator.alloc(1)[0]
        # Deep-copy so the new owner cannot mutate the shared original
        if page_id in self._kv_data:
            self._kv_data[new_page_id] = copy.deepcopy(self._kv_data[page_id])
        else:
            # Empty page
            self._kv_data[new_page_id] = []
        # Decrement reference count of original
        self._page_refs[page_id] = self._page_refs.get(page_id, 1) - 1
        if self._page_refs[page_id] <= 0:
            self._shared_pages.discard(page_id)
            if page_id in self._page_refs:
                del self._page_refs[page_id]
        # New page is not shared (single owner)
        self._page_refs[new_page_id] = 1
        return new_page_id

    def attach(
        self,
        seq_id: int,
        kv_tokens: list[Any],
        prefix_tokens: Optional[list[int]] = None,
    ) -> None:
        """
        Attach KV cache for a sequence.

        Implements copy-on-write: if prefix sharing is used, shared pages
        are referenced but will be copied on first write.

        Args:
            seq_id: Sequence identifier (re-attaching detaches the old state first)
            kv_tokens: KV tokens to cache
            prefix_tokens: Optional prefix tokens for sharing
        """
        if seq_id in self._sequences:
            self.detach(seq_id)
        pages_needed = (len(kv_tokens) + self.page_size - 1) // self.page_size
        new_page_ids = self.allocator.alloc(pages_needed)
        page_ids: list[int] = []
        # Try prefix sharing if enabled
        shared_prefix_pages: list[int] = []
        if self._enable_prefix_sharing and prefix_tokens:
            prefix_hash = self._hash_prefix(prefix_tokens)
            if prefix_hash in self._prefix_map:
                shared_prefix_pages = self._prefix_map[prefix_hash]
                # Reference shared pages (will be copied on write if needed)
                num_prefix_pages = min(len(shared_prefix_pages), pages_needed)
                page_ids.extend(shared_prefix_pages[:num_prefix_pages])
                # Update reference counts for shared pages
                for shared_page_id in shared_prefix_pages[:num_prefix_pages]:
                    self._page_refs[shared_page_id] = self._page_refs.get(shared_page_id, 0) + 1
                    self._shared_pages.add(shared_page_id)
                # Use remaining allocated pages for non-shared suffix
                page_ids.extend(new_page_ids[num_prefix_pages:])
                # BUGFIX: the first num_prefix_pages freshly allocated pages
                # were displaced by the shared prefix pages; previously they
                # were neither used nor freed and leaked permanently.
                if num_prefix_pages > 0:
                    self.allocator.free(new_page_ids[:num_prefix_pages])
                self._prefix_shares += 1
            else:
                # First time seeing this prefix - mark these pages as potential shared
                num_prefix_pages = min(
                    (len(prefix_tokens) + self.page_size - 1) // self.page_size,
                    pages_needed
                )
                # NOTE(review): these pages are registered in _prefix_map but
                # not ref-counted; if this sequence detaches before anyone
                # shares the prefix, _prefix_map keeps freed page ids -
                # confirm whether stale entries should be purged on detach.
                self._prefix_map[prefix_hash] = new_page_ids[:num_prefix_pages]
                page_ids = new_page_ids
        else:
            page_ids = new_page_ids
        # Store KV data with copy-on-write semantics
        # For shared pages: if data differs, trigger COW; otherwise, reference existing
        for i, page_id in enumerate(page_ids):
            start = i * self.page_size
            end = min(start + self.page_size, len(kv_tokens))
            page_data = kv_tokens[start:end]
            # Check if this page is shared
            if page_id in self._shared_pages:
                # Page is shared - check if data matches
                existing_data = self._kv_data.get(page_id, [])
                if existing_data != page_data:
                    # Data differs - trigger copy-on-write
                    page_id = self._copy_if_shared(page_id, seq_id)
                    page_ids[i] = page_id  # Update the page_id in our list
                    # Now safe to write (page is not shared)
                    self._kv_data[page_id] = page_data
                    if page_id not in self._page_refs:
                        self._page_refs[page_id] = 1
                # If data matches, no need to copy or write - just reference the shared page
            else:
                # Non-shared page - safe to write directly
                self._kv_data[page_id] = page_data
                if page_id not in self._page_refs:
                    self._page_refs[page_id] = 1
        self._sequences[seq_id] = page_ids

    def detach(self, seq_id: int) -> None:
        """
        Detach and free KV cache for a sequence.

        Decrements reference counts for shared pages. Pages are only freed
        when their reference count reaches zero.

        Args:
            seq_id: Sequence identifier (unknown ids are ignored)
        """
        if seq_id not in self._sequences:
            return
        page_ids = self._sequences[seq_id]
        # Update reference counts and free pages
        pages_to_free: list[int] = []
        for page_id in page_ids:
            if page_id in self._shared_pages:
                # Shared page - decrement reference count
                self._page_refs[page_id] = self._page_refs.get(page_id, 1) - 1
                if self._page_refs[page_id] <= 0:
                    # No more references - can free
                    self._shared_pages.discard(page_id)
                    if page_id in self._kv_data:
                        del self._kv_data[page_id]
                    if page_id in self._page_refs:
                        del self._page_refs[page_id]
                    pages_to_free.append(page_id)
            else:
                # Non-shared page - free immediately
                if page_id in self._kv_data:
                    del self._kv_data[page_id]
                if page_id in self._page_refs:
                    del self._page_refs[page_id]
                pages_to_free.append(page_id)
        # Free pages via allocator
        if pages_to_free:
            self.allocator.free(pages_to_free)
        del self._sequences[seq_id]

    def get(self, seq_id: int) -> Optional[list[Any]]:
        """
        Get KV cache for a sequence.

        Returns a deep copy of the data to prevent external modifications
        from affecting shared pages.

        Args:
            seq_id: Sequence identifier

        Returns:
            List of KV tokens or None if not found
        """
        if seq_id not in self._sequences:
            return None
        page_ids = self._sequences[seq_id]
        kv_tokens = []
        for page_id in page_ids:
            if page_id in self._kv_data:
                # Return copy to prevent external modification of shared pages
                kv_tokens.extend(copy.deepcopy(self._kv_data[page_id]))
        return kv_tokens

    def stats(self) -> dict[str, Any]:
        """
        Get cache statistics.

        Returns:
            Dictionary with cache statistics
        """
        alloc_stats = self.allocator.stats()
        return {
            "total_sequences": len(self._sequences),
            "total_pages": alloc_stats.total_pages,
            "allocated_pages": alloc_stats.allocated_pages,
            "free_pages": alloc_stats.free_pages,
            "prefix_shares": self._prefix_shares,
            "prefix_map_size": len(self._prefix_map),
            "shared_pages_count": len(self._shared_pages),
            "total_page_refs": sum(self._page_refs.values()),
        }

    def hook_speculative_decode(self, seq_id: int, draft_tokens: list[int]) -> None:
        """
        Hook for speculative decoding compatibility.

        Placeholder API for future implementation; currently a no-op.

        Args:
            seq_id: Sequence identifier
            draft_tokens: Draft tokens from speculative decoding
        """
        # Placeholder for speculative decoding integration
        pass

117
llmds/paged_allocator.py Normal file
View File

@@ -0,0 +1,117 @@
"""Paged memory allocator with slab allocation for KV cache."""
from dataclasses import dataclass
from typing import Optional
@dataclass
class PageStats:
    """Statistics for page allocation."""

    total_pages: int  # total capacity of the allocator, in pages
    allocated_pages: int  # pages currently marked allocated
    free_pages: int  # pages currently on the freelist
    fragmentation_ratio: float  # as implemented: 1 - free/total (i.e. utilization)
    allocation_count: int  # cumulative number of pages ever allocated
    free_count: int  # cumulative number of pages ever freed
class PagedAllocator:
"""
Paged memory allocator with fixed-size pages and freelist management.
Uses a slab allocator approach with freelists for efficient allocation
and deallocation of fixed-size page blocks.
"""
def __init__(self, page_size: int, max_pages: int):
"""
Initialize the paged allocator.
Args:
page_size: Size of each page in tokens/bytes
max_pages: Maximum number of pages to allocate
"""
self.page_size = page_size
self.max_pages = max_pages
self._pages: list[Optional[bool]] = [None] * max_pages # None=free, True=allocated
self._free_list: list[int] = list(range(max_pages))
self._allocation_count = 0
self._free_count = 0
def alloc(self, num_pages: int) -> list[int]:
"""
Allocate a contiguous block of pages.
Args:
num_pages: Number of pages to allocate
Returns:
List of page IDs (indices)
Raises:
ValueError: If insufficient pages available
"""
if len(self._free_list) < num_pages:
raise ValueError(f"Insufficient pages: requested {num_pages}, available {len(self._free_list)}")
allocated = []
for _ in range(num_pages):
page_id = self._free_list.pop(0)
self._pages[page_id] = True
allocated.append(page_id)
self._allocation_count += 1
return allocated
def free(self, page_ids: list[int]) -> None:
"""
Free a list of pages.
Args:
page_ids: List of page IDs to free
"""
for page_id in page_ids:
if 0 <= page_id < self.max_pages and self._pages[page_id] is True:
self._pages[page_id] = None
self._free_list.append(page_id)
self._free_count += 1
def stats(self) -> PageStats:
"""
Get allocation statistics.
Returns:
PageStats object with current statistics
"""
allocated = sum(1 for p in self._pages if p is True)
free = len(self._free_list)
fragmentation = 1.0 - (free / self.max_pages) if self.max_pages > 0 else 0.0
return PageStats(
total_pages=self.max_pages,
allocated_pages=allocated,
free_pages=free,
fragmentation_ratio=fragmentation,
allocation_count=self._allocation_count,
free_count=self._free_count,
)
def defragment(self) -> None:
"""
Defragment pages by compacting allocated pages.
This is a simple implementation that moves allocated pages
to the front. More sophisticated strategies could be implemented.
"""
allocated_indices = [i for i, p in enumerate(self._pages) if p is True]
free_indices = [i for i, p in enumerate(self._pages) if p is None]
# Simple compaction: move allocated pages to front
new_pages: list[bool | None] = [None] * self.max_pages
for i, idx in enumerate(allocated_indices):
new_pages[i] = True
self._pages = new_pages
self._free_list = list(range(len(allocated_indices), self.max_pages))

213
llmds/retrieval_pipeline.py Normal file
View File

@@ -0,0 +1,213 @@
"""Retrieval pipeline combining ANN, lexical search, and fusion."""
from typing import Any, Optional
import numpy as np
from llmds.cmsketch import CountMinSketch
from llmds.hnsw import HNSW
from llmds.indexed_heap import IndexedHeap
from llmds.inverted_index import InvertedIndex
from llmds.token_lru import TokenLRU
from llmds.tokenizer import Tokenizer
class RetrievalPipeline:
    """
    End-to-end retrieval pipeline combining ANN, lexical search, and fusion.

    Combines HNSW for dense embeddings, an inverted index for BM25, and
    weighted score fusion with top-K selection via an indexed min-heap.
    """

    def __init__(
        self,
        embedding_dim: int = 384,
        hnsw_M: int = 16,
        hnsw_ef_construction: int = 200,
        hnsw_ef_search: int = 50,
        token_budget: int = 100000,
        tokenizer: Optional[Tokenizer] = None,
        seed: Optional[int] = None,
    ):
        """
        Initialize retrieval pipeline.

        Args:
            embedding_dim: Dimension of embedding vectors
            hnsw_M: HNSW M parameter
            hnsw_ef_construction: HNSW efConstruction parameter
            hnsw_ef_search: HNSW efSearch parameter
            token_budget: Token budget for the query-result cache
            tokenizer: Tokenizer instance
            seed: Optional random seed for HNSW reproducibility (default: None)
        """
        self.tokenizer = tokenizer or Tokenizer()
        self.hnsw = HNSW(
            dim=embedding_dim,
            M=hnsw_M,
            ef_construction=hnsw_ef_construction,
            ef_search=hnsw_ef_search,
            seed=seed,
        )
        self.inverted_index = InvertedIndex(tokenizer=self.tokenizer)
        self.cmsketch = CountMinSketch(width=2048, depth=4)
        self.token_cache: TokenLRU[str, str] = TokenLRU[str, str](
            token_budget=token_budget,
            token_of=lambda text: self.tokenizer.count_tokens(text),
        )

    def add_document(
        self,
        doc_id: int,
        text: str,
        embedding: Optional[np.ndarray] = None,
    ) -> None:
        """
        Add a document to both indices.

        Args:
            doc_id: Document identifier
            text: Document text
            embedding: Optional embedding vector (if None, generates random)

        Raises:
            ValueError: If the provided embedding has the wrong dimension
        """
        # Add to inverted index
        self.inverted_index.add_document(doc_id, text)
        # Add to HNSW if embedding provided
        if embedding is not None:
            if embedding.shape != (self.hnsw.dim,):
                raise ValueError(
                    f"Embedding dimension mismatch: expected {self.hnsw.dim}, "
                    f"got {embedding.shape[0]}"
                )
            self.hnsw.add(embedding, doc_id)
        else:
            # Generate a random unit embedding for testing.
            # NOTE(review): uses the global numpy RNG, so this path is not
            # reproducible even when `seed` was passed - confirm acceptable.
            random_embedding = np.random.randn(self.hnsw.dim).astype(np.float32)
            random_embedding = random_embedding / np.linalg.norm(random_embedding)
            self.hnsw.add(random_embedding, doc_id)

    def search(
        self,
        query: str,
        query_embedding: Optional[np.ndarray] = None,
        top_k: int = 10,
        fusion_weight: float = 0.5,
    ) -> list[tuple[int, float]]:
        """
        Search with hybrid retrieval and score fusion.

        Args:
            query: Query text
            query_embedding: Optional query embedding vector
            top_k: Number of results to return
            fusion_weight: Weight for dense search (1-fusion_weight for BM25)

        Returns:
            List of (doc_id, fused_score) tuples, highest score first
        """
        # Check cache
        cached = self.token_cache.get(query)
        if cached:
            self.cmsketch.add(query)
            # Cached value is str(results); parse it back into a list.
            import ast  # local import: only needed on the cache-hit path
            try:
                parsed_results = ast.literal_eval(cached)
                if isinstance(parsed_results, list):
                    return parsed_results
            except (ValueError, SyntaxError):
                pass  # Fall through to compute results
        # BM25 search (over-fetch to give fusion more candidates)
        bm25_results = self.inverted_index.search(query, top_k=top_k * 2)
        # Dense search (if embedding provided)
        dense_results = []
        if query_embedding is not None:
            dense_results = self.hnsw.search(query_embedding, k=top_k * 2)
        # Normalize scores
        bm25_scores: dict[int, float] = {doc_id: score for doc_id, score in bm25_results}
        dense_scores: dict[int, float] = {}
        if dense_results:
            max_dense = max(dist for _, dist in dense_results) if dense_results else 1.0
            min_dense = min(dist for _, dist in dense_results) if dense_results else 0.0
            dense_range = max_dense - min_dense if max_dense > min_dense else 1.0
            for doc_id, dist in dense_results:  # HNSW.search returns (node_id, distance)
                # Convert distance to similarity (smaller distance -> higher score)
                normalized = 1.0 - (dist - min_dense) / dense_range if dense_range > 0 else 1.0
                dense_scores[doc_id] = normalized
        # Normalize BM25 scores to [0, 1]
        if bm25_scores:
            max_bm25 = max(bm25_scores.values())
            min_bm25 = min(bm25_scores.values())
            bm25_range = max_bm25 - min_bm25 if max_bm25 > min_bm25 else 1.0
            for doc_id in bm25_scores:
                bm25_scores[doc_id] = (
                    (bm25_scores[doc_id] - min_bm25) / bm25_range if bm25_range > 0 else 1.0
                )
        # Weighted fusion of the two normalized score sets
        fused_scores: dict[int, float] = {}
        all_doc_ids = set(bm25_scores.keys()) | set(dense_scores.keys())
        for doc_id in all_doc_ids:
            bm25_score = bm25_scores.get(doc_id, 0.0)
            dense_score = dense_scores.get(doc_id, 0.0)
            # Weighted fusion
            fused_score = fusion_weight * dense_score + (1 - fusion_weight) * bm25_score
            fused_scores[doc_id] = fused_score
        # Top-K selection with a MIN-heap: the root is always the weakest of
        # the current top-k, so it is the one evicted when a better candidate
        # arrives. (BUGFIX: the previous max-heap peeked/popped the BEST
        # result on every comparison, evicting top documents, and the final
        # reverse() then returned results in ascending-score order.)
        heap = IndexedHeap(max_heap=False)
        for doc_id, score in fused_scores.items():
            if heap.size() < top_k:
                heap.push(doc_id, score)
            else:
                weakest = heap.peek()
                if weakest is not None and score > weakest[0]:
                    heap.pop()
                    heap.push(doc_id, score)
        # Pop yields ascending scores; reverse so the best comes first.
        results = []
        while not heap.is_empty():
            score, doc_id = heap.pop()
            results.append((doc_id, score))
        results.reverse()  # Highest score first
        # Cache results (store as string representation for token counting)
        results_str = str(results)
        self.token_cache.put(query, results_str)
        self.cmsketch.add(query)
        return results

    def stats(self) -> dict[str, Any]:
        """
        Get pipeline statistics.

        Returns:
            Dictionary with pipeline statistics
        """
        hnsw_stats = self.hnsw.stats()
        index_stats = self.inverted_index.stats()
        return {
            "hnsw": hnsw_stats,
            "inverted_index": index_stats,
            "cmsketch_total_count": self.cmsketch.get_total_count(),
            "cache_size": self.token_cache.size(),
            "cache_tokens": self.token_cache.total_tokens(),
        }

216
llmds/scheduler.py Normal file
View File

@@ -0,0 +1,216 @@
"""Dynamic micro-batching scheduler with priority queue."""
import time
from dataclasses import dataclass
from typing import Any, Callable, Optional
from llmds.indexed_heap import IndexedHeap
@dataclass
class Request:
    """Represents a request in the scheduler."""

    # Identifier assigned by Scheduler.submit() from its internal counter.
    request_id: int
    # Estimated token count for the request; used by the default priority fn.
    tokens: int
    priority: float  # Higher = more priority
    # Wall-clock submission time (time.time()), used for wait/SLO accounting.
    created_at: float
    slo_ms: Optional[float] = None  # Service level objective in milliseconds
class Scheduler:
    """
    Dynamic micro-batching scheduler with priority-based queuing.

    Uses an indexed heap to prioritize sequences by remaining length or SLO.
    Supports dynamic batching with configurable waiting time vs. throughput
    trade-offs.
    """

    def __init__(
        self,
        max_batch_size: int = 32,
        max_wait_ms: float = 50.0,
        priority_fn: Optional[Callable[[Request], float]] = None,
    ):
        """
        Initialize scheduler.

        Args:
            max_batch_size: Maximum batch size
            max_wait_ms: Maximum wait time in milliseconds before batching
            priority_fn: Optional function to compute priority from request.
                Default: prioritize by remaining tokens (inverse)
        """
        self.max_batch_size = max_batch_size
        self.max_wait_ms = max_wait_ms
        self._heap = IndexedHeap(max_heap=True)  # Max heap: highest priority pops first
        self._requests: dict[int, Request] = {}
        self._priority_fn = priority_fn or self._default_priority_fn
        self._request_counter = 0
        self._batch_count = 0
        self._total_processed = 0

    def _default_priority_fn(self, req: Request) -> float:
        """Default priority: higher priority for shorter sequences (inverse of tokens)."""
        return 1.0 / (req.tokens + 1.0)

    def _slo_priority_fn(self, req: Request) -> float:
        """Priority based on SLO deadline: closer (or missed) deadlines rank higher."""
        if req.slo_ms is None:
            return self._default_priority_fn(req)
        elapsed_ms = (time.time() - req.created_at) * 1000
        remaining_ms = req.slo_ms - elapsed_ms
        if remaining_ms <= 0:
            return float("inf")  # Urgent: past deadline
        return 1.0 / (remaining_ms + 1.0)

    def submit(self, tokens: int, slo_ms: Optional[float] = None) -> int:
        """
        Submit a request to the scheduler.

        Args:
            tokens: Estimated token count for the request
            slo_ms: Optional SLO deadline in milliseconds

        Returns:
            Request ID
        """
        req_id = self._request_counter
        self._request_counter += 1
        # Build the request once and fill in its computed priority afterwards
        # (the original constructed a second, throwaway Request just to score it).
        req = Request(
            request_id=req_id,
            tokens=tokens,
            priority=0.0,
            created_at=time.time(),
            slo_ms=slo_ms,
        )
        req.priority = self._priority_fn(req)
        self._requests[req_id] = req
        self._heap.push(req_id, req.priority)
        return req_id

    def get_batch(self, force: bool = False) -> Optional[list[int]]:
        """
        Get next batch of requests to process.

        Args:
            force: If True, return batch even if not full

        Returns:
            List of request IDs or None if no batch ready
        """
        if self._heap.is_empty():
            return None
        # Gate on how long the oldest pending request has been waiting.
        # BUGFIX: the original tested `if oldest_req_id:`, which is falsy for
        # request ID 0 and silently skipped the max-wait gate for it.
        if self._requests:
            oldest_time = min(r.created_at for r in self._requests.values())
            wait_time_ms = (time.time() - oldest_time) * 1000
            if not force and wait_time_ms < self.max_wait_ms:
                return None
        # Pop the highest-priority requests. Stale heap entries (IDs whose
        # request was already completed/removed) are simply discarded.
        # BUGFIX: the original pushed stale IDs onto a temp heap and then
        # re-inserted them via self._requests[req_id], raising KeyError.
        batch: list[int] = []
        while len(batch) < self.max_batch_size and not self._heap.is_empty():
            _, req_id = self._heap.pop()
            if req_id in self._requests:
                batch.append(req_id)
        if batch:
            self._batch_count += 1
            self._total_processed += len(batch)
            return batch
        return None

    def complete_batch(self, request_ids: list[int]) -> None:
        """
        Mark a batch as completed and remove requests.

        Args:
            request_ids: List of completed request IDs
        """
        for req_id in request_ids:
            if req_id in self._requests:
                # Remove any residual heap entry (e.g. one re-added by
                # update_priority after the batch was formed).
                if self._heap.contains(req_id):
                    try:
                        self._heap.delete(req_id)
                    except KeyError:
                        pass
                del self._requests[req_id]

    def update_priority(self, request_id: int, new_tokens: int) -> None:
        """
        Update priority for a request (e.g., after partial processing).

        Args:
            request_id: Request identifier
            new_tokens: Updated token count
        """
        if request_id not in self._requests:
            return
        req = self._requests[request_id]
        req.tokens = new_tokens
        new_priority = self._priority_fn(req)
        if self._heap.contains(request_id):
            old_priority = self._heap.get_score(request_id)
            if old_priority is not None:
                # IndexedHeap distinguishes key increases from decreases.
                if new_priority > old_priority:
                    self._heap.increase_key(request_id, new_priority)
                else:
                    self._heap.decrease_key(request_id, new_priority)
        else:
            self._heap.push(request_id, new_priority)

    def stats(self) -> dict[str, Any]:
        """
        Get scheduler statistics.

        Returns:
            Dictionary with queue size, batch count, total processed,
            and average batch size.
        """
        return {
            "queue_size": len(self._requests),
            "batch_count": self._batch_count,
            "total_processed": self._total_processed,
            "avg_batch_size": (
                self._total_processed / self._batch_count if self._batch_count > 0 else 0.0
            ),
        }

    def clear(self) -> None:
        """Clear all pending requests."""
        self._heap = IndexedHeap(max_heap=True)
        self._requests.clear()

120
llmds/token_lru.py Normal file
View File

@@ -0,0 +1,120 @@
"""Token-aware LRU cache with eviction until budget."""
from collections import OrderedDict
from typing import Callable, Generic, Optional, TypeVar
K = TypeVar("K")
V = TypeVar("V")
class TokenLRU(Generic[K, V]):
    """
    Token-aware LRU cache that evicts items until budget is satisfied.

    Evicts least recently used items until the total token count
    fits within the specified budget.
    """

    def __init__(self, token_budget: int, token_of: Callable[[V], int]):
        """
        Initialize token-aware LRU cache.

        Args:
            token_budget: Maximum total tokens allowed
            token_of: Function to extract token count from a value
        """
        self.budget = token_budget
        self.token_of = token_of
        self._cache: OrderedDict[K, V] = OrderedDict()
        self._total_tokens = 0

    def put(self, key: K, value: V) -> None:
        """
        Add or update an item in the cache.

        Evicts LRU items until budget is satisfied. A value whose token
        count exceeds the budget on its own is rejected (not stored).

        Args:
            key: Cache key
            value: Cache value
        """
        token_count = self.token_of(value)
        # If key exists, remove old value first so its tokens are released.
        if key in self._cache:
            old_value = self._cache[key]
            self._total_tokens -= self.token_of(old_value)
            del self._cache[key]
        # Evict LRU items until we have space
        while self._total_tokens + token_count > self.budget and self._cache:
            self._evict_lru()
        # BUGFIX: only insert (and mark most-recently-used) when the item
        # actually fits. The original unconditionally called move_to_end(key),
        # which raised KeyError for values larger than the whole budget.
        if self._total_tokens + token_count <= self.budget:
            self._cache[key] = value
            self._total_tokens += token_count
            self._cache.move_to_end(key)

    def get(self, key: K) -> Optional[V]:
        """
        Get an item from the cache.

        Moves item to end (most recently used).

        Args:
            key: Cache key

        Returns:
            Cached value or None if not found
        """
        if key not in self._cache:
            return None
        value = self._cache[key]
        self._cache.move_to_end(key)
        return value

    def _evict_lru(self) -> tuple[K, V]:
        """
        Evict the least recently used item.

        Returns:
            Tuple of (key, value) that was evicted

        Raises:
            RuntimeError: If the cache is empty.
        """
        if not self._cache:
            raise RuntimeError("Cannot evict from empty cache")
        key, value = self._cache.popitem(last=False)  # last=False pops LRU end
        self._total_tokens -= self.token_of(value)
        return key, value

    def evict_until_budget(self, target_budget: int) -> list[tuple[K, V]]:
        """
        Evict items until total tokens <= target_budget.

        Args:
            target_budget: Target token budget

        Returns:
            List of (key, value) tuples that were evicted
        """
        evicted = []
        while self._total_tokens > target_budget and self._cache:
            evicted.append(self._evict_lru())
        return evicted

    def total_tokens(self) -> int:
        """Get total tokens currently in cache."""
        return self._total_tokens

    def size(self) -> int:
        """Get number of items in cache."""
        return len(self._cache)

    def clear(self) -> None:
        """Clear all items from cache."""
        self._cache.clear()
        self._total_tokens = 0

149
llmds/tokenizer.py Normal file
View File

@@ -0,0 +1,149 @@
"""Simple BPE-style tokenizer interface."""
from typing import Optional
class Tokenizer:
    """
    Simple tokenizer interface with BPE-style stub implementation.

    Provides a pluggable interface for tokenization that can be
    extended with real tokenizers (e.g., tiktoken, transformers).
    """

    def __init__(self, vocab_size: int = 50257):
        """
        Initialize tokenizer.

        Args:
            vocab_size: Vocabulary size (default GPT-2 like)
        """
        self.vocab_size = vocab_size
        self._word_to_id: dict[str, int] = {}
        self._id_to_word: dict[int, str] = {}
        self._build_simple_vocab()

    def _build_simple_vocab(self) -> None:
        """Build a simple vocabulary for testing."""
        # Special tokens take the lowest IDs, followed by common words.
        special_tokens = ["<pad>", "<unk>", "<bos>", "<eos>"]
        common_words = [
            "the", "a", "an", "and", "or", "but", "in", "on", "at", "to",
            "for", "of", "with", "by", "from", "as", "is", "was", "are",
            "were", "be", "been", "being", "have", "has", "had", "do",
            "does", "did", "will", "would", "should", "could", "may",
            "might", "must", "can", "this", "that", "these", "those",
            "i", "you", "he", "she", "it", "we", "they",
        ]
        vocab = (special_tokens + common_words)[: self.vocab_size]
        self._word_to_id = {token: i for i, token in enumerate(vocab)}
        self._id_to_word = {i: token for i, token in enumerate(vocab)}

    def encode(self, text: str) -> list[int]:
        """
        Encode text to token IDs.

        Args:
            text: Input text

        Returns:
            List of token IDs
        """
        # Whitespace tokenization with a char-level fallback for OOV words.
        unk_id = self._word_to_id.get("<unk>", 0)
        token_ids: list[int] = []
        for word in text.lower().split():
            word_id = self._word_to_id.get(word)
            if word_id is not None:
                token_ids.append(word_id)
                continue
            # Character-level fallback; unknown chars map to <unk>.
            token_ids.extend(
                self._word_to_id.get(f"<char_{char}>", unk_id) for char in word
            )
        return token_ids

    def decode(self, token_ids: list[int]) -> str:
        """
        Decode token IDs to text.

        Args:
            token_ids: List of token IDs

        Returns:
            Decoded text
        """
        # Unknown IDs and special tokens (anything starting with "<") are dropped.
        words = [
            self._id_to_word[token_id]
            for token_id in token_ids
            if token_id in self._id_to_word
            and not self._id_to_word[token_id].startswith("<")
        ]
        return " ".join(words)

    def count_tokens(self, text: str) -> int:
        """
        Count tokens in text.

        Args:
            text: Input text

        Returns:
            Token count
        """
        return len(self.encode(text))

    def get_vocab_size(self) -> int:
        """Get vocabulary size."""
        return self.vocab_size

250
llmds/utils.py Normal file
View File

@@ -0,0 +1,250 @@
"""Utility functions."""
import time
from contextlib import contextmanager
from typing import Any, Iterator, Literal, Optional
import numpy as np
try:
import psutil
_PSUTIL_AVAILABLE = True
except ImportError:
_PSUTIL_AVAILABLE = False
psutil = None # type: ignore
try:
from scipy import stats
HAS_SCIPY = True
except ImportError:
HAS_SCIPY = False
class Timer:
    """Simple timer context manager measuring wall time via perf_counter."""

    def __init__(self) -> None:
        # start is None until the context is entered; elapsed stays 0.0
        # until the context exits.
        self.start: float | None = None
        self.elapsed: float = 0.0

    def __enter__(self) -> "Timer":
        self.start = time.perf_counter()
        return self

    def __exit__(self, *args: Any) -> Literal[False]:
        end = time.perf_counter()
        if self.start is not None:
            self.elapsed = end - self.start
        return False  # never suppress exceptions
class MemoryProfiler:
    """
    Memory profiler for measuring peak RSS (Resident Set Size).

    Tracks memory usage during benchmark execution and reports peak RSS.
    """

    def __init__(self) -> None:
        """Initialize memory profiler.

        Raises:
            ImportError: If psutil is not installed.
        """
        if not _PSUTIL_AVAILABLE:
            raise ImportError("psutil is required for memory profiling. Install with: pip install psutil")
        self.process = psutil.Process()
        self.initial_rss: Optional[int] = None
        self.peak_rss: int = 0
        self.current_rss: int = 0

    def start(self) -> None:
        """Start memory profiling by recording the current RSS as baseline."""
        rss = self.process.memory_info().rss
        self.initial_rss = rss
        self.peak_rss = rss
        self.current_rss = rss

    def sample(self) -> int:
        """
        Sample current RSS and update peak.

        Returns:
            Current RSS in bytes
        """
        if not _PSUTIL_AVAILABLE:
            return 0
        rss = self.process.memory_info().rss
        self.current_rss = rss
        self.peak_rss = max(self.peak_rss, rss)
        return rss

    def get_peak_rss_mb(self) -> float:
        """
        Get peak RSS in megabytes.

        Returns:
            Peak RSS in MB
        """
        return self.peak_rss / (1024 * 1024)

    def get_peak_rss_bytes(self) -> int:
        """
        Get peak RSS in bytes.

        Returns:
            Peak RSS in bytes
        """
        return self.peak_rss

    def get_current_rss_mb(self) -> float:
        """
        Get current RSS in megabytes.

        Returns:
            Current RSS in MB
        """
        return self.current_rss / (1024 * 1024)

    def get_memory_delta_mb(self) -> float:
        """
        Get memory delta from initial RSS in megabytes.

        Returns:
            Memory delta in MB (peak - initial); 0.0 if start() was never called.
        """
        if self.initial_rss is None:
            return 0.0
        return (self.peak_rss - self.initial_rss) / (1024 * 1024)
@contextmanager
def memory_profiler() -> Iterator[MemoryProfiler]:
    """
    Context manager for memory profiling.

    Usage:
        with memory_profiler() as profiler:
            # Your code here
            profiler.sample()  # Optional: sample at specific points
        peak_rss_mb = profiler.get_peak_rss_mb()

    Yields:
        MemoryProfiler instance
    """
    if not _PSUTIL_AVAILABLE:
        # psutil missing: yield a no-op stand-in so callers still work.
        class DummyProfiler:
            def start(self) -> None:
                pass

            def sample(self) -> int:
                return 0

            def get_peak_rss_mb(self) -> float:
                return 0.0

            def get_peak_rss_bytes(self) -> int:
                return 0

            def get_current_rss_mb(self) -> float:
                return 0.0

            def get_memory_delta_mb(self) -> float:
                return 0.0

        dummy = DummyProfiler()  # type: ignore
        dummy.start()
        yield dummy
        return
    profiler = MemoryProfiler()
    profiler.start()
    yield profiler
    # Final sample to capture any last-minute allocations (skipped, as in
    # the original, if the with-body raised).
    profiler.sample()
def compute_percentiles(values: list[float]) -> dict[str, float]:
    """
    Compute P50, P95, P99 percentiles from a list of values.

    Args:
        values: List of numeric values

    Returns:
        Dictionary with p50, p95, p99 keys (all 0.0 for empty input)
    """
    if not values:
        return {"p50": 0.0, "p95": 0.0, "p99": 0.0}
    ordered = sorted(values)
    n = len(ordered)

    def tail(frac: float) -> float:
        # Nearest-rank style index; a single-element list uses index 0.
        return ordered[int(n * frac)] if n > 1 else ordered[0]

    return {"p50": ordered[n // 2], "p95": tail(0.95), "p99": tail(0.99)}
def calculate_statistics(values: list[float], confidence_level: float = 0.95) -> dict[str, Any]:
"""
Calculate statistical summary for a list of values.
Args:
values: List of numeric values
confidence_level: Confidence level (e.g., 0.95 for 95% CI)
Returns:
Dictionary with mean, std, min, max, percentiles, and confidence intervals
"""
if not values:
return {
"mean": 0.0,
"std": 0.0,
"min": 0.0,
"max": 0.0,
"p50": 0.0,
"p95": 0.0,
"p99": 0.0,
"ci_lower": 0.0,
"ci_upper": 0.0,
"cv": 0.0, # Coefficient of variation
}
values_array = np.array(values)
mean = float(np.mean(values_array))
std = float(np.std(values_array, ddof=1)) # Sample std dev (ddof=1)
min_val = float(np.min(values_array))
max_val = float(np.max(values_array))
# Percentiles
p50 = float(np.percentile(values_array, 50))
p95 = float(np.percentile(values_array, 95))
p99 = float(np.percentile(values_array, 99))
# Confidence interval (t-distribution for small samples)
n = len(values)
if n > 1:
alpha = 1 - confidence_level
if HAS_SCIPY:
# Use t-distribution for small samples
t_critical = stats.t.ppf(1 - alpha / 2, df=n - 1)
margin = t_critical * (std / np.sqrt(n))
else:
# Fallback: use normal distribution approximation (z-score)
# For 95% CI: z = 1.96, for 90% CI: z = 1.645
z_scores = {0.90: 1.645, 0.95: 1.96, 0.99: 2.576}
z_critical = z_scores.get(confidence_level, 1.96)
margin = z_critical * (std / np.sqrt(n))
ci_lower = mean - margin
ci_upper = mean + margin
else:
ci_lower = mean
ci_upper = mean
# Coefficient of variation (relative standard deviation)
cv = (std / mean * 100) if mean > 0 else 0.0
return {
"mean": mean,
"std": std,
"min": min_val,
"max": max_val,
"p50": p50,
"p95": p95,
"p99": p99,
"ci_lower": ci_lower,
"ci_upper": ci_upper,
"cv": cv, # Coefficient of variation (%)
"count": n,
}