Files
llm-rag-ds-optimizer/llmds/chunking.py

73 lines
2.0 KiB
Python

"""Text chunking utilities for RAG."""
from typing import Any, Iterator, Optional
def chunk_text(
    text: str,
    chunk_size: int = 512,
    overlap: int = 50,
    tokenizer: Optional[Any] = None,
) -> Iterator[str]:
    """
    Chunk text into overlapping segments.

    Args:
        text: Input text to chunk.
        chunk_size: Target chunk size in tokens/characters. Must be positive.
        overlap: Overlap between consecutive chunks. Must be non-negative
            and strictly smaller than chunk_size.
        tokenizer: Optional tokenizer with encode/decode methods
            (if None, uses character-based chunking).

    Yields:
        Text chunks.

    Raises:
        ValueError: If chunk_size <= 0, overlap < 0, or overlap >= chunk_size.
            (Previously overlap == chunk_size raised an opaque error from
            range(), and overlap > chunk_size silently yielded nothing.)
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap < 0:
        raise ValueError(f"overlap must be non-negative, got {overlap}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    step = chunk_size - overlap
    if tokenizer is not None:
        # Token-based chunking: encode once, slice token windows, decode each
        # window back to text.
        tokens = tokenizer.encode(text)
        for i in range(0, len(tokens), step):
            yield tokenizer.decode(tokens[i:i + chunk_size])
            # Once a window reaches the end, stop: any further window would be
            # a pure suffix of this chunk (redundant content).
            if i + chunk_size >= len(tokens):
                break
    else:
        # Character-based chunking (simple fallback).
        for i in range(0, len(text), step):
            yield text[i:i + chunk_size]
            if i + chunk_size >= len(text):
                break
def chunk_documents(
    documents: Iterator[dict[str, Any]],
    chunk_size: int = 512,
    overlap: int = 50,
    tokenizer: Optional[Any] = None,
) -> Iterator[dict[str, Any]]:
    """
    Split each document into smaller chunk records.

    Args:
        documents: Iterator of document dicts with 'id', 'text', and an
            optional 'meta' mapping.
        chunk_size: Target chunk size, forwarded to chunk_text.
        overlap: Overlap between chunks, forwarded to chunk_text.
        tokenizer: Optional tokenizer, forwarded to chunk_text.

    Yields:
        One dict per chunk with 'id' ("<doc_id>_chunk_<idx>"), 'text', and
        'meta' (the source meta extended with 'doc_id', 'chunk_idx', and
        'total_chunks').
    """
    for document in documents:
        source_id = document["id"]
        body = document["text"]
        base_meta = document.get("meta", {})
        # Materialize all chunks up front so each record can report the
        # document's total chunk count.
        pieces = list(chunk_text(body, chunk_size, overlap, tokenizer))
        total = len(pieces)
        for idx, piece in enumerate(pieces):
            chunk_meta = {
                **base_meta,
                "doc_id": source_id,
                "chunk_idx": idx,
                "total_chunks": total,
            }
            yield {
                "id": f"{source_id}_chunk_{idx}",
                "text": piece,
                "meta": chunk_meta,
            }