73 lines
2.0 KiB
Python
73 lines
2.0 KiB
Python
"""Text chunking utilities for RAG."""
|
|
|
|
from typing import Any, Iterator, Optional
|
|
|
|
|
|
def chunk_text(
    text: str,
    chunk_size: int = 512,
    overlap: int = 50,
    tokenizer: Optional[Any] = None,
) -> Iterator[str]:
    """
    Chunk text into overlapping segments.

    Consecutive chunks start ``chunk_size - overlap`` units apart, so each
    chunk repeats the last ``overlap`` units of the previous one. The final
    chunk may be shorter than ``chunk_size`` and, when the remaining tail is
    no longer than ``overlap``, consists only of material already present at
    the end of the previous chunk.

    Args:
        text: Input text to chunk
        chunk_size: Maximum chunk size, counted in tokens when a tokenizer
            is given and in characters otherwise. Must be positive.
        overlap: Number of tokens/characters shared by consecutive chunks.
            Must satisfy ``0 <= overlap < chunk_size``.
        tokenizer: Optional object providing ``encode(text)`` (returning a
            sliceable, sized sequence) and ``decode(tokens)``; if None,
            character-based chunking is used.

    Yields:
        Text chunks, in order of appearance. Nothing is yielded for empty
        input.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``. Because this is a generator, the
            error surfaces when iteration starts, not at call time.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # A stride of zero makes range() fail with an unrelated message, a
    # negative stride silently yields nothing, and a negative overlap makes
    # the stride exceed the window so text between chunks is dropped.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    step = chunk_size - overlap

    if tokenizer is not None:
        # Token-based chunking
        tokens = tokenizer.encode(text)
        for start in range(0, len(tokens), step):
            yield tokenizer.decode(tokens[start:start + chunk_size])
    else:
        # Character-based chunking (simple fallback)
        for start in range(0, len(text), step):
            yield text[start:start + chunk_size]
|
|
|
|
|
|
def chunk_documents(
    documents: Iterator[dict[str, Any]],
    chunk_size: int = 512,
    overlap: int = 50,
    tokenizer: Optional[Any] = None,
) -> Iterator[dict[str, Any]]:
    """
    Split each document into chunks and emit one record per chunk.

    Args:
        documents: Iterator of document dicts; 'id' and 'text' are required
            keys, 'meta' is optional and defaults to an empty dict
        chunk_size: Target chunk size, forwarded to chunk_text
        overlap: Overlap between chunks, forwarded to chunk_text
        tokenizer: Optional tokenizer, forwarded to chunk_text

    Yields:
        Chunk dictionaries with 'id' ("<doc id>_chunk_<index>"), 'text' and
        'meta'. The 'meta' dict is a copy of the document's meta extended
        with 'doc_id', 'chunk_idx' and 'total_chunks'.
    """
    for document in documents:
        source_id = document["id"]
        body = document["text"]
        base_meta = document.get("meta", {})

        # All chunks are materialised first because every record carries the
        # total chunk count for its document.
        segments = list(chunk_text(body, chunk_size, overlap, tokenizer))
        segment_count = len(segments)

        for position, segment in enumerate(segments):
            # Added keys come after the unpacked document meta, so they win
            # over any same-named keys already present there.
            record_meta = {
                **base_meta,
                "doc_id": source_id,
                "chunk_idx": position,
                "total_chunks": segment_count,
            }
            yield {
                "id": f"{source_id}_chunk_{position}",
                "text": segment,
                "meta": record_meta,
            }
|
|
|