Initial commit: LLM-DS optimizer framework with data files excluded

2025-11-06 22:20:11 -05:00
commit f83fe475df
52 changed files with 10666 additions and 0 deletions
--- a/llmds/data_sources/msmarco.py
+++ b/llmds/data_sources/msmarco.py
@@ -0,0 +1,110 @@
+"""MS MARCO dataset loader."""
+
+import json
+import os
+import subprocess
+import tempfile
+from pathlib import Path
+from typing import Iterator
+from urllib.request import urlretrieve
+
+
+def download_msmarco(output_dir: Path, split: str = "passage") -> Path:
+    """
+    Create a placeholder MS MARCO corpus (no network access is performed).
+
+    The real download is not implemented yet; a 1000-document synthetic
+    sample is written instead. The corpus is staged in a temporary sibling
+    file and renamed into place, so an interrupted or failed run never leaves
+    a partial ``corpus.jsonl`` that later calls would mistake for a cache hit.
+
+    Args:
+        output_dir: Directory to save files (created if missing)
+        split: Dataset split ('passage' or 'doc'); stored in each doc's meta
+
+    Returns:
+        Path to ``corpus.jsonl`` inside ``output_dir``
+    """
+    output_dir.mkdir(parents=True, exist_ok=True)
+    corpus_file = output_dir / "corpus.jsonl"
+
+    if corpus_file.exists():
+        print(f"MS MARCO corpus already exists at {corpus_file}")
+        return corpus_file
+
+    # Download and extract (simplified - in production, use official downloader)
+    print(f"Downloading MS MARCO {split} collection...")
+    print("Note: For production use, download from https://microsoft.github.io/msmarco/")
+    print("This is a placeholder implementation.")
+
+    # TODO: fetch and extract the real collection tarball instead of the sample.
+    staging_file = output_dir / "corpus.jsonl.tmp"
+    try:
+        with open(staging_file, "w", encoding="utf-8") as f:
+            for i in range(1000):  # Sample
+                doc = {
+                    "id": f"msmarco_{i}",
+                    "text": f"MS MARCO passage {i} content. This is a placeholder.",
+                    "meta": {"split": split},
+                }
+                f.write(json.dumps(doc, ensure_ascii=False) + "\n")
+        staging_file.replace(corpus_file)  # publish only once fully written
+    except BaseException:
+        staging_file.unlink(missing_ok=True)  # drop the partial staging file
+        raise
+
+    print(f"Created sample corpus at {corpus_file}")
+    return corpus_file
+
+
+def load_msmarco(corpus_file: Path) -> Iterator[dict]:
+    """
+    Stream documents from a JSONL corpus, one parsed object per line.
+
+    Args:
+        corpus_file: Path to corpus JSONL file
+
+    Yields:
+        One decoded JSON value per non-blank line, in file order
+    """
+    with open(corpus_file, encoding="utf-8") as handle:
+        # Whitespace-only lines are skipped rather than parsed.
+        records = (json.loads(raw) for raw in handle if raw.strip())
+        yield from records
+
+
+def normalize_msmarco(
+    collection_file: Path,
+    output_file: Path,
+    limit: int | None = None,
+) -> None:
+    """
+    Normalize MS MARCO collection to JSONL format.
+
+    Each input line is stripped and split on tabs; the first two fields become
+    ``id`` and ``text``. Lines with fewer than two fields are skipped and do
+    not count toward ``limit``.
+
+    Args:
+        collection_file: Path to MS MARCO collection TSV
+        output_file: Output JSONL path (parent directories are created)
+        limit: Max documents to write (``None`` = unlimited, ``0`` = none)
+    """
+    output_file.parent.mkdir(parents=True, exist_ok=True)
+
+    count = 0
+    with open(collection_file, "r", encoding="utf-8") as infile, \
+         open(output_file, "w", encoding="utf-8") as outfile:
+        for line in infile:
+            # Compare against None so that limit=0 is honored, not ignored.
+            if limit is not None and count >= limit:
+                break
+            parts = line.strip().split("\t", 2)
+            if len(parts) >= 2:
+                doc_id, text = parts[0], parts[1]
+                doc = {"id": doc_id, "text": text, "meta": {"source": "msmarco"}}
+                outfile.write(json.dumps(doc, ensure_ascii=False) + "\n")
+                count += 1
+
+    print(f"Normalized {count} documents to {output_file}")
+