Initial commit: LLM-DS optimizer framework with data files excluded
This commit is contained in:
110
llmds/data_sources/msmarco.py
Normal file
110
llmds/data_sources/msmarco.py
Normal file
@@ -0,0 +1,110 @@
|
||||
"""MS MARCO dataset loader."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Iterator
|
||||
from urllib.request import urlretrieve
|
||||
|
||||
|
||||
def download_msmarco(output_dir: Path, split: str = "passage") -> Path:
    """
    Return the path to an MS MARCO corpus file, creating a placeholder if needed.

    No network download is performed. If ``corpus.jsonl`` already exists in
    ``output_dir`` it is returned untouched; otherwise a synthetic sample of
    1000 documents is written there.

    Args:
        output_dir: Directory to save files (created, with parents, if missing)
        split: Dataset split ('passage' or 'doc'); stored in the ``meta`` field
            of every sample document

    Returns:
        Path to the corpus file (``output_dir / "corpus.jsonl"``)
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    corpus_file = output_dir / "corpus.jsonl"

    if corpus_file.exists():
        print(f"MS MARCO corpus already exists at {corpus_file}")
        return corpus_file

    # Download and extract (simplified - in production, use official downloader)
    print(f"Downloading MS MARCO {split} collection...")
    print("Note: For production use, download from https://microsoft.github.io/msmarco/")
    print("This is a placeholder implementation.")

    # TODO: replace the sample below with a real download + tarball extraction.
    # Candidate archive locations (unverified - confirm against the official
    # site) under https://msmarco.blob.core.windows.net/msmarcoranking:
    #   split == "passage": collection.tar.gz, queries.tar.gz
    #   any other split:    docranking/collection.tar.gz, docranking/queries.tar.gz

    # Write to a sibling temp file and rename it into place afterwards. The
    # existence of corpus.jsonl is what marks the corpus as ready (see the
    # early return above), so it must never be visible half-written.
    partial_file = corpus_file.with_name(corpus_file.name + ".partial")
    try:
        with open(partial_file, "w", encoding="utf-8") as f:
            for i in range(1000):  # Sample
                doc = {
                    "id": f"msmarco_{i}",
                    "text": f"MS MARCO passage {i} content. This is a placeholder.",
                    "meta": {"split": split},
                }
                f.write(json.dumps(doc, ensure_ascii=False) + "\n")
        partial_file.replace(corpus_file)
    finally:
        # After a successful rename there is nothing left to remove; on
        # failure this cleans up the incomplete temp file.
        partial_file.unlink(missing_ok=True)

    print(f"Created sample corpus at {corpus_file}")
    return corpus_file
|
||||
|
||||
|
||||
def load_msmarco(corpus_file: Path) -> Iterator[dict]:
    """
    Stream documents from an MS MARCO corpus stored as JSON Lines.

    The file is read lazily, one line at a time. Lines that are empty or
    contain only whitespace are skipped; every other line is parsed as JSON.

    Args:
        corpus_file: Path to corpus JSONL file

    Yields:
        Document dictionaries with 'id', 'text', 'meta'
    """
    with open(corpus_file, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            # Skip blank separator lines instead of handing them to the parser.
            if not raw_line.strip():
                continue
            yield json.loads(raw_line)
|
||||
|
||||
|
||||
def normalize_msmarco(
|
||||
collection_file: Path,
|
||||
output_file: Path,
|
||||
limit: int | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Normalize MS MARCO collection to JSONL format.
|
||||
|
||||
Args:
|
||||
collection_file: Path to MS MARCO collection TSV
|
||||
output_file: Output JSONL path
|
||||
limit: Optional limit on number of documents
|
||||
"""
|
||||
output_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
count = 0
|
||||
with open(collection_file, "r", encoding="utf-8") as infile, \
|
||||
open(output_file, "w", encoding="utf-8") as outfile:
|
||||
for line in infile:
|
||||
if limit and count >= limit:
|
||||
break
|
||||
|
||||
parts = line.strip().split("\t", 2)
|
||||
if len(parts) >= 2:
|
||||
doc_id, text = parts[0], parts[1]
|
||||
doc = {
|
||||
"id": doc_id,
|
||||
"text": text,
|
||||
"meta": {"source": "msmarco"}
|
||||
}
|
||||
outfile.write(json.dumps(doc, ensure_ascii=False) + "\n")
|
||||
count += 1
|
||||
|
||||
print(f"Normalized {count} documents to {output_file}")
|
||||
|
||||
Reference in New Issue
Block a user