Files

111 lines
3.3 KiB
Python

"""MS MARCO dataset loader."""
import json
import os
import subprocess
import tempfile
from pathlib import Path
from typing import Iterator
from urllib.request import urlretrieve
def download_msmarco(output_dir: Path, split: str = "passage") -> Path:
"""
Download MS MARCO dataset.
Args:
output_dir: Directory to save files
split: Dataset split ('passage' or 'doc')
Returns:
Path to downloaded corpus file
"""
output_dir.mkdir(parents=True, exist_ok=True)
base_url = "https://msmarco.blob.core.windows.net/msmarcoranking"
if split == "passage":
collection_url = f"{base_url}/collection.tar.gz"
queries_url = f"{base_url}/queries.tar.gz"
else:
collection_url = f"{base_url}/docranking/collection.tar.gz"
queries_url = f"{base_url}/docranking/queries.tar.gz"
corpus_file = output_dir / "corpus.jsonl"
if corpus_file.exists():
print(f"MS MARCO corpus already exists at {corpus_file}")
return corpus_file
# Download and extract (simplified - in production, use official downloader)
print(f"Downloading MS MARCO {split} collection...")
print("Note: For production use, download from https://microsoft.github.io/msmarco/")
print("This is a placeholder implementation.")
# Placeholder: in real implementation, download and extract tarball
# For now, create a small sample
with open(corpus_file, "w", encoding="utf-8") as f:
for i in range(1000): # Sample
doc = {
"id": f"msmarco_{i}",
"text": f"MS MARCO passage {i} content. This is a placeholder.",
"meta": {"split": split}
}
f.write(json.dumps(doc, ensure_ascii=False) + "\n")
print(f"Created sample corpus at {corpus_file}")
return corpus_file
def load_msmarco(corpus_file: Path) -> Iterator[dict]:
    """
    Stream documents out of an MS MARCO corpus stored as JSONL.

    The file is read lazily, one line at a time; lines that contain only
    whitespace are skipped and every other line is parsed as JSON.

    Args:
        corpus_file: Path to corpus JSONL file

    Yields:
        One parsed JSON object per non-blank line (the corpus writers in
        this module emit dictionaries with 'id', 'text', 'meta').
    """
    with open(corpus_file, "r", encoding="utf-8") as handle:
        yield from (json.loads(raw) for raw in handle if raw.strip())
def normalize_msmarco(
collection_file: Path,
output_file: Path,
limit: int | None = None,
) -> None:
"""
Normalize MS MARCO collection to JSONL format.
Args:
collection_file: Path to MS MARCO collection TSV
output_file: Output JSONL path
limit: Optional limit on number of documents
"""
output_file.parent.mkdir(parents=True, exist_ok=True)
count = 0
with open(collection_file, "r", encoding="utf-8") as infile, \
open(output_file, "w", encoding="utf-8") as outfile:
for line in infile:
if limit and count >= limit:
break
parts = line.strip().split("\t", 2)
if len(parts) >= 2:
doc_id, text = parts[0], parts[1]
doc = {
"id": doc_id,
"text": text,
"meta": {"source": "msmarco"}
}
outfile.write(json.dumps(doc, ensure_ascii=False) + "\n")
count += 1
print(f"Normalized {count} documents to {output_file}")