Initial commit: LLM-DS optimizer framework with data files excluded
This commit is contained in:
123
llmds/data_sources/commoncrawl.py
Normal file
123
llmds/data_sources/commoncrawl.py
Normal file
@@ -0,0 +1,123 @@
|
||||
"""Common Crawl loader."""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Iterator
|
||||
|
||||
|
||||
def download_commoncrawl(output_dir: Path, cc_month: str | None = None, limit: int | None = None) -> Path:
|
||||
"""
|
||||
Download Common Crawl data.
|
||||
|
||||
Args:
|
||||
output_dir: Directory to save corpus
|
||||
cc_month: Common Crawl month (e.g., 'CC-MAIN-2025-14')
|
||||
limit: Optional limit on documents
|
||||
|
||||
Returns:
|
||||
Path to corpus JSONL file
|
||||
"""
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
corpus_file = output_dir / "web_pages.jsonl"
|
||||
|
||||
if corpus_file.exists():
|
||||
print(f"Common Crawl corpus already exists at {corpus_file}")
|
||||
return corpus_file
|
||||
|
||||
print("Common Crawl requires cc-downloader tool.")
|
||||
print("Install: pip install common-crawl-download")
|
||||
print("Usage: See https://github.com/commoncrawl/cc-downloader")
|
||||
print("Be respectful of bandwidth when downloading.")
|
||||
|
||||
# Placeholder
|
||||
print("Creating placeholder corpus...")
|
||||
with open(corpus_file, "w", encoding="utf-8") as f:
|
||||
size = limit or 10000
|
||||
for i in range(size):
|
||||
doc = {
|
||||
"id": f"cc_{i}",
|
||||
"text": f"Common Crawl web page {i} content. This is a placeholder.",
|
||||
"meta": {"url": f"https://example.com/page{i}", "cc_month": cc_month or "CC-MAIN-2025-14"}
|
||||
}
|
||||
f.write(json.dumps(doc, ensure_ascii=False) + "\n")
|
||||
|
||||
print(f"Created placeholder corpus with {size} documents")
|
||||
return corpus_file
|
||||
|
||||
|
||||
def process_commoncrawl_warc(warc_file: Path, output_file: Path, limit: int | None = None) -> None:
|
||||
"""
|
||||
Process Common Crawl WARC file to JSONL.
|
||||
|
||||
Args:
|
||||
warc_file: Path to WARC file
|
||||
output_file: Output JSONL path
|
||||
limit: Optional limit on documents
|
||||
"""
|
||||
output_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
try:
|
||||
from warcio.archiveiterator import ArchiveIterator
|
||||
HAS_WARC = True
|
||||
except ImportError:
|
||||
HAS_WARC = False
|
||||
print("Warning: warcio not installed. Install with: pip install warcio")
|
||||
|
||||
if not HAS_WARC:
|
||||
print("Creating placeholder corpus...")
|
||||
with open(output_file, "w", encoding="utf-8") as f:
|
||||
for i in range(limit or 10000):
|
||||
doc = {
|
||||
"id": f"cc_{i}",
|
||||
"text": f"Web page {i} content.",
|
||||
"meta": {"url": f"https://example.com/page{i}"}
|
||||
}
|
||||
f.write(json.dumps(doc, ensure_ascii=False) + "\n")
|
||||
return
|
||||
|
||||
count = 0
|
||||
with open(warc_file, "rb") as infile, \
|
||||
open(output_file, "w", encoding="utf-8") as outfile:
|
||||
for record in ArchiveIterator(infile):
|
||||
if limit and count >= limit:
|
||||
break
|
||||
|
||||
if record.rec_type == "response" and record.http_headers.get_header("Content-Type", "").startswith("text/html"):
|
||||
# Extract text (simplified - in production use beautifulsoup)
|
||||
text = record.read_stream().decode("utf-8", errors="ignore")
|
||||
|
||||
# Simple HTML stripping (in production use html2text or similar)
|
||||
import re
|
||||
text = re.sub(r"<[^>]+>", "", text)
|
||||
text = " ".join(text.split())
|
||||
|
||||
if len(text) > 100: # Minimum length
|
||||
doc = {
|
||||
"id": record.rec_headers.get_header("WARC-Record-ID", f"cc_{count}"),
|
||||
"text": text[:10000], # Limit text length
|
||||
"meta": {"url": record.rec_headers.get_header("WARC-Target-URI", "")}
|
||||
}
|
||||
outfile.write(json.dumps(doc, ensure_ascii=False) + "\n")
|
||||
count += 1
|
||||
|
||||
if count % 1000 == 0:
|
||||
print(f"Processed {count} pages...")
|
||||
|
||||
print(f"Processed {count} Common Crawl pages to {output_file}")
|
||||
|
||||
|
||||
def load_commoncrawl(corpus_file: Path) -> Iterator[dict]:
    """
    Load Common Crawl corpus from JSONL file.

    Args:
        corpus_file: Path to corpus JSONL file

    Yields:
        Document dictionaries with 'id', 'text', 'meta'
    """
    with open(corpus_file, "r", encoding="utf-8") as handle:
        # Strip each raw line once; skip blanks, parse everything else.
        payloads = (raw.strip() for raw in handle)
        for payload in payloads:
            if payload:
                yield json.loads(payload)
|
||||
|
||||
Reference in New Issue
Block a user