Initial commit: SheepOp LLM - Transformer-based language model implementation
- Complete transformer implementation from scratch - Training pipeline with gradient accumulation and mixed precision - Optimized inference with KV caching - Multi-format data processing (PDFs, images, code, text) - Comprehensive documentation - Apache 2.0 license - Example training plots included in docs/images/
This commit is contained in:
428
download_large_data.py
Executable file
428
download_large_data.py
Executable file
@@ -0,0 +1,428 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Download large datasets for training the SheepOp LLM.
|
||||
Supports Amazon Reviews, WikiText, OpenWebText, BookCorpus, and more.
|
||||
"""
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def _write_reviews(dataset, output: str, fields: tuple = ("content", "text"), min_len: int = 20) -> int:
    """Write review text from *dataset* items to *output*, one review per line.

    For each item, the keys in *fields* are tried in order and the first
    non-empty value is used; entries of *min_len* characters or fewer are
    skipped.  Progress is printed every 50k reviews.

    Returns:
        Number of reviews written.
    """
    count = 0
    with open(output, "w", encoding="utf-8") as f:
        for item in dataset:
            review = ""
            for field in fields:
                review = item.get(field, "").strip()
                if review:
                    break
            if review and len(review) > min_len:
                f.write(review + "\n")
                count += 1
                if count % 50000 == 0:
                    print(f"   ✓ Downloaded {count:,} reviews...")
    return count


def download_amazon_reviews(output: str = "data/amazon_reviews.txt", limit: int = 500000, category: str = "Video_Games_v1_00"):
    """Download Amazon Product Reviews dataset.

    Tries the ``amazon_polarity`` dataset first and falls back to IMDB
    reviews if that fails (the original per-category Amazon dump is not
    reliably available on the Hub).

    Args:
        output: Output file path
        limit: Maximum number of reviews to download
        category: Product category (Video_Games_v1_00, Books_v1_00, etc.)
            NOTE(review): only echoed in the progress message — the
            fallback datasets do not support per-category selection.

    Returns:
        True on success, False on failure.
    """
    try:
        from datasets import load_dataset
    except ImportError:
        print("Error: 'datasets' library not installed.")
        print("Install with: pip install datasets")
        return False

    Path(output).parent.mkdir(parents=True, exist_ok=True)

    print(f"📥 Downloading Amazon Product Reviews (category: {category}, limit: {limit})...")
    print("   This may take several minutes depending on your connection...")

    try:
        # Try different dataset names/approaches
        print("   Attempting to download from alternative source...")

        # Method 1: amazon_polarity dataset (smaller but works)
        try:
            print("   Trying amazon_polarity dataset...")
            dataset = load_dataset("amazon_polarity", split=f"train[:{limit}]")
            count = _write_reviews(dataset, output, fields=("content", "text"))
            print(f"✅ Successfully saved {count:,} reviews to {output}")
            return True

        except Exception as e1:
            print(f"   amazon_polarity failed: {e1}")

            # Method 2: Use IMDB reviews (similar structure)
            try:
                print("   Trying IMDB reviews as alternative...")
                dataset = load_dataset("imdb", split=f"train[:{limit}]")
                count = _write_reviews(dataset, output, fields=("text",))
                print(f"✅ Successfully saved {count:,} reviews to {output}")
                print("   Note: Using IMDB reviews instead of Amazon reviews")
                return True

            except Exception as e2:
                print(f"   IMDB also failed: {e2}")
                # Chain the cause so the underlying failure is preserved in
                # the traceback (previously a bare generic Exception).
                raise RuntimeError(
                    "Both Amazon and IMDB datasets failed. Try using --alternative flag with a different dataset."
                ) from e2

    except Exception as e:
        print(f"❌ Error downloading reviews: {e}")
        print("\n💡 Alternative options:")
        print("   1. Use WikiText instead: python3 download_large_data.py wiki")
        print("   2. Use OpenWebText: python3 download_large_data.py openwebtext --limit 100000")
        print("   3. Try downloading from HuggingFace Hub manually")
        return False
|
||||
|
||||
|
||||
def download_wikitext(output: str = "data/wikitext.txt", version: str = "103"):
    """Fetch the WikiText corpus from HuggingFace and save it one sentence
    per line.

    Args:
        output: Destination text file.
        version: WikiText release to use ('2' or '103').

    Returns:
        True if the file was written, False otherwise.
    """
    try:
        from datasets import load_dataset
    except ImportError:
        print("Error: 'datasets' library not installed.")
        print("Install with: pip install datasets")
        return False

    Path(output).parent.mkdir(parents=True, exist_ok=True)

    print(f"📥 Downloading WikiText-{version}...")
    print("   This may take several minutes...")

    try:
        rows = load_dataset("wikitext", f"wikitext-{version}-v1", split="train")

        written = 0
        with open(output, "w", encoding="utf-8") as sink:
            for row in rows:
                paragraph = row.get("text", "").strip()
                # Skip blank rows, short fragments, and "= Heading =" lines.
                if not paragraph or len(paragraph) <= 20 or paragraph.startswith("="):
                    continue
                # Crude sentence split on '.'; keep only fragments long
                # enough to be useful training text.
                for fragment in paragraph.split('.'):
                    fragment = fragment.strip()
                    if len(fragment) > 20:
                        sink.write(fragment + ".\n")
                        written += 1
                        if written % 10000 == 0:
                            print(f"   ✓ Processed {written:,} sentences...")

        print(f"✅ Successfully saved {written:,} sentences to {output}")
        return True

    except Exception as e:
        print(f"❌ Error downloading WikiText: {e}")
        return False
|
||||
|
||||
|
||||
def download_openwebtext(output: str = "data/openwebtext.txt", limit: int = 100000):
    """Fetch a slice of the OpenWebText web-text corpus and store it one
    sentence per line.

    Args:
        output: Destination text file.
        limit: Maximum number of documents to pull from the corpus.

    Returns:
        True if the file was written, False otherwise.
    """
    try:
        from datasets import load_dataset
    except ImportError:
        print("Error: 'datasets' library not installed.")
        print("Install with: pip install datasets")
        return False

    Path(output).parent.mkdir(parents=True, exist_ok=True)

    print(f"📥 Downloading OpenWebText (limit: {limit:,})...")
    print("   This may take a while - OpenWebText is very large...")

    try:
        docs = load_dataset("openwebtext", split=f"train[:{limit}]")

        total = 0
        with open(output, "w", encoding="utf-8") as sink:
            for doc in docs:
                body = doc.get("text", "").strip()
                if not body:
                    continue
                # Naive '.' split; drop fragments too short to be useful.
                for piece in body.split('.'):
                    piece = piece.strip()
                    if len(piece) > 20:
                        sink.write(piece + ".\n")
                        total += 1
                        if total % 10000 == 0:
                            print(f"   ✓ Processed {total:,} sentences...")

        print(f"✅ Successfully saved {total:,} sentences to {output}")
        return True

    except Exception as e:
        print(f"❌ Error downloading OpenWebText: {e}")
        return False
|
||||
|
||||
|
||||
def download_bookcorpus(output: str = "data/bookcorpus.txt", limit: int = 100000):
    """Fetch a slice of the BookCorpus dataset and store it one sentence
    per line.

    Args:
        output: Destination text file.
        limit: Maximum number of entries to pull from the corpus.

    Returns:
        True if the file was written, False otherwise.
    """
    try:
        from datasets import load_dataset
    except ImportError:
        print("Error: 'datasets' library not installed.")
        print("Install with: pip install datasets")
        return False

    Path(output).parent.mkdir(parents=True, exist_ok=True)

    print(f"📥 Downloading BookCorpus (limit: {limit:,} books)...")
    print("   This may take a while...")

    try:
        entries = load_dataset("bookcorpus", split=f"train[:{limit}]")

        n_sentences = 0
        with open(output, "w", encoding="utf-8") as sink:
            for entry in entries:
                passage = entry.get("text", "").strip()
                if not passage:
                    continue
                # Split passages on '.' and keep only substantial fragments.
                for sentence in passage.split('.'):
                    sentence = sentence.strip()
                    if len(sentence) > 20:
                        sink.write(sentence + ".\n")
                        n_sentences += 1
                        if n_sentences % 10000 == 0:
                            print(f"   ✓ Processed {n_sentences:,} sentences...")

        print(f"✅ Successfully saved {n_sentences:,} sentences to {output}")
        return True

    except Exception as e:
        print(f"❌ Error downloading BookCorpus: {e}")
        return False
|
||||
|
||||
|
||||
def download_wikitext_direct(output: str = "data/wikitext_direct.txt"):
    """Download WikiText-103 directly from URL (no HuggingFace required).

    Downloads the official zip to a temporary file, streams
    ``wiki.train.tokens`` out of it, filters headers/short lines, and
    writes one sentence per line to *output*.

    Args:
        output: Destination text file.

    Returns:
        True on success, False on any download/extraction error.
    """
    import urllib.request
    import zipfile
    import tempfile
    import os

    Path(output).parent.mkdir(parents=True, exist_ok=True)

    url = "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip"

    print("📥 Downloading WikiText-103 directly from URL...")
    print("   This may take several minutes...")

    tmp_path = None
    try:
        # Reserve a temp file name; delete=False because the zip is reopened
        # below after the download completes.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.zip') as tmp_file:
            tmp_path = tmp_file.name
        print(f"   Downloading to temporary file...")
        urllib.request.urlretrieve(url, tmp_path)

        # Extract and process the training split without unzipping to disk.
        print("   Extracting and processing...")
        with zipfile.ZipFile(tmp_path, 'r') as zip_ref:
            with zip_ref.open('wikitext-103/wiki.train.tokens') as f, \
                 open(output, 'w', encoding='utf-8') as out_file:
                count = 0
                for raw in f:
                    line = raw.decode('utf-8').strip()
                    # Skip blanks, short fragments, and "= Heading =" lines.
                    if line and len(line) > 20 and not line.startswith('='):
                        for s in line.split('.'):
                            s = s.strip()
                            if len(s) > 20:
                                out_file.write(s + ".\n")
                                count += 1
                                if count % 10000 == 0:
                                    print(f"   ✓ Processed {count:,} sentences...")

        print(f"✅ Successfully saved {count:,} sentences to {output}")
        return True

    except Exception as e:
        print(f"❌ Error downloading WikiText: {e}")
        return False
    finally:
        # BUG FIX: the temp file was previously removed only on the success
        # path, leaking a multi-GB zip whenever download/extraction failed.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.unlink(tmp_path)
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, dispatch to the matching dataset
    downloader, then print stats for the resulting file.

    Exits with status 1 if the selected download fails.
    """
    parser = argparse.ArgumentParser(
        description='Download large datasets for training SheepOp LLM',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Download 500k Amazon reviews
  python3 download_large_data.py amazon --limit 500000

  # Download WikiText-103
  python3 download_large_data.py wiki

  # Download OpenWebText sample
  python3 download_large_data.py openwebtext --limit 100000

  # Download to custom location
  python3 download_large_data.py amazon --output data/my_reviews.txt
        """
    )

    parser.add_argument(
        'dataset',
        choices=['amazon', 'wiki', 'wikitext', 'openwebtext', 'bookcorpus'],
        help='Dataset to download'
    )

    parser.add_argument(
        '--output',
        type=str,
        help='Output file path (default: data/<dataset_name>.txt)'
    )

    parser.add_argument(
        '--limit',
        type=int,
        default=500000,
        help='Maximum number of samples to download (default: 500000)'
    )

    parser.add_argument(
        '--category',
        type=str,
        default='Video_Games_v1_00',
        help='Amazon reviews category (for amazon dataset only, may not work - uses alternative)'
    )

    parser.add_argument(
        '--use-imdb',
        action='store_true',
        help='Use IMDB reviews instead of Amazon (more reliable)'
    )

    parser.add_argument(
        '--version',
        type=str,
        default='103',
        choices=['2', '103'],
        help='WikiText version: 2 (small) or 103 (large)'
    )

    args = parser.parse_args()

    # Set default output path if not provided
    if not args.output:
        if args.dataset == 'amazon':
            args.output = f"data/amazon_reviews.txt"
        elif args.dataset in ['wiki', 'wikitext']:
            args.output = f"data/wikitext_{args.version}.txt"
        elif args.dataset == 'openwebtext':
            args.output = "data/openwebtext.txt"
        elif args.dataset == 'bookcorpus':
            args.output = "data/bookcorpus.txt"

    print(f"\n🚀 SheepOp Dataset Downloader")
    print(f"   Dataset: {args.dataset}")
    print(f"   Output: {args.output}")
    print(f"   Limit: {args.limit:,} samples\n")

    # Download based on dataset type
    success = False
    if args.dataset == 'amazon':
        if args.use_imdb:
            # Use IMDB directly
            # NOTE(review): this inline download duplicates the IMDB fallback
            # inside download_amazon_reviews(); consider consolidating.
            try:
                from datasets import load_dataset
                print("📥 Downloading IMDB Reviews...")
                dataset = load_dataset("imdb", split=f"train[:{args.limit}]")
                Path(args.output).parent.mkdir(parents=True, exist_ok=True)
                with open(args.output, "w", encoding="utf-8") as f:
                    count = 0
                    for item in dataset:
                        review = item.get("text", "").strip()
                        # Keep only reviews longer than 20 chars, same filter
                        # the other downloaders apply.
                        if review and len(review) > 20:
                            f.write(review + "\n")
                            count += 1
                            if count % 50000 == 0:
                                print(f"   ✓ Downloaded {count:,} reviews...")
                print(f"✅ Successfully saved {count:,} reviews to {args.output}")
                success = True
            except Exception as e:
                print(f"❌ Error: {e}")
                success = False
        else:
            success = download_amazon_reviews(args.output, args.limit, args.category)
    elif args.dataset in ['wiki', 'wikitext']:
        if args.version == '103':
            # Try direct download first (no HuggingFace dependency);
            # fall back to the datasets-library path if that fails.
            print("   Attempting direct download (no HuggingFace required)...")
            success = download_wikitext_direct(args.output)
            if not success:
                print("   Falling back to HuggingFace download...")
                success = download_wikitext(args.output, args.version)
        else:
            success = download_wikitext(args.output, args.version)
    elif args.dataset == 'openwebtext':
        success = download_openwebtext(args.output, args.limit)
    elif args.dataset == 'bookcorpus':
        success = download_bookcorpus(args.output, args.limit)

    if success:
        print(f"\n✅ Download complete!")
        print(f"   File: {args.output}")

        # Show file info (best-effort: any stat/read failure is swallowed
        # so it never masks a successful download)
        try:
            import os
            size_mb = os.path.getsize(args.output) / (1024 * 1024)
            with open(args.output, 'r', encoding='utf-8') as f:
                lines = sum(1 for _ in f)
            print(f"   Size: {size_mb:.2f} MB")
            print(f"   Lines: {lines:,}")
        except:
            pass

        print(f"\n📚 You can now train with:")
        print(f"   python3 train.py --data {args.output} --config config.json --device cuda")
    else:
        print(f"\n❌ Download failed. Please check the error messages above.")
        sys.exit(1)


if __name__ == '__main__':
    main()
|
||||
|
||||
Reference in New Issue
Block a user