#!/usr/bin/env python3
"""
Download large datasets for training the SheepOp LLM.
Supports Amazon Reviews, WikiText, OpenWebText, BookCorpus, and more.

Every downloader writes a plain UTF-8 text file (one review or one sentence
per line), prints progress to stdout, and returns True on success or False
on failure instead of raising.
"""
import argparse
import os
import sys
import tempfile
import urllib.request
import zipfile
from pathlib import Path
from typing import Optional, Sequence

# Reviews / sentences must be strictly longer than this to be kept.
_MIN_TEXT_LENGTH = 20

# Archive used by download_wikitext_direct() and the member extracted from it.
_WIKITEXT_103_URL = "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip"
_WIKITEXT_103_TRAIN_MEMBER = 'wikitext-103/wiki.train.tokens'

_EPILOG = """
Examples:
  # Download 500k Amazon reviews
  python3 download_large_data.py amazon --limit 500000

  # Download WikiText-103
  python3 download_large_data.py wiki

  # Download OpenWebText sample
  python3 download_large_data.py openwebtext --limit 100000

  # Download to custom location
  python3 download_large_data.py amazon --output data/my_reviews.txt
"""


def _get_load_dataset():
    """Return ``datasets.load_dataset``, or None after printing install help."""
    try:
        from datasets import load_dataset
    except ImportError:
        print("Error: 'datasets' library not installed.")
        print("Install with: pip install datasets")
        return None
    return load_dataset


def _ensure_parent_dir(output):
    """Create the directory that will contain *output* if it is missing."""
    Path(output).parent.mkdir(parents=True, exist_ok=True)


def _item_texts(dataset):
    """Yield the ``"text"`` field of each dataset row ('' when absent or None)."""
    for item in dataset:
        yield item.get("text") or ""


def _write_sentences(texts, out_file, skip_headers=False):
    """Split each text on '.' and write the long-enough pieces, one per line.

    Args:
        texts: Iterable of strings (lines or documents).
        out_file: Writable text file object.
        skip_headers: When True, texts starting with '=' (WikiText section
            headers) are dropped entirely.

    Returns:
        Number of sentences written.
    """
    count = 0
    for text in texts:
        text = text.strip()
        # Texts of <= _MIN_TEXT_LENGTH characters need no explicit filter:
        # they cannot contain a piece long enough to be kept.
        if not text or (skip_headers and text.startswith("=")):
            continue
        for piece in text.split('.'):
            piece = piece.strip()
            if len(piece) > _MIN_TEXT_LENGTH:
                out_file.write(piece + ".\n")
                count += 1
                # Report only right after an increment so the message can
                # neither repeat nor fire at zero.
                if count % 10000 == 0:
                    print(f" ✓ Processed {count:,} sentences...")
    return count


def _write_reviews(dataset, output, fields=("text",)):
    """Write one review per line to *output* and return how many were kept.

    For each row the first non-empty field among *fields* is used; reviews of
    _MIN_TEXT_LENGTH characters or fewer are skipped.
    """
    count = 0
    with open(output, "w", encoding="utf-8") as f:
        for item in dataset:
            review = ""
            for field in fields:
                # ``or ""`` guards against rows whose field is None.
                review = (item.get(field) or "").strip()
                if review:
                    break
            if len(review) > _MIN_TEXT_LENGTH:
                f.write(review + "\n")
                count += 1
                if count % 50000 == 0:
                    print(f" ✓ Downloaded {count:,} reviews...")
    return count


def _download_imdb_reviews(output, limit):
    """Download IMDB reviews straight to *output* (used by ``--use-imdb``)."""
    try:
        from datasets import load_dataset
        print("📥 Downloading IMDB Reviews...")
        dataset = load_dataset("imdb", split=f"train[:{limit}]")
        _ensure_parent_dir(output)
        count = _write_reviews(dataset, output)
        print(f"✅ Successfully saved {count:,} reviews to {output}")
        return True
    except Exception as e:  # CLI boundary: report and signal failure
        print(f"❌ Error: {e}")
        return False


def download_amazon_reviews(output: str = "data/amazon_reviews.txt",
                            limit: int = 500000,
                            category: str = "Video_Games_v1_00"):
    """
    Download Amazon Product Reviews dataset.

    Tries the ``amazon_polarity`` dataset first and falls back to ``imdb``
    when that fails.

    Args:
        output: Output file path
        limit: Maximum number of reviews to download
        category: Product category (Video_Games_v1_00, Books_v1_00, etc.).
            Only echoed in the progress message; the datasets actually used
            are not split by category.

    Returns:
        True if a review file was written, False otherwise.
    """
    load_dataset = _get_load_dataset()
    if load_dataset is None:
        return False

    _ensure_parent_dir(output)

    print(f"📥 Downloading Amazon Product Reviews (category: {category}, limit: {limit})...")
    print(" This may take several minutes depending on your connection...")
    print(" Attempting to download from alternative source...")

    # Method 1: amazon_polarity (smaller but works)
    try:
        print(" Trying amazon_polarity dataset...")
        dataset = load_dataset("amazon_polarity", split=f"train[:{limit}]")
        count = _write_reviews(dataset, output, fields=("content", "text"))
        print(f"✅ Successfully saved {count:,} reviews to {output}")
        return True
    except Exception as e1:
        print(f" amazon_polarity failed: {e1}")

    # Method 2: IMDB reviews (similar structure)
    try:
        print(" Trying IMDB reviews as alternative...")
        dataset = load_dataset("imdb", split=f"train[:{limit}]")
        count = _write_reviews(dataset, output)
        print(f"✅ Successfully saved {count:,} reviews to {output}")
        print(" Note: Using IMDB reviews instead of Amazon reviews")
        return True
    except Exception as e2:
        print(f" IMDB also failed: {e2}")

    print("❌ Error downloading reviews: Both Amazon and IMDB datasets failed. "
          "Try a different dataset.")
    print("\n💡 Alternative options:")
    print(" 1. Use WikiText instead: python3 download_large_data.py wiki")
    print(" 2. Use OpenWebText: python3 download_large_data.py openwebtext --limit 100000")
    print(" 3. Try downloading from HuggingFace Hub manually")
    return False


def download_wikitext(output: str = "data/wikitext.txt", version: str = "103"):
    """
    Download WikiText dataset (Wikipedia text).

    Section headers (lines starting with '=') are dropped and the remaining
    text is written one sentence per line.

    Args:
        output: Output file path
        version: WikiText version ('2' or '103')

    Returns:
        True on success, False otherwise.
    """
    load_dataset = _get_load_dataset()
    if load_dataset is None:
        return False

    _ensure_parent_dir(output)

    print(f"📥 Downloading WikiText-{version}...")
    print(" This may take several minutes...")

    try:
        dataset = load_dataset("wikitext", f"wikitext-{version}-v1", split="train")
        with open(output, "w", encoding="utf-8") as f:
            count = _write_sentences(_item_texts(dataset), f, skip_headers=True)
        print(f"✅ Successfully saved {count:,} sentences to {output}")
        return True
    except Exception as e:
        print(f"❌ Error downloading WikiText: {e}")
        return False


def download_openwebtext(output: str = "data/openwebtext.txt", limit: int = 100000):
    """
    Download OpenWebText dataset (web text corpus).

    Args:
        output: Output file path
        limit: Maximum number of samples to download

    Returns:
        True on success, False otherwise.
    """
    load_dataset = _get_load_dataset()
    if load_dataset is None:
        return False

    _ensure_parent_dir(output)

    print(f"📥 Downloading OpenWebText (limit: {limit:,})...")
    print(" This may take a while - OpenWebText is very large...")

    try:
        dataset = load_dataset("openwebtext", split=f"train[:{limit}]")
        with open(output, "w", encoding="utf-8") as f:
            count = _write_sentences(_item_texts(dataset), f)
        print(f"✅ Successfully saved {count:,} sentences to {output}")
        return True
    except Exception as e:
        print(f"❌ Error downloading OpenWebText: {e}")
        return False


def download_bookcorpus(output: str = "data/bookcorpus.txt", limit: int = 100000):
    """
    Download BookCorpus dataset (books).

    Args:
        output: Output file path
        limit: Maximum number of books to download

    Returns:
        True on success, False otherwise.
    """
    load_dataset = _get_load_dataset()
    if load_dataset is None:
        return False

    _ensure_parent_dir(output)

    print(f"📥 Downloading BookCorpus (limit: {limit:,} books)...")
    print(" This may take a while...")

    try:
        dataset = load_dataset("bookcorpus", split=f"train[:{limit}]")
        with open(output, "w", encoding="utf-8") as f:
            count = _write_sentences(_item_texts(dataset), f)
        print(f"✅ Successfully saved {count:,} sentences to {output}")
        return True
    except Exception as e:
        print(f"❌ Error downloading BookCorpus: {e}")
        return False


def download_wikitext_direct(output: str = "data/wikitext_direct.txt",
                             url: str = _WIKITEXT_103_URL):
    """
    Download WikiText directly from URL (no HuggingFace required).

    The archive is fetched into a temporary file, the member
    ``wikitext-103/wiki.train.tokens`` is read from it, and the text is
    written one sentence per line. The temporary file is removed whether or
    not the download succeeds.

    Args:
        output: Output file path
        url: Location of the WikiText-103 zip archive; any URL scheme
            understood by ``urllib.request`` works.

    Returns:
        True on success, False otherwise.
    """
    _ensure_parent_dir(output)

    print("📥 Downloading WikiText-103 directly from URL...")
    print(" This may take several minutes...")

    tmp_path = None
    try:
        # Reserve a name and close the handle before downloading: writing to
        # a file that is still open fails on Windows.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.zip') as tmp_file:
            tmp_path = tmp_file.name

        print(" Downloading to temporary file...")
        urllib.request.urlretrieve(url, tmp_path)

        print(" Extracting and processing...")
        with zipfile.ZipFile(tmp_path, 'r') as zip_ref:
            with zip_ref.open(_WIKITEXT_103_TRAIN_MEMBER) as raw, \
                    open(output, 'w', encoding='utf-8') as out_file:
                lines = (raw_line.decode('utf-8') for raw_line in raw)
                count = _write_sentences(lines, out_file, skip_headers=True)

        print(f"✅ Successfully saved {count:,} sentences to {output}")
        return True
    except Exception as e:
        print(f"❌ Error downloading WikiText: {e}")
        return False
    finally:
        # Clean up on every path so failed downloads do not leak large files.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass


def _default_output(dataset, version):
    """Return the default output path for a dataset name given on the CLI."""
    if dataset in ('wiki', 'wikitext'):
        return f"data/wikitext_{version}.txt"
    return {
        'amazon': "data/amazon_reviews.txt",
        'openwebtext': "data/openwebtext.txt",
        'bookcorpus': "data/bookcorpus.txt",
    }[dataset]


def _print_file_info(path):
    """Best-effort report of the size and line count of *path*."""
    try:
        size_mb = os.path.getsize(path) / (1024 * 1024)
        with open(path, 'r', encoding='utf-8') as f:
            lines = sum(1 for _ in f)
    except (OSError, UnicodeError) as e:
        # The download itself succeeded; only the summary is unavailable.
        print(f" (Could not read file info: {e})")
    else:
        print(f" Size: {size_mb:.2f} MB")
        print(f" Lines: {lines:,}")


def main(argv: Optional[Sequence[str]] = None):
    """Parse command-line arguments and run the selected download.

    Args:
        argv: Argument list to parse; None means ``sys.argv[1:]``.

    Exits the process with status 1 when the download fails.
    """
    parser = argparse.ArgumentParser(
        description='Download large datasets for training SheepOp LLM',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=_EPILOG,
    )

    parser.add_argument(
        'dataset',
        choices=['amazon', 'wiki', 'wikitext', 'openwebtext', 'bookcorpus'],
        help='Dataset to download'
    )
    parser.add_argument(
        '--output',
        type=str,
        help='Output file path (default: chosen per dataset, e.g. data/wikitext_103.txt)'
    )
    parser.add_argument(
        '--limit',
        type=int,
        default=500000,
        help='Maximum number of samples to download (default: 500000)'
    )
    parser.add_argument(
        '--category',
        type=str,
        default='Video_Games_v1_00',
        help='Amazon reviews category (for amazon dataset only, may not work - uses alternative)'
    )
    parser.add_argument(
        '--use-imdb',
        action='store_true',
        help='Use IMDB reviews instead of Amazon (more reliable)'
    )
    parser.add_argument(
        '--version',
        type=str,
        default='103',
        choices=['2', '103'],
        help='WikiText version: 2 (small) or 103 (large)'
    )

    args = parser.parse_args(argv)

    # Set default output path if not provided
    if not args.output:
        args.output = _default_output(args.dataset, args.version)

    print("\n🚀 SheepOp Dataset Downloader")
    print(f" Dataset: {args.dataset}")
    print(f" Output: {args.output}")
    print(f" Limit: {args.limit:,} samples\n")

    # Download based on dataset type
    success = False
    if args.dataset == 'amazon':
        if args.use_imdb:
            success = _download_imdb_reviews(args.output, args.limit)
        else:
            success = download_amazon_reviews(args.output, args.limit, args.category)
    elif args.dataset in ('wiki', 'wikitext'):
        if args.version == '103':
            # Try direct download first (no HuggingFace dependency)
            print(" Attempting direct download (no HuggingFace required)...")
            success = download_wikitext_direct(args.output)
            if not success:
                print(" Falling back to HuggingFace download...")
                success = download_wikitext(args.output, args.version)
        else:
            success = download_wikitext(args.output, args.version)
    elif args.dataset == 'openwebtext':
        success = download_openwebtext(args.output, args.limit)
    elif args.dataset == 'bookcorpus':
        success = download_bookcorpus(args.output, args.limit)

    if not success:
        print("\n❌ Download failed. Please check the error messages above.")
        sys.exit(1)

    print("\n✅ Download complete!")
    print(f" File: {args.output}")
    _print_file_info(args.output)
    print("\n📚 You can now train with:")
    print(f" python3 train.py --data {args.output} --config config.json --device cuda")


if __name__ == '__main__':
    main()