#!/usr/bin/env python3
"""
Download large datasets for training the SheepOp LLM.
Supports Amazon Reviews, WikiText, OpenWebText, BookCorpus, and more.
"""
import argparse
import sys
from pathlib import Path
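
# Note: the heavy 'datasets' dependency is imported lazily inside each
# downloader so the script still runs (and prints install hints) when the
# package is missing.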

def download_amazon_reviews(output: str = "data/amazon_reviews.txt", limit: int = 500000, category: str = "Video_Games_v1_00"):
    """
    Download Amazon Product Reviews dataset.

    Args:
        output: Output file path
        limit: Maximum number of reviews to download
        category: Product category (Video_Games_v1_00, Books_v1_00, etc.)
    """
    try:
        from datasets import load_dataset
    except ImportError:
        print("Error: 'datasets' library not installed.")
        print("Install with: pip install datasets")
        return False

    Path(output).parent.mkdir(parents=True, exist_ok=True)
    print(f"📥 Downloading Amazon Product Reviews (category: {category}, limit: {limit})...")
    print("   This may take several minutes depending on your connection...")

    try:
        # The full Amazon review dumps are unreliable to fetch, so try
        # progressively simpler alternatives.
        print("   Attempting to download from alternative source...")

        # Method 1: amazon_polarity (smaller, but reliably hosted)
        try:
            print("   Trying amazon_polarity dataset...")
            dataset = load_dataset("amazon_polarity", split=f"train[:{limit}]")
            with open(output, "w", encoding="utf-8") as f:
                count = 0
                for item in dataset:
                    review = item.get("content", "").strip()
                    if not review:
                        review = item.get("text", "").strip()
                    if review and len(review) > 20:
                        f.write(review + "\n")
                        count += 1
                        if count % 50000 == 0:
                            print(f"   ✓ Downloaded {count:,} reviews...")
            print(f"✅ Successfully saved {count:,} reviews to {output}")
            return True
        except Exception as e1:
            print(f"   amazon_polarity failed: {e1}")

        # Method 2: IMDB reviews (similar structure)
        try:
            print("   Trying IMDB reviews as alternative...")
            dataset = load_dataset("imdb", split=f"train[:{limit}]")
            with open(output, "w", encoding="utf-8") as f:
                count = 0
                for item in dataset:
                    review = item.get("text", "").strip()
                    if review and len(review) > 20:
                        f.write(review + "\n")
                        count += 1
                        if count % 50000 == 0:
                            print(f"   ✓ Downloaded {count:,} reviews...")
            print(f"✅ Successfully saved {count:,} reviews to {output}")
            print("   Note: Using IMDB reviews instead of Amazon reviews")
            return True
        except Exception as e2:
            print(f"   IMDB also failed: {e2}")
            raise Exception("Both amazon_polarity and IMDB failed. Try the 'wiki' or 'openwebtext' dataset instead.")
    except Exception as e:
        print(f"❌ Error downloading reviews: {e}")
        print("\n💡 Alternative options:")
        print("   1. Use WikiText instead: python3 download_large_data.py wiki")
        print("   2. Use OpenWebText: python3 download_large_data.py openwebtext --limit 100000")
        print("   3. Try downloading from HuggingFace Hub manually")
        return False
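
# Minimal usage sketch (paths are illustrative; requires `pip install datasets`):
#   download_amazon_reviews("data/amazon_reviews.txt", limit=100_000)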

def download_wikitext(output: str = "data/wikitext.txt", version: str = "103"):
    """
    Download WikiText dataset (Wikipedia text).

    Args:
        output: Output file path
        version: WikiText version ('2' or '103')
    """
    try:
        from datasets import load_dataset
    except ImportError:
        print("Error: 'datasets' library not installed.")
        print("Install with: pip install datasets")
        return False

    Path(output).parent.mkdir(parents=True, exist_ok=True)
    print(f"📥 Downloading WikiText-{version}...")
    print("   This may take several minutes...")

    try:
        dataset = load_dataset("wikitext", f"wikitext-{version}-v1", split="train")
        with open(output, "w", encoding="utf-8") as f:
            count = 0
            for item in dataset:
                text = item.get("text", "").strip()
                # Skip section headers (lines starting with '=') and short fragments
                if text and len(text) > 20 and not text.startswith("="):
                    # Naive sentence split on periods
                    sentences = text.split('.')
                    for s in sentences:
                        s = s.strip()
                        if len(s) > 20:
                            f.write(s + ".\n")
                            count += 1
                            if count % 10000 == 0:
                                print(f"   ✓ Processed {count:,} sentences...")
        print(f"✅ Successfully saved {count:,} sentences to {output}")
        return True
    except Exception as e:
        print(f"❌ Error downloading WikiText: {e}")
        return False
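
# Note: the period-based sentence splitting above (shared by the downloaders
# below) is deliberately naive and will also split on abbreviations and
# decimal numbers; swap in a real splitter (e.g. nltk's sent_tokenize) if
# cleaner sentence boundaries matter for training.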

def download_openwebtext(output: str = "data/openwebtext.txt", limit: int = 100000):
    """
    Download OpenWebText dataset (web text corpus).

    Args:
        output: Output file path
        limit: Maximum number of samples to download
    """
    try:
        from datasets import load_dataset
    except ImportError:
        print("Error: 'datasets' library not installed.")
        print("Install with: pip install datasets")
        return False

    Path(output).parent.mkdir(parents=True, exist_ok=True)
    print(f"📥 Downloading OpenWebText (limit: {limit:,})...")
    print("   This may take a while - OpenWebText is very large...")

    try:
        dataset = load_dataset("openwebtext", split=f"train[:{limit}]")
        with open(output, "w", encoding="utf-8") as f:
            count = 0
            for item in dataset:
                text = item.get("text", "").strip()
                if text:
                    # Naive sentence split on periods
                    sentences = text.split('.')
                    for s in sentences:
                        s = s.strip()
                        if len(s) > 20:
                            f.write(s + ".\n")
                            count += 1
                            if count % 10000 == 0:
                                print(f"   ✓ Processed {count:,} sentences...")
        print(f"✅ Successfully saved {count:,} sentences to {output}")
        return True
    except Exception as e:
        print(f"❌ Error downloading OpenWebText: {e}")
        return False

def download_bookcorpus(output: str = "data/bookcorpus.txt", limit: int = 100000):
    """
    Download BookCorpus dataset (books).

    Args:
        output: Output file path
        limit: Maximum number of samples to download
    """
    try:
        from datasets import load_dataset
    except ImportError:
        print("Error: 'datasets' library not installed.")
        print("Install with: pip install datasets")
        return False

    Path(output).parent.mkdir(parents=True, exist_ok=True)
    print(f"📥 Downloading BookCorpus (limit: {limit:,} samples)...")
    print("   This may take a while...")

    try:
        dataset = load_dataset("bookcorpus", split=f"train[:{limit}]")
        with open(output, "w", encoding="utf-8") as f:
            count = 0
            for item in dataset:
                text = item.get("text", "").strip()
                if text:
                    # Naive sentence split on periods
                    sentences = text.split('.')
                    for s in sentences:
                        s = s.strip()
                        if len(s) > 20:
                            f.write(s + ".\n")
                            count += 1
                            if count % 10000 == 0:
                                print(f"   ✓ Processed {count:,} sentences...")
        print(f"✅ Successfully saved {count:,} sentences to {output}")
        return True
    except Exception as e:
        print(f"❌ Error downloading BookCorpus: {e}")
        return False
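
# The split=f"train[:{limit}]" strings above use HuggingFace datasets' slice
# syntax: load_dataset("imdb", split="train[:1000]") returns only the first
# 1,000 rows (though the full archive may still be downloaded and cached).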

def download_wikitext_direct(output: str = "data/wikitext_direct.txt"):
    """
    Download WikiText directly from URL (no HuggingFace required).
    """
    import urllib.request
    import zipfile
    import tempfile
    import os

    Path(output).parent.mkdir(parents=True, exist_ok=True)
    url = "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip"
    print("📥 Downloading WikiText-103 directly from URL...")
    print("   This may take several minutes...")

    try:
        # Reserve a temporary file, then download the archive into it
        with tempfile.NamedTemporaryFile(delete=False, suffix='.zip') as tmp_file:
            tmp_path = tmp_file.name
        print("   Downloading to temporary file...")
        urllib.request.urlretrieve(url, tmp_path)

        # Extract and process
        print("   Extracting and processing...")
        with zipfile.ZipFile(tmp_path, 'r') as zip_ref:
            # Stream wiki.train.tokens out of the archive
            with zip_ref.open('wikitext-103/wiki.train.tokens') as f:
                with open(output, 'w', encoding='utf-8') as out_file:
                    count = 0
                    for line in f:
                        line = line.decode('utf-8').strip()
                        if line and len(line) > 20 and not line.startswith('='):
                            sentences = line.split('.')
                            for s in sentences:
                                s = s.strip()
                                if len(s) > 20:
                                    out_file.write(s + ".\n")
                                    count += 1
                                    if count % 10000 == 0:
                                        print(f"   ✓ Processed {count:,} sentences...")

        # Clean up the temporary archive
        os.unlink(tmp_path)
        print(f"✅ Successfully saved {count:,} sentences to {output}")
        return True
    except Exception as e:
        print(f"❌ Error downloading WikiText: {e}")
        return False
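
# Note: the S3 URL above points at the original research hosting for
# WikiText-103 and may disappear; if this direct path fails, main() falls
# back to the HuggingFace download.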

def main():
    parser = argparse.ArgumentParser(
        description='Download large datasets for training SheepOp LLM',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Download 500k Amazon reviews
  python3 download_large_data.py amazon --limit 500000

  # Download WikiText-103
  python3 download_large_data.py wiki

  # Download OpenWebText sample
  python3 download_large_data.py openwebtext --limit 100000

  # Download to custom location
  python3 download_large_data.py amazon --output data/my_reviews.txt
"""
    )
    parser.add_argument(
        'dataset',
        choices=['amazon', 'wiki', 'wikitext', 'openwebtext', 'bookcorpus'],
        help='Dataset to download'
    )
    parser.add_argument(
        '--output',
        type=str,
        help='Output file path (default: data/<dataset_name>.txt)'
    )
    parser.add_argument(
        '--limit',
        type=int,
        default=500000,
        help='Maximum number of samples to download (default: 500000)'
    )
    parser.add_argument(
        '--category',
        type=str,
        default='Video_Games_v1_00',
        help='Amazon reviews category (kept for compatibility; the downloader currently falls back to amazon_polarity/IMDB)'
    )
    parser.add_argument(
        '--use-imdb',
        action='store_true',
        help='Use IMDB reviews instead of Amazon (more reliable)'
    )
    parser.add_argument(
        '--version',
        type=str,
        default='103',
        choices=['2', '103'],
        help='WikiText version: 2 (small) or 103 (large)'
    )
    args = parser.parse_args()

    # Set default output path if not provided
    if not args.output:
        if args.dataset == 'amazon':
            args.output = "data/amazon_reviews.txt"
        elif args.dataset in ['wiki', 'wikitext']:
            args.output = f"data/wikitext_{args.version}.txt"
        elif args.dataset == 'openwebtext':
            args.output = "data/openwebtext.txt"
        elif args.dataset == 'bookcorpus':
            args.output = "data/bookcorpus.txt"

    print("\n🚀 SheepOp Dataset Downloader")
    print(f"   Dataset: {args.dataset}")
    print(f"   Output: {args.output}")
    print(f"   Limit: {args.limit:,} samples\n")

    # Download based on dataset type
    success = False
    if args.dataset == 'amazon':
        if args.use_imdb:
            # Use IMDB directly
            try:
                from datasets import load_dataset
                print("📥 Downloading IMDB Reviews...")
                dataset = load_dataset("imdb", split=f"train[:{args.limit}]")
                Path(args.output).parent.mkdir(parents=True, exist_ok=True)
                with open(args.output, "w", encoding="utf-8") as f:
                    count = 0
                    for item in dataset:
                        review = item.get("text", "").strip()
                        if review and len(review) > 20:
                            f.write(review + "\n")
                            count += 1
                            if count % 50000 == 0:
                                print(f"   ✓ Downloaded {count:,} reviews...")
                print(f"✅ Successfully saved {count:,} reviews to {args.output}")
                success = True
            except Exception as e:
                print(f"❌ Error: {e}")
                success = False
        else:
            success = download_amazon_reviews(args.output, args.limit, args.category)
    elif args.dataset in ['wiki', 'wikitext']:
        if args.version == '103':
            # Try direct download first (no HuggingFace dependency)
            print("   Attempting direct download (no HuggingFace required)...")
            success = download_wikitext_direct(args.output)
            if not success:
                print("   Falling back to HuggingFace download...")
                success = download_wikitext(args.output, args.version)
        else:
            success = download_wikitext(args.output, args.version)
    elif args.dataset == 'openwebtext':
        success = download_openwebtext(args.output, args.limit)
    elif args.dataset == 'bookcorpus':
        success = download_bookcorpus(args.output, args.limit)

    if success:
        print("\n✅ Download complete!")
        print(f"   File: {args.output}")
        # Show file size and line count
        try:
            import os
            size_mb = os.path.getsize(args.output) / (1024 * 1024)
            with open(args.output, 'r', encoding='utf-8') as f:
                lines = sum(1 for _ in f)
            print(f"   Size: {size_mb:.2f} MB")
            print(f"   Lines: {lines:,}")
        except Exception:
            pass
        print("\n📚 You can now train with:")
        print(f"   python3 train.py --data {args.output} --config config.json --device cuda")
    else:
        print("\n❌ Download failed. Please check the error messages above.")
        sys.exit(1)


if __name__ == '__main__':
    main()