Initial commit: SheepOp LLM - Transformer-based language model implementation
- Complete transformer implementation from scratch - Training pipeline with gradient accumulation and mixed precision - Optimized inference with KV caching - Multi-format data processing (PDFs, images, code, text) - Comprehensive documentation - Apache 2.0 license - Example training plots included in docs/images/
This commit is contained in:
428
download_large_data.py
Executable file
428
download_large_data.py
Executable file
@@ -0,0 +1,428 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Download large datasets for training the SheepOp LLM.
|
||||
Supports Amazon Reviews, WikiText, OpenWebText, BookCorpus, and more.
|
||||
"""
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def _write_reviews(dataset, output: str, fields: tuple = ("content", "text"), min_len: int = 20) -> int:
    """Write review text from *dataset* items to *output*, one review per line.

    For each item, the keys in *fields* are tried in order and the first
    non-empty value is used; entries of *min_len* characters or fewer are
    skipped.  Progress is printed every 50k reviews.

    Returns:
        Number of reviews written.
    """
    count = 0
    with open(output, "w", encoding="utf-8") as f:
        for item in dataset:
            review = ""
            for field in fields:
                review = item.get(field, "").strip()
                if review:
                    break
            if review and len(review) > min_len:
                f.write(review + "\n")
                count += 1
                if count % 50000 == 0:
                    print(f"   ✓ Downloaded {count:,} reviews...")
    return count


def download_amazon_reviews(output: str = "data/amazon_reviews.txt", limit: int = 500000, category: str = "Video_Games_v1_00"):
    """Download Amazon Product Reviews dataset.

    Tries the ``amazon_polarity`` dataset first and falls back to IMDB
    reviews if that fails (the original per-category Amazon dump is not
    reliably available on the Hub).

    Args:
        output: Output file path
        limit: Maximum number of reviews to download
        category: Product category (Video_Games_v1_00, Books_v1_00, etc.)
            NOTE(review): only echoed in the progress message — the
            fallback datasets do not support per-category selection.

    Returns:
        True on success, False on failure.
    """
    try:
        from datasets import load_dataset
    except ImportError:
        print("Error: 'datasets' library not installed.")
        print("Install with: pip install datasets")
        return False

    Path(output).parent.mkdir(parents=True, exist_ok=True)

    print(f"📥 Downloading Amazon Product Reviews (category: {category}, limit: {limit})...")
    print("   This may take several minutes depending on your connection...")

    try:
        # Try different dataset names/approaches
        print("   Attempting to download from alternative source...")

        # Method 1: amazon_polarity dataset (smaller but works)
        try:
            print("   Trying amazon_polarity dataset...")
            dataset = load_dataset("amazon_polarity", split=f"train[:{limit}]")
            count = _write_reviews(dataset, output, fields=("content", "text"))
            print(f"✅ Successfully saved {count:,} reviews to {output}")
            return True

        except Exception as e1:
            print(f"   amazon_polarity failed: {e1}")

            # Method 2: Use IMDB reviews (similar structure)
            try:
                print("   Trying IMDB reviews as alternative...")
                dataset = load_dataset("imdb", split=f"train[:{limit}]")
                count = _write_reviews(dataset, output, fields=("text",))
                print(f"✅ Successfully saved {count:,} reviews to {output}")
                print("   Note: Using IMDB reviews instead of Amazon reviews")
                return True

            except Exception as e2:
                print(f"   IMDB also failed: {e2}")
                # Chain the cause so the underlying failure is preserved in
                # the traceback (previously a bare generic Exception).
                raise RuntimeError(
                    "Both Amazon and IMDB datasets failed. Try using --alternative flag with a different dataset."
                ) from e2

    except Exception as e:
        print(f"❌ Error downloading reviews: {e}")
        print("\n💡 Alternative options:")
        print("   1. Use WikiText instead: python3 download_large_data.py wiki")
        print("   2. Use OpenWebText: python3 download_large_data.py openwebtext --limit 100000")
        print("   3. Try downloading from HuggingFace Hub manually")
        return False
|
||||
|
||||
|
||||
def download_wikitext(output: str = "data/wikitext.txt", version: str = "103"):
    """Fetch the WikiText corpus from HuggingFace and save it one sentence
    per line.

    Args:
        output: Destination text file.
        version: WikiText release to use ('2' or '103').

    Returns:
        True if the file was written, False otherwise.
    """
    try:
        from datasets import load_dataset
    except ImportError:
        print("Error: 'datasets' library not installed.")
        print("Install with: pip install datasets")
        return False

    Path(output).parent.mkdir(parents=True, exist_ok=True)

    print(f"📥 Downloading WikiText-{version}...")
    print("   This may take several minutes...")

    try:
        rows = load_dataset("wikitext", f"wikitext-{version}-v1", split="train")

        written = 0
        with open(output, "w", encoding="utf-8") as sink:
            for row in rows:
                paragraph = row.get("text", "").strip()
                # Skip blank rows, short fragments, and "= Heading =" lines.
                if not paragraph or len(paragraph) <= 20 or paragraph.startswith("="):
                    continue
                # Crude sentence split on '.'; keep only fragments long
                # enough to be useful training text.
                for fragment in paragraph.split('.'):
                    fragment = fragment.strip()
                    if len(fragment) > 20:
                        sink.write(fragment + ".\n")
                        written += 1
                        if written % 10000 == 0:
                            print(f"   ✓ Processed {written:,} sentences...")

        print(f"✅ Successfully saved {written:,} sentences to {output}")
        return True

    except Exception as e:
        print(f"❌ Error downloading WikiText: {e}")
        return False
|
||||
|
||||
|
||||
def download_openwebtext(output: str = "data/openwebtext.txt", limit: int = 100000):
    """Fetch a slice of the OpenWebText web-text corpus and store it one
    sentence per line.

    Args:
        output: Destination text file.
        limit: Maximum number of documents to pull from the corpus.

    Returns:
        True if the file was written, False otherwise.
    """
    try:
        from datasets import load_dataset
    except ImportError:
        print("Error: 'datasets' library not installed.")
        print("Install with: pip install datasets")
        return False

    Path(output).parent.mkdir(parents=True, exist_ok=True)

    print(f"📥 Downloading OpenWebText (limit: {limit:,})...")
    print("   This may take a while - OpenWebText is very large...")

    try:
        docs = load_dataset("openwebtext", split=f"train[:{limit}]")

        total = 0
        with open(output, "w", encoding="utf-8") as sink:
            for doc in docs:
                body = doc.get("text", "").strip()
                if not body:
                    continue
                # Naive '.' split; drop fragments too short to be useful.
                for piece in body.split('.'):
                    piece = piece.strip()
                    if len(piece) > 20:
                        sink.write(piece + ".\n")
                        total += 1
                        if total % 10000 == 0:
                            print(f"   ✓ Processed {total:,} sentences...")

        print(f"✅ Successfully saved {total:,} sentences to {output}")
        return True

    except Exception as e:
        print(f"❌ Error downloading OpenWebText: {e}")
        return False
|
||||
|
||||
|
||||
def download_bookcorpus(output: str = "data/bookcorpus.txt", limit: int = 100000):
    """Fetch a slice of the BookCorpus dataset and store it one sentence
    per line.

    Args:
        output: Destination text file.
        limit: Maximum number of entries to pull from the corpus.

    Returns:
        True if the file was written, False otherwise.
    """
    try:
        from datasets import load_dataset
    except ImportError:
        print("Error: 'datasets' library not installed.")
        print("Install with: pip install datasets")
        return False

    Path(output).parent.mkdir(parents=True, exist_ok=True)

    print(f"📥 Downloading BookCorpus (limit: {limit:,} books)...")
    print("   This may take a while...")

    try:
        entries = load_dataset("bookcorpus", split=f"train[:{limit}]")

        n_sentences = 0
        with open(output, "w", encoding="utf-8") as sink:
            for entry in entries:
                passage = entry.get("text", "").strip()
                if not passage:
                    continue
                # Split passages on '.' and keep only substantial fragments.
                for sentence in passage.split('.'):
                    sentence = sentence.strip()
                    if len(sentence) > 20:
                        sink.write(sentence + ".\n")
                        n_sentences += 1
                        if n_sentences % 10000 == 0:
                            print(f"   ✓ Processed {n_sentences:,} sentences...")

        print(f"✅ Successfully saved {n_sentences:,} sentences to {output}")
        return True

    except Exception as e:
        print(f"❌ Error downloading BookCorpus: {e}")
        return False
|
||||
|
||||
|
||||
def download_wikitext_direct(output: str = "data/wikitext_direct.txt"):
    """Download WikiText-103 directly from URL (no HuggingFace required).

    Downloads the official zip to a temporary file, streams
    ``wiki.train.tokens`` out of it, filters headers/short lines, and
    writes one sentence per line to *output*.

    Args:
        output: Destination text file.

    Returns:
        True on success, False on any download/extraction error.
    """
    import urllib.request
    import zipfile
    import tempfile
    import os

    Path(output).parent.mkdir(parents=True, exist_ok=True)

    url = "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip"

    print("📥 Downloading WikiText-103 directly from URL...")
    print("   This may take several minutes...")

    tmp_path = None
    try:
        # Reserve a temp file name; delete=False because the zip is reopened
        # below after the download completes.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.zip') as tmp_file:
            tmp_path = tmp_file.name
        print(f"   Downloading to temporary file...")
        urllib.request.urlretrieve(url, tmp_path)

        # Extract and process the training split without unzipping to disk.
        print("   Extracting and processing...")
        with zipfile.ZipFile(tmp_path, 'r') as zip_ref:
            with zip_ref.open('wikitext-103/wiki.train.tokens') as f, \
                 open(output, 'w', encoding='utf-8') as out_file:
                count = 0
                for raw in f:
                    line = raw.decode('utf-8').strip()
                    # Skip blanks, short fragments, and "= Heading =" lines.
                    if line and len(line) > 20 and not line.startswith('='):
                        for s in line.split('.'):
                            s = s.strip()
                            if len(s) > 20:
                                out_file.write(s + ".\n")
                                count += 1
                                if count % 10000 == 0:
                                    print(f"   ✓ Processed {count:,} sentences...")

        print(f"✅ Successfully saved {count:,} sentences to {output}")
        return True

    except Exception as e:
        print(f"❌ Error downloading WikiText: {e}")
        return False
    finally:
        # BUG FIX: the temp file was previously removed only on the success
        # path, leaking a multi-GB zip whenever download/extraction failed.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.unlink(tmp_path)
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, dispatch to the matching dataset
    downloader, then print stats for the resulting file.

    Exits with status 1 if the selected download fails.
    """
    parser = argparse.ArgumentParser(
        description='Download large datasets for training SheepOp LLM',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Download 500k Amazon reviews
  python3 download_large_data.py amazon --limit 500000

  # Download WikiText-103
  python3 download_large_data.py wiki

  # Download OpenWebText sample
  python3 download_large_data.py openwebtext --limit 100000

  # Download to custom location
  python3 download_large_data.py amazon --output data/my_reviews.txt
        """
    )

    parser.add_argument(
        'dataset',
        choices=['amazon', 'wiki', 'wikitext', 'openwebtext', 'bookcorpus'],
        help='Dataset to download'
    )

    parser.add_argument(
        '--output',
        type=str,
        help='Output file path (default: data/<dataset_name>.txt)'
    )

    parser.add_argument(
        '--limit',
        type=int,
        default=500000,
        help='Maximum number of samples to download (default: 500000)'
    )

    parser.add_argument(
        '--category',
        type=str,
        default='Video_Games_v1_00',
        help='Amazon reviews category (for amazon dataset only, may not work - uses alternative)'
    )

    parser.add_argument(
        '--use-imdb',
        action='store_true',
        help='Use IMDB reviews instead of Amazon (more reliable)'
    )

    parser.add_argument(
        '--version',
        type=str,
        default='103',
        choices=['2', '103'],
        help='WikiText version: 2 (small) or 103 (large)'
    )

    args = parser.parse_args()

    # Set default output path if not provided
    if not args.output:
        if args.dataset == 'amazon':
            args.output = f"data/amazon_reviews.txt"
        elif args.dataset in ['wiki', 'wikitext']:
            args.output = f"data/wikitext_{args.version}.txt"
        elif args.dataset == 'openwebtext':
            args.output = "data/openwebtext.txt"
        elif args.dataset == 'bookcorpus':
            args.output = "data/bookcorpus.txt"

    print(f"\n🚀 SheepOp Dataset Downloader")
    print(f"   Dataset: {args.dataset}")
    print(f"   Output: {args.output}")
    print(f"   Limit: {args.limit:,} samples\n")

    # Download based on dataset type
    success = False
    if args.dataset == 'amazon':
        if args.use_imdb:
            # Use IMDB directly
            # NOTE(review): this inline download duplicates the IMDB fallback
            # inside download_amazon_reviews(); consider consolidating.
            try:
                from datasets import load_dataset
                print("📥 Downloading IMDB Reviews...")
                dataset = load_dataset("imdb", split=f"train[:{args.limit}]")
                Path(args.output).parent.mkdir(parents=True, exist_ok=True)
                with open(args.output, "w", encoding="utf-8") as f:
                    count = 0
                    for item in dataset:
                        review = item.get("text", "").strip()
                        # Keep only reviews longer than 20 chars, same filter
                        # the other downloaders apply.
                        if review and len(review) > 20:
                            f.write(review + "\n")
                            count += 1
                            if count % 50000 == 0:
                                print(f"   ✓ Downloaded {count:,} reviews...")
                print(f"✅ Successfully saved {count:,} reviews to {args.output}")
                success = True
            except Exception as e:
                print(f"❌ Error: {e}")
                success = False
        else:
            success = download_amazon_reviews(args.output, args.limit, args.category)
    elif args.dataset in ['wiki', 'wikitext']:
        if args.version == '103':
            # Try direct download first (no HuggingFace dependency);
            # fall back to the datasets-library path if that fails.
            print("   Attempting direct download (no HuggingFace required)...")
            success = download_wikitext_direct(args.output)
            if not success:
                print("   Falling back to HuggingFace download...")
                success = download_wikitext(args.output, args.version)
        else:
            success = download_wikitext(args.output, args.version)
    elif args.dataset == 'openwebtext':
        success = download_openwebtext(args.output, args.limit)
    elif args.dataset == 'bookcorpus':
        success = download_bookcorpus(args.output, args.limit)

    if success:
        print(f"\n✅ Download complete!")
        print(f"   File: {args.output}")

        # Show file info (best-effort: any stat/read failure is swallowed
        # so it never masks a successful download)
        try:
            import os
            size_mb = os.path.getsize(args.output) / (1024 * 1024)
            with open(args.output, 'r', encoding='utf-8') as f:
                lines = sum(1 for _ in f)
            print(f"   Size: {size_mb:.2f} MB")
            print(f"   Lines: {lines:,}")
        except:
            pass

        print(f"\n📚 You can now train with:")
        print(f"   python3 train.py --data {args.output} --config config.json --device cuda")
    else:
        print(f"\n❌ Download failed. Please check the error messages above.")
        sys.exit(1)


if __name__ == '__main__':
    main()
|
||||
|
||||
Reference in New Issue
Block a user