Files
sheepOp/setup_storage.py
Carlos Gutierrez 3d2da94ce2 Initial commit: SheepOp LLM - Transformer-based language model implementation
- Complete transformer implementation from scratch
- Training pipeline with gradient accumulation and mixed precision
- Optimized inference with KV caching
- Multi-format data processing (PDFs, images, code, text)
- Comprehensive documentation
- Apache 2.0 license
- Example training plots included in docs/images/
2025-11-06 22:07:41 -05:00

292 lines
10 KiB
Python
Executable File
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Move large files to external storage and create symbolic links.
Helps manage large datasets and checkpoints on systems with limited space.
"""
import os
import shutil
import subprocess
from pathlib import Path
import argparse
def create_storage_structure(storage_root: str):
    """Build the storage root and the subdirectories the project expects.

    Args:
        storage_root: Location of the storage root; it is created together
            with any missing parent directories.

    Returns:
        The storage root as a ``Path``.
    """
    root = Path(storage_root)
    root.mkdir(parents=True, exist_ok=True)

    # The root exists at this point, so the one-level children need no
    # ``parents=True``.
    for subdir in ("data", "checkpoints", "checkpoints_test"):
        root.joinpath(subdir).mkdir(exist_ok=True)

    print(f"✅ Created storage structure at: {root}")
    return root
def move_and_link(source_dir: Path, target_dir: Path, link_name: str, dry_run: bool = False):
    """
    Move directory contents to storage and replace the directory with a symlink.

    Every entry of ``source_dir`` (regular files, hidden files, Python files
    and sub-directories such as ``__pycache__``) is moved into ``target_dir``.
    Only when the source directory is completely empty afterwards is it
    removed and replaced by a symbolic link to ``target_dir``; nothing is
    ever deleted without first having been moved.

    Args:
        source_dir: Source directory in project
        target_dir: Target directory in storage (created if missing)
        link_name: Name for the symbolic link (same as source_dir name).
            Not used by the implementation; kept for backward compatibility.
        dry_run: If True, only show what would be done

    Returns:
        True if the link is in place (or would be, in a dry run, or already
        was), False if the source is missing, an entry could not be moved,
        or an entry with the same name already exists in storage.
    """
    source_dir = Path(source_dir)
    target_dir = Path(target_dir)

    # Test is_symlink() before exists(): exists() follows links, and rmtree()
    # refuses to operate on a symlink, so a second run would otherwise fail.
    if source_dir.is_symlink():
        print(f"✅ {source_dir} is already a symlink -> {os.readlink(source_dir)}")
        return True

    if not source_dir.exists():
        print(f"⚠️ Source directory doesn't exist: {source_dir}")
        return False

    if not source_dir.is_dir():
        print(f"⚠️ Source is not a directory: {source_dir}")
        return False

    if dry_run:
        print(f"\n[DRY RUN] Would move contents from {source_dir} to {target_dir}")
        print(f" Would replace {source_dir} with symlink -> {target_dir}")
        return True

    try:
        target_dir.mkdir(parents=True, exist_ok=True)
    except OSError as e:
        print(f"❌ Error creating {target_dir}: {e}")
        return False

    # Snapshot the listing: entries are removed from the directory as we go.
    not_moved = []
    for item in list(source_dir.iterdir()):
        target_item = target_dir / item.name
        # lexists() also catches a dangling symlink occupying the name.
        if os.path.lexists(target_item):
            print(f"⚠️ Skipping {item.name} (already exists in storage)")
            not_moved.append(item.name)
            continue

        print(f"📦 Moving {item.name}...")
        try:
            # shutil.move renames when possible and falls back to
            # copy-then-delete across filesystems, for files and directories.
            shutil.move(str(item), str(target_item))
        except OSError as e:  # shutil.Error is a subclass of OSError
            print(f"❌ Error moving {item.name}: {e}")
            return False

    if not_moved:
        # Removing the directory now would destroy the only project-side copy
        # of these entries, so leave everything in place for manual cleanup.
        print(
            f"⚠️ Not replacing {source_dir} with a symlink: "
            f"{len(not_moved)} item(s) already exist in storage"
        )
        return False

    # Replace source directory with symlink
    # Step 1: Remove original directory (empty by now, so rmdir is enough and
    # cannot delete anything that was not moved).
    try:
        source_dir.rmdir()
    except OSError as e:
        print(f"⚠️ Could not remove {source_dir}: {e}")
        return False

    # Step 2: Create symlink. Use an absolute target: a relative one would be
    # interpreted relative to the link's own directory.
    try:
        source_dir.symlink_to(target_dir.resolve(), target_is_directory=True)
        print(f"✅ Created symbolic link: {source_dir} -> {target_dir}")
        return True
    except OSError as e:
        print(f"❌ Error creating link: {e}")
        return False
def _relocate_data(project_data: Path, storage_data: Path, dry_run: bool) -> bool:
    """Move the contents of data/ into storage and swap data/ for a symlink.

    ``__init__.py`` is copied to storage first (unless storage already has
    one); ``__pycache__`` is left behind because it is regenerated on import.
    Every other entry is moved. The project directory is removed and replaced
    by a symlink only when at least one entry was moved and no entry had to
    be left behind because of a name clash in storage.

    Args:
        project_data: The project's data directory (or link).
        storage_data: The data directory inside the storage root.
        dry_run: If True, only print what would be done.

    Returns:
        True on success (including "nothing to do"), False if entries were
        left behind because they already exist in storage.

    Raises:
        OSError: If a filesystem operation fails.
    """
    # is_symlink() must be checked before exists(): exists() follows links and
    # reports False for a dangling one, which would send us into the
    # "create symlink" branch and fail with FileExistsError.
    if project_data.is_symlink():
        print(f" data/ is already a symlink: {os.readlink(project_data)}")
        return True

    if not project_data.exists():
        print(" data/ directory doesn't exist, creating symlink...")
        if not dry_run:
            project_data.symlink_to(storage_data)
            print(f" ✅ Created data/ symlink -> {storage_data}")
        return True

    print(" Moving data files to storage (keeping __init__.py)...")

    # Copy __init__.py to storage first
    init_file = project_data / "__init__.py"
    if init_file.exists():
        storage_init = storage_data / "__init__.py"
        if storage_init.exists():
            print(" __init__.py already exists in storage")
        elif dry_run:
            print(f" [DRY RUN] Would copy: __init__.py -> {storage_init}")
        else:
            # Ensure storage directory exists
            storage_data.mkdir(parents=True, exist_ok=True)
            shutil.copy2(init_file, storage_init)
            print(" ✅ Copied __init__.py to storage")

    if not dry_run:
        storage_data.mkdir(parents=True, exist_ok=True)

    # Move all other files. The listing is snapshotted because entries are
    # removed from the directory while we walk it.
    moved_files = []
    left_behind = []
    for item in list(project_data.iterdir()):
        if item.name in ("__init__.py", "__pycache__"):
            continue
        target_item = storage_data / item.name
        if dry_run:
            print(f" [DRY RUN] Would move: {item.name} -> {target_item}")
            moved_files.append(item.name)
        elif os.path.lexists(target_item):
            print(f" ⚠️ Already exists: {item.name}")
            left_behind.append(item.name)
        else:
            # Rename when possible, copy-then-delete across filesystems.
            shutil.move(str(item), str(target_item))
            moved_files.append(item.name)
            print(f" ✅ Moved: {item.name}")

    if dry_run:
        return True

    if left_behind:
        # Deleting data/ now would destroy the project-side copies of these
        # entries, so keep the directory and report the step as failed.
        print(f" ⚠️ Keeping data/ in place: {len(left_behind)} item(s) already exist in storage")
        return False

    # Replace data/ with symlink
    if moved_files:
        shutil.rmtree(project_data)
        project_data.symlink_to(storage_data)
        print(f" ✅ Replaced data/ with symlink -> {storage_data}")
    return True


def main():
    """Parse the command line and relocate data/ and checkpoint directories.

    Returns:
        Process exit status: 0 on success or after a dry run, 1 if the
        storage structure could not be created or any relocation step failed.
    """
    parser = argparse.ArgumentParser(
        description='Move large files to external storage and create symbolic links',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Dry run (show what would be done)
python3 setup_storage.py --storage /mnt/storage/sheepOp --dry-run
# Move data and checkpoints to storage
python3 setup_storage.py --storage /mnt/storage/sheepOp
# Only move data, not checkpoints
python3 setup_storage.py --storage /mnt/storage/sheepOp --skip-checkpoints
"""
    )
    parser.add_argument(
        '--storage',
        type=str,
        default='/mnt/storage/sheepOp',
        help='Storage root directory (default: /mnt/storage/sheepOp)'
    )
    parser.add_argument(
        '--project-root',
        type=str,
        default='.',
        help='Project root directory (default: current directory)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be done without actually doing it'
    )
    parser.add_argument(
        '--skip-checkpoints',
        action='store_true',
        help='Skip moving checkpoints (only move data)'
    )
    parser.add_argument(
        '--skip-data',
        action='store_true',
        help='Skip moving data (only move checkpoints)'
    )
    args = parser.parse_args()

    project_root = Path(args.project_root).resolve()
    storage_root = Path(args.storage).resolve()

    print("🚀 Setting up storage links")
    print(f" Project root: {project_root}")
    print(f" Storage root: {storage_root}")
    print(f" Dry run: {args.dry_run}\n")

    # Create storage structure (always create, even in dry-run, to check permissions)
    try:
        create_storage_structure(args.storage)
    except OSError as e:
        if args.dry_run:
            print(f"⚠️ Could not create storage structure (will be created during actual run): {e}")
        else:
            print(f"❌ Error creating storage structure: {e}")
            return 1

    success = True

    # Move data
    if not args.skip_data:
        print("\n📁 Processing data directory...")
        try:
            data_ok = _relocate_data(
                project_root / "data",
                storage_root / "data",
                args.dry_run,
            )
        except OSError as e:  # shutil.Error is a subclass of OSError
            print(f" ❌ Error processing data directory: {e}")
            data_ok = False
        success = data_ok and success

    # Move checkpoints
    if not args.skip_checkpoints:
        print("\n💾 Processing checkpoints...")
        for name in ("checkpoints", "checkpoints_test"):
            project_dir = project_root / name
            # An existing link means a previous run already relocated this
            # directory; do not hand it to move_and_link again.
            if project_dir.is_symlink():
                print(f" {name}/ is already a symlink: {os.readlink(project_dir)}")
                continue
            if project_dir.exists():
                print(f" Moving {name} to storage...")
                success = move_and_link(
                    project_dir,
                    storage_root / name,
                    name,
                    args.dry_run
                ) and success

    if args.dry_run:
        print("\n✅ Dry run complete. Use without --dry-run to execute.")
        return 0

    if not success:
        print("\n❌ Some operations failed. Please check the errors above.")
        return 1

    print("\n✅ Storage setup complete!")
    print("\n📋 Next steps:")
    print(f" 1. Your data files are now in: {storage_root}/data/")
    print(f" 2. Your checkpoints will be saved to: {storage_root}/checkpoints/")
    print(" 3. Links are created in your project directory")
    print(" 4. Training will automatically use the storage location")
    return 0
if __name__ == '__main__':
    # The bare ``exit`` helper is injected by the ``site`` module for
    # interactive use and is absent under ``python -S`` or in frozen builds;
    # raising SystemExit is always available and sets the same exit status.
    raise SystemExit(main())