- Complete transformer implementation from scratch - Training pipeline with gradient accumulation and mixed precision - Optimized inference with KV caching - Multi-format data processing (PDFs, images, code, text) - Comprehensive documentation - Apache 2.0 license - Example training plots included in docs/images/
292 lines
10 KiB
Python
Executable File
292 lines
10 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
"""
|
||
Move large files to external storage and create symbolic links.
|
||
Helps manage large datasets and checkpoints on systems with limited space.
|
||
"""
|
||
import os
|
||
import shutil
|
||
import subprocess
|
||
from pathlib import Path
|
||
import argparse
|
||
|
||
|
||
def create_storage_structure(
    storage_root: str,
    subdirs: tuple = ("data", "checkpoints", "checkpoints_test"),
) -> Path:
    """Create the storage root and the subdirectories that live under it.

    Args:
        storage_root: Directory (string or path-like) that will hold the
            relocated files. It is created together with any missing
            parent directories.
        subdirs: Names of the directories to create beneath
            ``storage_root``. The default is the three directories this
            script relocates. Nested names such as ``"a/b"`` are allowed.

    Returns:
        ``storage_root`` as a ``Path``.

    Raises:
        OSError: If any directory cannot be created (for example because of
            missing permissions). Existing directories are not an error.
    """
    storage_path = Path(storage_root)
    storage_path.mkdir(parents=True, exist_ok=True)

    for name in subdirs:
        # parents=True so that a nested entry does not depend on the order
        # in which the names were listed.
        (storage_path / name).mkdir(parents=True, exist_ok=True)

    print(f"✅ Created storage structure at: {storage_path}")
    return storage_path
|
||
|
||
|
||
def _is_support_entry(entry: Path) -> bool:
    """Return True for Python module files, dunder entries and hidden entries."""
    return entry.suffix == '.py' or entry.name.startswith(('__', '.'))


def _same_file_content(item: Path, target_item: Path) -> bool:
    """Return True if both paths are regular files with identical bytes."""
    import filecmp

    if item.is_symlink() or not item.is_file() or not target_item.is_file():
        return False
    try:
        return filecmp.cmp(item, target_item, shallow=False)
    except OSError:
        return False


def _move_entry(item: Path, target_item: Path) -> None:
    """Copy one entry to storage, then delete the original."""
    try:
        if item.is_dir():
            shutil.copytree(item, target_item)
        else:
            shutil.copy2(item, target_item)
    except OSError:
        # The caller verified target_item did not exist, so whatever is there
        # now is our own partial copy. Drop it so that a later run does not
        # mistake it for a conflicting item.
        if target_item.is_dir() and not target_item.is_symlink():
            shutil.rmtree(target_item, ignore_errors=True)
        elif os.path.lexists(target_item):
            try:
                target_item.unlink()
            except OSError:
                pass  # best-effort cleanup; the original error is re-raised
        raise

    # The source is deleted only after the copy has fully succeeded.
    if item.is_dir() and not item.is_symlink():
        shutil.rmtree(item)
    else:
        item.unlink()


def move_and_link(source_dir: Path, target_dir: Path, link_name: str, dry_run: bool = False):
    """
    Move directory contents to storage and create symbolic link.

    Every entry of ``source_dir`` is copied into ``target_dir`` and then
    deleted from the project. Once ``source_dir`` is empty it is replaced by
    a symbolic link to ``target_dir``, so Python module files and hidden
    files stay reachable through the link. Large items are moved first and
    Python/dunder/hidden entries last, so an error while moving large items
    leaves the module files untouched in the project.

    An entry whose name already exists in storage is never overwritten. If
    it is a regular file with identical content the project copy is dropped;
    otherwise it is left in place, the directory is NOT replaced by a link,
    and the function returns False so that nothing is lost.

    Args:
        source_dir: Source directory in project
        target_dir: Target directory in storage (created if missing)
        link_name: Name for the symbolic link (same as source_dir name).
            Accepted for backward compatibility; the link is always created
            at ``source_dir``.
        dry_run: If True, only show what would be done

    Returns:
        True if ``source_dir`` is (or, in a dry run, would be) a link to
        storage; False if the source is missing or not a directory, an entry
        could not be moved, conflicting entries remain, or the link could
        not be created.
    """
    source_dir = Path(source_dir)
    target_dir = Path(target_dir)

    # Checked before exists(): rmtree/rmdir cannot operate on a symlink, so a
    # second run on an already-linked directory must be a no-op.
    if source_dir.is_symlink():
        print(f"ℹ️ {source_dir} is already a symlink: {os.readlink(source_dir)}")
        return True

    if not source_dir.exists():
        print(f"⚠️ Source directory doesn't exist: {source_dir}")
        return False

    if not source_dir.is_dir():
        print(f"⚠️ Source is not a directory: {source_dir}")
        return False

    if dry_run:
        print(f"\n[DRY RUN] Would move contents from {source_dir} to {target_dir}")
        print(f" Would replace {source_dir} with symlink -> {target_dir}")
        return True

    try:
        target_dir.mkdir(parents=True, exist_ok=True)
        # Snapshot the listing: entries are deleted while we walk it.
        entries = sorted(
            source_dir.iterdir(),
            key=lambda p: (_is_support_entry(p), p.name),
        )
    except OSError as e:
        print(f"❌ Error preparing {target_dir}: {e}")
        return False

    conflicts = []
    for item in entries:
        target_item = target_dir / item.name

        if os.path.lexists(target_item):
            if _same_file_content(item, target_item):
                # Left over from an interrupted run: storage already holds
                # this exact file, so the project copy is redundant.
                try:
                    item.unlink()
                except OSError as e:
                    print(f"❌ Error moving {item.name}: {e}")
                    return False
                continue
            print(f"⚠️ Skipping {item.name} (already exists in storage)")
            conflicts.append(item.name)
            continue

        if not _is_support_entry(item):
            print(f"📦 Moving {item.name}...")
        try:
            _move_entry(item, target_item)
        except OSError as e:
            print(f"❌ Error moving {item.name}: {e}")
            return False

    if conflicts:
        print(
            f"⚠️ Not replacing {source_dir} with a symlink; "
            f"resolve these conflicting items first: {', '.join(conflicts)}"
        )
        return False

    # Replace source directory with symlink.
    # Step 1: remove the original directory. rmdir() only succeeds on an
    # empty directory, so nothing that failed to reach storage can be lost.
    try:
        source_dir.rmdir()
    except OSError as e:
        print(f"⚠️ Could not remove {source_dir}: {e}")
        return False

    # Step 2: create the symlink. The target is made absolute because a
    # relative link target is interpreted relative to the link's directory.
    try:
        source_dir.symlink_to(target_dir.resolve(), target_is_directory=True)
    except (OSError, NotImplementedError) as e:
        print(f"❌ Error creating link: {e}")
        return False

    print(f"✅ Created symbolic link: {source_dir} -> {target_dir}")
    return True
|
||
|
||
|
||
def _relocate_data_dir(project_data: Path, storage_data: Path, dry_run: bool) -> bool:
    """Move the contents of the project's data/ directory to storage and link it.

    ``__init__.py`` is copied to storage first (unless storage already has
    one). Every other entry except ``__pycache__`` is copied to storage and
    then deleted from the project. If at least one entry was moved and no
    entry was left behind, data/ is replaced by a symlink to ``storage_data``.

    Returns:
        True on success (including "nothing to do" and dry runs); False if
        an I/O error occurred or if entries were left in the project because
        storage already holds something with the same name. In the latter
        case data/ is left untouched so those entries are not deleted.
    """
    # is_symlink() must come before exists(): exists() follows the link, so a
    # dangling link would look like "missing" and symlink_to() would then
    # fail with FileExistsError.
    if project_data.is_symlink():
        print(f" ℹ️ data/ is already a symlink: {os.readlink(project_data)}")
        return True

    if not project_data.exists():
        print(" ℹ️ data/ directory doesn't exist, creating symlink...")
        if not dry_run:
            try:
                project_data.symlink_to(storage_data)
            except OSError as e:
                print(f" ❌ Error creating data/ symlink: {e}")
                return False
            print(f" ✅ Created data/ symlink -> {storage_data}")
        return True

    print(" Moving data files to storage (keeping __init__.py)...")

    moved_files = []
    conflicts = []
    try:
        if not dry_run:
            # Ensure storage directory exists
            storage_data.mkdir(parents=True, exist_ok=True)

        # Copy __init__.py to storage first so the package stays importable
        # through the symlink.
        init_file = project_data / "__init__.py"
        if init_file.exists():
            storage_init = storage_data / "__init__.py"
            if storage_init.exists():
                print(" ℹ️ __init__.py already exists in storage")
            elif dry_run:
                print(f" [DRY RUN] Would copy: __init__.py -> {storage_init}")
            else:
                shutil.copy2(init_file, storage_init)
                print(" ✅ Copied __init__.py to storage")

        # Move all other files. The listing is snapshotted because entries
        # are deleted while we walk it.
        for item in sorted(project_data.iterdir()):
            # __init__.py was handled above; __pycache__ is regenerated by
            # the interpreter and is discarded together with the directory.
            if item.name in ("__init__.py", "__pycache__"):
                continue

            target_item = storage_data / item.name
            if dry_run:
                print(f" [DRY RUN] Would move: {item.name} -> {target_item}")
                moved_files.append(item.name)
                continue

            if target_item.exists():
                print(f" ⚠️ Already exists: {item.name}")
                conflicts.append(item.name)
                continue

            if item.is_dir():
                shutil.copytree(item, target_item)
                shutil.rmtree(item)
            else:
                shutil.copy2(item, target_item)
                item.unlink()
            moved_files.append(item.name)
            print(f" ✅ Moved: {item.name}")
    except OSError as e:
        print(f" ❌ Error moving data files: {e}")
        return False

    if dry_run:
        return True

    if conflicts:
        # Removing data/ now would delete the project's copies of these
        # items even though they were never transferred.
        print(
            " ⚠️ Leaving data/ in place; these items were not moved because "
            f"storage already has them: {', '.join(conflicts)}"
        )
        return False

    # Replace data/ with symlink
    if moved_files:
        try:
            shutil.rmtree(project_data)
            project_data.symlink_to(storage_data)
        except OSError as e:
            print(f" ❌ Error replacing data/ with symlink: {e}")
            return False
        print(f" ✅ Replaced data/ with symlink -> {storage_data}")

    return True


def main() -> int:
    """Command-line entry point.

    Parses the arguments, creates the storage directory structure, relocates
    ``data/`` (unless ``--skip-data``) and ``checkpoints/`` plus
    ``checkpoints_test/`` (unless ``--skip-checkpoints``) to storage, and
    leaves symlinks in the project.

    Returns:
        Process exit code: 0 on success or after a dry run, 1 if the storage
        structure could not be created (outside a dry run) or any relocation
        step failed.
    """
    parser = argparse.ArgumentParser(
        description='Move large files to external storage and create symbolic links',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Dry run (show what would be done)
  python3 setup_storage.py --storage /mnt/storage/sheepOp --dry-run

  # Move data and checkpoints to storage
  python3 setup_storage.py --storage /mnt/storage/sheepOp

  # Only move data, not checkpoints
  python3 setup_storage.py --storage /mnt/storage/sheepOp --skip-checkpoints
"""
    )

    parser.add_argument(
        '--storage',
        type=str,
        default='/mnt/storage/sheepOp',
        help='Storage root directory (default: /mnt/storage/sheepOp)'
    )

    parser.add_argument(
        '--project-root',
        type=str,
        default='.',
        help='Project root directory (default: current directory)'
    )

    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be done without actually doing it'
    )

    parser.add_argument(
        '--skip-checkpoints',
        action='store_true',
        help='Skip moving checkpoints (only move data)'
    )

    parser.add_argument(
        '--skip-data',
        action='store_true',
        help='Skip moving data (only move checkpoints)'
    )

    args = parser.parse_args()

    project_root = Path(args.project_root).resolve()
    storage_root = Path(args.storage).resolve()

    print("🚀 Setting up storage links")
    print(f" Project root: {project_root}")
    print(f" Storage root: {storage_root}")
    print(f" Dry run: {args.dry_run}\n")

    # Create storage structure (always create, even in dry-run, to check permissions)
    try:
        create_storage_structure(args.storage)
    except Exception as e:
        if args.dry_run:
            print(f"⚠️ Could not create storage structure (will be created during actual run): {e}")
        else:
            print(f"❌ Error creating storage structure: {e}")
            return 1

    success = True

    # Move data
    if not args.skip_data:
        print("\n📁 Processing data directory...")
        success = _relocate_data_dir(
            project_root / "data",
            storage_root / "data",
            args.dry_run,
        ) and success

    # Move checkpoints
    if not args.skip_checkpoints:
        print("\n💾 Processing checkpoints...")

        for name in ("checkpoints", "checkpoints_test"):
            project_dir = project_root / name
            if project_dir.is_symlink():
                # Already linked by an earlier run; same handling as data/.
                print(f" ℹ️ {name}/ is already a symlink: {os.readlink(project_dir)}")
            elif project_dir.exists():
                print(f" Moving {name} to storage...")
                success = move_and_link(
                    project_dir,
                    storage_root / name,
                    name,
                    args.dry_run
                ) and success

    if args.dry_run:
        print("\n✅ Dry run complete. Use without --dry-run to execute.")
        return 0

    if not success:
        print("\n❌ Some operations failed. Please check the errors above.")
        return 1

    print("\n✅ Storage setup complete!")
    print("\n📋 Next steps:")
    print(f" 1. Your data files are now in: {storage_root}/data/")
    print(f" 2. Your checkpoints will be saved to: {storage_root}/checkpoints/")
    print(" 3. Links are created in your project directory")
    print(" 4. Training will automatically use the storage location")
    return 0
|
||
|
||
|
||
if __name__ == '__main__':
    # The bare `exit` helper is installed by the `site` module for
    # interactive sessions and is absent under `python -S`; raising
    # SystemExit is always available and propagates main()'s return code.
    raise SystemExit(main())
|
||
|