#!/usr/bin/env python3
"""
Download GitHub repositories with open licenses for code training.

Uses the GitHub API to find and clone repositories automatically.
Includes support for Neovim, Lua, Bash, and ethical hacking repos.
"""

import argparse
import json
import os
import shutil
import subprocess
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from pathlib import Path
from typing import Dict, List, Optional, Tuple

try:
    from tqdm import tqdm
except ImportError:
    class tqdm:  # type: ignore[no-redef]
        """Silent stand-in exposing the small part of the tqdm API used here.

        Lets the downloader run (without progress bars) when tqdm is not
        installed.
        """

        def __init__(self, iterable=None, **_options):
            self._iterable = iterable

        def __iter__(self):
            return iter(() if self._iterable is None else self._iterable)

        def _noop(self, *_args, **_kwargs):
            """Accept and ignore any progress update."""

        set_description = set_postfix = update = refresh = close = _noop


# GitHub Search API endpoint and limits.
GITHUB_SEARCH_URL = "https://api.github.com/search/repositories"
GITHUB_PAGE_SIZE = 100             # GitHub max is 100 per page
GITHUB_MAX_SEARCH_RESULTS = 1000   # The Search API never returns more than this
GITHUB_REQUEST_TIMEOUT = 30        # Seconds to wait for one API response

CLONE_TIMEOUT_SECONDS = 300        # 5 minute timeout per clone
CLONE_DELAY_SECONDS = 0.5          # Small pause between clones (rate limiting)
SIZE_CHECK_INTERVAL = 5            # Re-measure the output directory every N repos


def get_directory_size(directory: Path) -> int:
    """Get total size of directory in bytes.

    Best effort: files that cannot be stat'ed are skipped, and if the
    traversal itself fails the sum collected so far is returned.
    """
    total = 0
    try:
        for entry in directory.rglob('*'):
            try:
                if entry.is_file():
                    total += entry.stat().st_size
            except OSError:
                # Unreadable or vanished file: ignore it and keep counting.
                continue
    except OSError:
        # Traversal failed (e.g. permission denied on a sub-directory).
        pass
    return total


def format_size(size_bytes: int) -> str:
    """Format bytes to human-readable size (e.g. 1536 -> '1.50 KB')."""
    size = float(size_bytes)
    for unit in ('B', 'KB', 'MB', 'GB', 'TB'):
        if size < 1024.0:
            return f"{size:.2f} {unit}"
        size /= 1024.0
    return f"{size:.2f} PB"


# Open source licenses (permissive and commonly used)
OPEN_LICENSES = [
    'mit',
    'apache-2.0',
    'bsd-3-clause',
    'bsd-2-clause',
    'isc',
    'unlicense',
    'mpl-2.0',
    'lgpl-2.1',
    'lgpl-3.0',
    'gpl-2.0',
    'gpl-3.0',
]

# Popular programming languages
POPULAR_LANGUAGES = [
    'python',
    'javascript',
    'typescript',
    'java',
    'cpp',
    'c',
    'go',
    'rust',
    'ruby',
    'php',
    'swift',
    'kotlin',
    'scala',
    'r',
    'sql',
    'lua',
    'shell',  # For bash/shell scripts
]

# Predefined repository categories
REPO_CATEGORIES = {
    'nvim': {
        'query': 'neovim OR nvim-config OR neovim-config',
        'language': None,
        'description': 'Neovim configuration and plugins'
    },
    'lua': {
        'query': None,
        'language': 'lua',
        'description': 'Lua programming language repositories'
    },
    'bash': {
        'query': None,
        'language': 'shell',
        'description': 'Bash/shell script repositories'
    },
    'zsh': {
        'query': 'zsh-config OR oh-my-zsh OR zsh-plugin',
        'language': None,
        'description': 'Zsh configuration and plugins'
    },
    'python': {
        'query': None,
        'language': 'python',
        'description': 'Python programming repositories'
    },
    'hacking': {
        'query': 'ethical-hacking OR cybersecurity OR penetration-testing OR security-tools OR red-team',
        'language': None,
        'description': 'Ethical hacking and cybersecurity tools'
    },
    'security': {
        'query': 'security-tools OR cybersecurity OR penetration-testing OR red-team OR blue-team',
        'language': None,
        'description': 'Security and cybersecurity repositories'
    },
    'all-open': {
        'query': None,
        'language': None,
        'description': 'All repositories with open licenses (any language)'
    },
}


def _fetch_search_page(
    search_query: str,
    sort: str,
    order: str,
    per_page: int,
    page: int,
) -> List[dict]:
    """Fetch one page of GitHub repository search results.

    Raises whatever urllib / json raise; the caller decides how to report it.
    """
    params = {
        'q': search_query,
        'sort': sort,
        'order': order,
        'per_page': per_page,
        'page': page,
    }
    url = f"{GITHUB_SEARCH_URL}?{urllib.parse.urlencode(params)}"

    req = urllib.request.Request(url)
    req.add_header('Accept', 'application/vnd.github.v3+json')
    req.add_header('User-Agent', 'SheepOp-Repo-Downloader')

    # Add GitHub token if available
    github_token = os.environ.get('GITHUB_TOKEN')
    if github_token:
        req.add_header('Authorization', f'token {github_token}')

    # A timeout keeps a stalled connection from hanging the whole run.
    with urllib.request.urlopen(req, timeout=GITHUB_REQUEST_TIMEOUT) as response:
        data = json.loads(response.read().decode())
    return data.get('items', [])


def search_github_repos(
    language: Optional[str] = None,
    license: Optional[str] = None,
    query: Optional[str] = None,
    min_stars: int = 100,
    max_repos: int = 100,
    sort: str = 'stars',
    order: str = 'desc'
) -> List[dict]:
    """
    Search GitHub for repositories matching criteria.

    Results are fetched page by page (100 per page) until ``max_repos``
    repositories are collected, the results run out, or the Search API cap
    of 1000 results is reached. Errors are reported on stdout and never
    raised; repositories collected before the error are still returned.

    Args:
        language: Programming language (e.g., 'python', 'javascript').
            Ignored when ``query`` is given.
        license: License type (e.g., 'mit', 'apache-2.0')
        query: Custom search query
        min_stars: Minimum number of stars
        max_repos: Maximum number of repos to return
        sort: Sort by ('stars', 'updated', 'created')
        order: Order ('desc' or 'asc')

    Returns:
        List of repository dictionaries (empty if nothing was found or the
        first request failed)
    """
    # Build query
    query_parts = []
    if query:
        # Custom query (for categories like nvim, hacking)
        query_parts.append(query)
    elif language:
        # Standard language-based query
        query_parts.append(f"language:{language}")
    if license:
        query_parts.append(f"license:{license}")
    query_parts.append(f"stars:>={min_stars}")
    search_query = " ".join(query_parts)

    print("🔍 Searching GitHub for repositories...")
    print(f" Query: {search_query}")
    print(f" Max repos: {max_repos}")

    wanted = min(max_repos, GITHUB_MAX_SEARCH_RESULTS)
    if wanted <= 0:
        print("✅ Found 0 repositories")
        return []

    # per_page must stay constant across pages so page offsets line up.
    per_page = min(GITHUB_PAGE_SIZE, wanted)
    page_count = -(-wanted // per_page)  # ceiling division

    repos: List[dict] = []
    request_failed = False
    try:
        for page in range(1, page_count + 1):
            items = _fetch_search_page(search_query, sort, order, per_page, page)
            repos.extend(items)
            if len(items) < per_page:
                break  # Short page: no more results available
    except urllib.error.HTTPError as e:
        request_failed = True
        if e.code in (403, 429):
            print("❌ Rate limit exceeded. Please wait a few minutes or use a GitHub token.")
            print(" To use a token, set GITHUB_TOKEN environment variable:")
            print(" export GITHUB_TOKEN=your_token_here")
        else:
            print(f"❌ Error searching GitHub: {e}")
            if e.code == 422:
                print(" Tip: Try adjusting your search query or reducing max-repos")
    except Exception as e:
        # Boundary handler: searching is best effort and must not abort the
        # run, but the failure is always reported.
        request_failed = True
        print(f"❌ Error: {e}")

    repos = repos[:wanted]
    if repos or not request_failed:
        print(f"✅ Found {len(repos)} repositories")
    return repos


def clone_repo(repo_url: str, output_dir: Path, depth: Optional[int] = None) -> bool:
    """
    Clone a repository.

    Args:
        repo_url: Repository URL (https://github.com/user/repo.git)
        output_dir: Directory to clone into
        depth: Shallow clone depth (None = full clone)

    Returns:
        True if the repository was cloned or its target directory already
        exists; False if the clone failed, timed out, or no repository name
        could be derived from the URL
    """
    repo_name = repo_url.rstrip('/').split('/')[-1]
    # Strip only a trailing '.git'; names such as 'user.github.io' contain
    # '.git' in the middle and must be left intact.
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]
    if not repo_name:
        return False

    target_dir = output_dir / repo_name

    # Skip if already exists
    if target_dir.exists():
        return True  # Silent skip (progress bar will show it)

    cmd = ['git', 'clone', '--quiet']  # Quiet mode for cleaner output
    if depth:
        cmd.extend(['--depth', str(depth)])
    cmd.append(repo_url)
    cmd.append(str(target_dir))

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=CLONE_TIMEOUT_SECONDS,
        )
        succeeded = result.returncode == 0
    except (OSError, subprocess.SubprocessError):
        # git missing, not executable, or the clone timed out.
        succeeded = False

    if not succeeded:
        # A killed clone can leave a partial checkout behind; remove it so a
        # later run does not mistake it for a finished clone and skip it.
        shutil.rmtree(target_dir, ignore_errors=True)
    return succeeded


def _clone_batch(
    repos: List[dict],
    output_dir: Path,
    label: str,
    shallow: bool,
    max_size_bytes: Optional[int],
    show_language: bool = False,
    cloned_offset: int = 0,
    failed_offset: int = 0,
) -> Tuple[int, int]:
    """Clone every repo in ``repos`` with a progress bar and size-limit check.

    Args:
        repos: Repository dictionaries as returned by search_github_repos
        output_dir: Directory to clone repositories into
        label: Category or language name shown in the progress bar
        shallow: Use shallow clone (depth 1)
        max_size_bytes: Stop once the output directory reaches this size
        show_language: Include the repository language in the progress bar
        cloned_offset: Added to the 'Cloned' counter that is displayed
        failed_offset: Added to the 'Failed' counter that is displayed

    Returns:
        (cloned_count, failed_count) for this batch only
    """
    pbar = tqdm(
        total=len(repos),
        desc=f"Cloning {label}",
        unit="repo",
        ncols=100,
        mininterval=0.1,
        maxinterval=1.0,
        file=sys.stderr,     # Write to stderr to avoid buffering issues
        dynamic_ncols=True,  # Auto-adjust to terminal width
        disable=False,       # Explicitly enable
    )

    cloned = 0
    failed = 0
    # Cache size to avoid recalculating every iteration
    cached_size = get_directory_size(output_dir) if max_size_bytes else 0
    repos_since_size_check = 0

    try:
        for repo in repos:
            # Check size limit every few repos (to avoid blocking progress bar)
            if max_size_bytes:
                repos_since_size_check += 1
                if repos_since_size_check >= SIZE_CHECK_INTERVAL:
                    cached_size = get_directory_size(output_dir)
                    repos_since_size_check = 0
                if cached_size >= max_size_bytes:
                    pbar.close()
                    print(f"\n⚠️ Size limit reached: {format_size(cached_size)} >= {format_size(max_size_bytes)}")
                    print(f" Stopping downloads for {label}.")
                    break

            stars = repo['stargazers_count']
            postfix = {
                'Current': repo['full_name'].split('/')[-1][:20],
                'Stars': f"{stars:,}",
            }
            if show_language:
                # GitHub reports "language": null for repos without a
                # detected language, so .get()'s default would not apply.
                postfix['Lang'] = (repo.get('language') or 'N/A')[:8]
            postfix['Cloned'] = cloned_offset + cloned
            postfix['Failed'] = failed_offset + failed
            postfix['Size'] = format_size(cached_size) if max_size_bytes else 'N/A'

            # Update progress bar before clone
            pbar.set_postfix(postfix)

            if clone_repo(repo['clone_url'], output_dir, depth=1 if shallow else None):
                cloned += 1
            else:
                failed += 1

            # Update progress bar after clone (advance by 1)
            pbar.update(1)
            pbar.refresh()      # Force immediate refresh
            sys.stderr.flush()  # Force flush stderr to ensure progress bar displays

            # Rate limiting: small delay between clones
            time.sleep(CLONE_DELAY_SECONDS)
    finally:
        pbar.close()

    return cloned, failed


def download_category(
    category: str,
    output_dir: Path,
    license: Optional[str] = None,
    min_stars: int = 100,
    max_repos: int = 50,
    shallow: bool = True,
    max_size_bytes: Optional[int] = None,
) -> tuple:
    """
    Download repositories for a specific category.

    Args:
        category: Key of REPO_CATEGORIES
        output_dir: Directory to clone repositories into
        license: License to filter by; defaults to 'mit', except for the
            'all-open' category where no license means no license filter
        min_stars: Minimum stars
        max_repos: Maximum repos to download
        shallow: Use shallow clone (faster, less history)
        max_size_bytes: Stop once the output directory reaches this size

    Returns:
        (cloned_count, failed_count)
    """
    if category not in REPO_CATEGORIES:
        print(f"❌ Unknown category: {category}")
        return 0, 0

    cat_info = REPO_CATEGORIES[category]
    print(f"\n📦 Downloading {category} repositories...")
    print(f" {cat_info['description']}")

    # For 'all-open' category, don't filter by license unless explicitly specified
    if category == 'all-open' and not license:
        search_license = None
    else:
        search_license = license or 'mit'

    repos = search_github_repos(
        language=cat_info['language'],
        license=search_license,
        query=cat_info['query'],
        min_stars=min_stars,
        max_repos=max_repos,
    )

    if not repos:
        print(f" No repositories found for {category}")
        return 0, 0

    print(f" Cloning {len(repos)} repositories...")
    return _clone_batch(
        repos,
        output_dir,
        category,
        shallow,
        max_size_bytes,
        show_language=True,
    )


def download_repos(
    output_dir: str = "data/repos",
    language: Optional[str] = None,
    license: Optional[str] = None,
    min_stars: int = 100,
    max_repos: int = 50,
    shallow: bool = True,
    languages: Optional[List[str]] = None,
    categories: Optional[List[str]] = None,
    max_size_gb: Optional[float] = None,
) -> bool:
    """
    Download repositories from GitHub.

    Args:
        output_dir: Directory to clone repositories into
        language: Single language to filter by (used only when ``languages``
            is not given)
        license: License type to filter by (None = 'mit', or no filter for
            the 'all-open' category)
        min_stars: Minimum stars
        max_repos: Maximum repos to download per category/language
        shallow: Use shallow clone (faster, less history)
        languages: List of languages to download
        categories: List of categories to download (nvim, lua, bash, zsh,
            python, hacking, security, all-open)
        max_size_gb: Maximum total size in GB (stops downloading when reached)

    Returns:
        True if at least one repository was cloned (or already present)
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Convert GB to bytes
    max_size_bytes = int(max_size_gb * 1024**3) if max_size_gb else None

    if max_size_bytes:
        current_size = get_directory_size(output_path)
        print(f"📊 Current directory size: {format_size(current_size)}")
        if current_size >= max_size_bytes:
            print(f"⚠️ Already at size limit: {format_size(current_size)} >= {format_size(max_size_bytes)}")
            return False
        print(f"📊 Size limit: {format_size(max_size_bytes)}")

    total_cloned = 0
    total_failed = 0

    # Download by categories
    if categories:
        print(f"\n📦 Processing {len(categories)} categories...")

        # Overall progress bar for categories
        cat_pbar = tqdm(
            categories,
            desc="Categories",
            unit="category",
            ncols=100,
            position=0,
            leave=True,
            mininterval=0.1,
            maxinterval=1.0,
            file=sys.stderr,     # Write to stderr to avoid buffering issues
            dynamic_ncols=True,  # Auto-adjust to terminal width
            disable=False,       # Explicitly enable
        )

        for category in cat_pbar:
            # Check size limit before processing category
            current_size = get_directory_size(output_path) if max_size_bytes else 0
            if max_size_bytes and current_size >= max_size_bytes:
                cat_pbar.close()
                print(f"\n⚠️ Size limit reached: {format_size(current_size)} >= {format_size(max_size_bytes)}")
                print(" Stopping all downloads.")
                break

            cat_pbar.set_description(f"Category: {category}")
            cat_pbar.set_postfix({
                'Total Cloned': total_cloned,
                'Total Failed': total_failed,
                'Size': format_size(current_size) if max_size_bytes else 'N/A'
            })
            cat_pbar.refresh()  # Force refresh

            cloned, failed = download_category(
                category=category,
                output_dir=output_path,
                license=license,
                min_stars=min_stars,
                max_repos=max_repos,
                shallow=shallow,
                max_size_bytes=max_size_bytes,
            )
            total_cloned += cloned
            total_failed += failed

        cat_pbar.close()

    # Download by languages
    languages_to_process = languages or ([language] if language else [])

    for lang in languages_to_process:
        # Check size limit
        if max_size_bytes:
            current_size = get_directory_size(output_path)
            if current_size >= max_size_bytes:
                print(f"\n⚠️ Size limit reached: {format_size(current_size)} >= {format_size(max_size_bytes)}")
                break

        print(f"\n📦 Processing {lang} repositories...")

        repos = search_github_repos(
            language=lang,
            license=license or 'mit',
            min_stars=min_stars,
            max_repos=max_repos,
        )

        if not repos:
            print(f" No repositories found for {lang}")
            continue

        print(f" Cloning {len(repos)} repositories...")

        # The offsets keep the progress bar showing running totals.
        cloned, failed = _clone_batch(
            repos,
            output_path,
            lang,
            shallow,
            max_size_bytes,
            cloned_offset=total_cloned,
            failed_offset=total_failed,
        )
        total_cloned += cloned
        total_failed += failed

    final_size = get_directory_size(output_path) if max_size_bytes else 0
    print("\n✅ Download complete!")
    print(f" Cloned: {total_cloned}")
    print(f" Failed: {total_failed}")
    if max_size_bytes:
        print(f" Total size: {format_size(final_size)} / {format_size(max_size_bytes)}")
    print(f" Location: {output_path.absolute()}")

    return total_cloned > 0


def main():
    """Parse command-line arguments and run the downloader."""
    parser = argparse.ArgumentParser(
        description='Download GitHub repositories with open licenses for code training',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Download Neovim configs
  python3 download_repos.py --categories nvim --max-repos 100

  # Download Lua repos
  python3 download_repos.py --categories lua --max-repos 50

  # Download Bash scripts
  python3 download_repos.py --categories bash --max-repos 50

  # Download ethical hacking repos
  python3 download_repos.py --categories hacking --max-repos 100

  # Download all your categories
  python3 download_repos.py --categories nvim lua bash zsh python hacking --max-repos 50

  # Download with 1 TB size limit
  python3 download_repos.py --categories all-open --max-repos 1000 --max-size 1024.0

  # Download with specific license
  python3 download_repos.py --categories nvim --license apache-2.0 --max-repos 50
        """
    )

    parser.add_argument(
        '--output',
        type=str,
        default='data/repos',
        help='Output directory (default: data/repos)'
    )

    parser.add_argument(
        '--language',
        type=str,
        choices=POPULAR_LANGUAGES,
        help='Programming language to filter by'
    )

    parser.add_argument(
        '--languages',
        type=str,
        nargs='+',
        choices=POPULAR_LANGUAGES,
        help='Multiple languages to download'
    )

    parser.add_argument(
        '--categories',
        type=str,
        nargs='+',
        choices=list(REPO_CATEGORIES.keys()),
        help='Categories to download: nvim, lua, bash, zsh, python, hacking, security, all-open'
    )

    # No argparse default here: a default of 'mit' would make it impossible
    # to tell "not specified" from "mit", and the 'all-open' category would
    # never run without a license filter. The 'mit' fallback is applied
    # downstream.
    parser.add_argument(
        '--license',
        type=str,
        choices=OPEN_LICENSES,
        default=None,
        help='License type (default: mit; no license filter for the all-open category)'
    )

    parser.add_argument(
        '--min-stars',
        type=int,
        default=100,
        help='Minimum stars (default: 100)'
    )

    parser.add_argument(
        '--max-repos',
        type=int,
        default=50,
        help='Maximum repos per category/language (default: 50)'
    )

    parser.add_argument(
        '--full-clone',
        action='store_true',
        help='Do full clone instead of shallow (slower but includes full history)'
    )

    parser.add_argument(
        '--max-size',
        type=float,
        help='Maximum total size in GB (stops downloading when reached, e.g., 1024.0 for 1 TB)'
    )

    args = parser.parse_args()

    # Nothing to do unless at least one category or language was requested
    if not args.categories and not args.language and not args.languages:
        print("ℹ️ No categories or languages specified. Use --categories or --language")
        print(" Available categories:", ", ".join(REPO_CATEGORIES.keys()))
        print(" Example: --categories nvim lua bash hacking")
        return

    print("🚀 SheepOp Repository Downloader")
    print("=" * 50)

    success = download_repos(
        output_dir=args.output,
        language=args.language,
        license=args.license,
        min_stars=args.min_stars,
        max_repos=args.max_repos,
        shallow=not args.full_clone,
        languages=args.languages,
        categories=args.categories,
        max_size_gb=args.max_size,
    )

    if success:
        print("\n📚 You can now train with:")
        print(f" python3 train.py --data {args.output} --config config.json --device cuda")
    else:
        print("\n❌ No repositories were downloaded.")
        sys.exit(1)


if __name__ == '__main__':
    main()