Initial commit: SheepOp LLM - Transformer-based language model implementation

- Complete transformer implementation from scratch
- Training pipeline with gradient accumulation and mixed precision
- Optimized inference with KV caching
- Multi-format data processing (PDFs, images, code, text)
- Comprehensive documentation
- Apache 2.0 license
- Example training plots included in docs/images/
This commit is contained in:
Carlos Gutierrez
2025-11-06 22:07:41 -05:00
commit 3d2da94ce2
60 changed files with 25153 additions and 0 deletions

36
config.json Normal file
View File

@@ -0,0 +1,36 @@
{
  "model": {
    "vocab_size": 50257,
    "d_model": 512,
    "num_layers": 6,
    "num_heads": 8,
    "d_ff": 2048,
    "max_seq_len": 512,
    "dropout": 0.1,
    "activation": "gelu",
    "layer_norm_eps": 1e-5,
    "bias": false,
    "tie_weights": true
  },
  "training": {
    "batch_size": 8,
    "max_epochs": 50,
    "learning_rate": 1e-4,
    "weight_decay": 0.01,
    "warmup_steps": 1000,
    "max_grad_norm": 1.0,
    "gradient_accumulation_steps": 16,
    "use_amp": true,
    "save_dir": "./checkpoints",
    "log_interval": 50,
    "eval_interval": 500
  },
  "data": {
    "data_dir": "./data",
    "max_length": 512,
    "stride": null,
    "num_workers": 12
  },
  "device": "cuda",
  "seed": 42
}