Initial commit: SheepOp LLM - Transformer-based language model implementation
- Complete transformer implementation from scratch - Training pipeline with gradient accumulation and mixed precision - Optimized inference with KV caching - Multi-format data processing (PDFs, images, code, text) - Comprehensive documentation - Apache 2.0 license - Example training plots included in docs/images/
This commit is contained in:
36
config.json
Normal file
36
config.json
Normal file
@@ -0,0 +1,36 @@
{
  "model": {
    "vocab_size": 50257,
    "d_model": 512,
    "num_layers": 6,
    "num_heads": 8,
    "d_ff": 2048,
    "max_seq_len": 512,
    "dropout": 0.1,
    "activation": "gelu",
    "layer_norm_eps": 1e-5,
    "bias": false,
    "tie_weights": true
  },
  "training": {
    "batch_size": 8,
    "max_epochs": 50,
    "learning_rate": 1e-4,
    "weight_decay": 0.01,
    "warmup_steps": 1000,
    "max_grad_norm": 1.0,
    "gradient_accumulation_steps": 16,
    "use_amp": true,
    "save_dir": "./checkpoints",
    "log_interval": 50,
    "eval_interval": 500
  },
  "data": {
    "data_dir": "./data",
    "max_length": 512,
    "stride": null,
    "num_workers": 12
  },
  "device": "cuda",
  "seed": 42
}
Reference in New Issue
Block a user