Initial commit: SheepOp LLM - Transformer-based language model implementation

- Complete transformer implementation from scratch - Training pipeline with gradient accumulation and mixed precision - Optimized inference with KV caching - Multi-format data processing (PDFs, images, code, text) - Comprehensive documentation - Apache 2.0 license - Example training plots included in docs/images/
2025-11-06 22:07:41 -05:00
commit 3d2da94ce2
60 changed files with 25153 additions and 0 deletions
--- a/models/blocks.py
+++ b/models/blocks.py
@@ -0,0 +1,153 @@
+"""
+Transformer building blocks: Feed-forward networks and transformer blocks
+"""
+import torch
+import torch.nn as nn
+from typing import Optional
+from .attention import MultiHeadAttention
+from .optimized_attention import OptimizedMultiHeadAttention
+
+
+class FeedForward(nn.Module):
+    """
+    Position-wise Feed-Forward Network.
+    Implements two linear transformations with activation in between.
+    """
+    
+    def __init__(
+        self,
+        d_model: int,
+        d_ff: int,
+        dropout: float = 0.1,
+        activation: str = 'gelu',
+        bias: bool = False,
+    ):
+        """
+        Args:
+            d_model: Model dimension
+            d_ff: Feed-forward dimension (typically 4 * d_model)
+            dropout: Dropout probability
+            activation: Activation function ('gelu' or 'relu')
+            bias: Whether to use bias in linear layers
+        """
+        super().__init__()
+        self.linear1 = nn.Linear(d_model, d_ff, bias=bias)
+        self.linear2 = nn.Linear(d_ff, d_model, bias=bias)
+        self.dropout = nn.Dropout(dropout)
+        
+        if activation == 'gelu':
+            self.activation = nn.GELU()
+        elif activation == 'relu':
+            self.activation = nn.ReLU()
+        else:
+            raise ValueError(f"Unsupported activation: {activation}")
+            
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x: Input tensor [batch_size, seq_len, d_model]
+            
+        Returns:
+            Output tensor [batch_size, seq_len, d_model]
+        """
+        x = self.linear1(x)
+        x = self.activation(x)
+        x = self.dropout(x)
+        x = self.linear2(x)
+        return x
+
+
+class TransformerBlock(nn.Module):
+    """
+    Transformer block with self-attention and feed-forward network.
+    Includes residual connections and layer normalization.
+    """
+    
+    def __init__(
+        self,
+        d_model: int,
+        num_heads: int,
+        d_ff: int,
+        dropout: float = 0.1,
+        activation: str = 'gelu',
+        layer_norm_eps: float = 1e-5,
+        bias: bool = False,
+        causal: bool = False,
+        use_optimized_attention: bool = False,
+    ):
+        """
+        Args:
+            d_model: Model dimension
+            num_heads: Number of attention heads
+            d_ff: Feed-forward dimension
+            dropout: Dropout probability
+            activation: Activation function
+            layer_norm_eps: Epsilon for layer normalization
+            bias: Whether to use bias in linear layers
+            causal: Whether to use causal masking
+            use_optimized_attention: Whether to use optimized attention with KV caching
+        """
+        super().__init__()
+        
+        # Self-attention with pre-norm architecture
+        if use_optimized_attention:
+            self.self_attn = OptimizedMultiHeadAttention(
+                d_model=d_model,
+                num_heads=num_heads,
+                dropout=dropout,
+                bias=bias,
+                causal=causal,
+                use_flash_attention=True,
+            )
+        else:
+            self.self_attn = MultiHeadAttention(
+                d_model=d_model,
+                num_heads=num_heads,
+                dropout=dropout,
+                bias=bias,
+                causal=causal,
+            )
+        self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
+        
+        # Feed-forward network
+        self.feed_forward = FeedForward(
+            d_model=d_model,
+            d_ff=d_ff,
+            dropout=dropout,
+            activation=activation,
+            bias=bias,
+        )
+        self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
+        
+        self.dropout = nn.Dropout(dropout)
+        
+    def forward(
+        self,
+        x: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        Forward pass through transformer block.
+        
+        Args:
+            x: Input tensor [batch_size, seq_len, d_model]
+            mask: Optional attention mask
+            
+        Returns:
+            Output tensor [batch_size, seq_len, d_model]
+        """
+        # Pre-norm self-attention with residual connection
+        residual = x
+        x = self.norm1(x)
+        attn_out, _ = self.self_attn(x, x, x, mask=mask)
+        x = residual + self.dropout(attn_out)
+        
+        # Pre-norm feed-forward with residual connection
+        residual = x
+        x = self.norm2(x)
+        ff_out = self.feed_forward(x)
+        x = residual + self.dropout(ff_out)
+        
+        return x
+
+