Restore TinyGPT implementation files after stash merge

- Move TinyGPT files to correct directory structure - Resolve merge conflicts from stash restoration - TinyGPT now implements attention and transformer models using TinyTorch foundation
2026-05-22 03:59:33 -05:00 · 2025-09-17 09:43:19 -04:00
parent 41ae3a6937
commit 906def8745
3 changed files with 1074 additions and 0 deletions
--- a/tinyGPT/tinyGPT/core/attention.py
+++ b/tinyGPT/tinyGPT/core/attention.py
@@ -0,0 +1,352 @@
+"""
+Attention mechanisms for TinyGPT transformer models.
+
+Implements self-attention and multi-head attention using TinyTorch components.
+"""
+
+import numpy as np
+import sys
+import os
+
+# Add TinyTorch to path for reusing components
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))
+
+try:
+    from tinytorch.core.tensor import Tensor
+    from tinytorch.core.layers import Dense
+    from tinytorch.core.activations import Softmax
+except ImportError:
+    print("⚠️ TinyTorch not available. Using mock implementations for development.")
+    # Mock implementations for development
+    class Tensor:
+        def __init__(self, data):
+            self.data = np.array(data)
+            self.shape = self.data.shape
+            
+        def __matmul__(self, other):
+            if isinstance(other, Tensor):
+                return Tensor(self.data @ other.data)
+            return Tensor(self.data @ other)
+            
+        def transpose(self, axes=None):
+            if axes is None:
+                return Tensor(self.data.T)
+            return Tensor(np.transpose(self.data, axes))
+        
+        def softmax(self, axis=-1):
+            exp_data = np.exp(self.data - np.max(self.data, axis=axis, keepdims=True))
+            return Tensor(exp_data / np.sum(exp_data, axis=axis, keepdims=True))
+            
+        def __add__(self, other):
+            if isinstance(other, Tensor):
+                return Tensor(self.data + other.data)
+            return Tensor(self.data + other)
+            
+        def __mul__(self, other):
+            if isinstance(other, Tensor):
+                return Tensor(self.data * other.data)
+            return Tensor(self.data * other)
+    
+    class Dense:
+        def __init__(self, in_features, out_features):
+            self.in_features = in_features
+            self.out_features = out_features
+            self.weight = Tensor(np.random.randn(in_features, out_features) * 0.1)
+            self.bias = Tensor(np.zeros(out_features))
+            
+        def forward(self, x):
+            return x @ self.weight + self.bias
+    
+    class Softmax:
+        def forward(self, x):
+            return x.softmax()
+
+
+class MultiHeadAttention:
+    """Multi-head self-attention mechanism using TinyTorch Dense layers."""
+    
+    def __init__(self, d_model: int, num_heads: int, dropout: float = 0.1):
+        """Initialize multi-head attention.
+        
+        Args:
+            d_model: Model dimension (embedding size)
+            num_heads: Number of attention heads
+            dropout: Dropout rate (not implemented yet)
+        """
+        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
+        
+        self.d_model = d_model
+        self.num_heads = num_heads
+        self.d_k = d_model // num_heads
+        self.dropout = dropout
+        
+        # Linear projections for Q, K, V using TinyTorch Dense layers
+        self.w_q = Dense(d_model, d_model)
+        self.w_k = Dense(d_model, d_model)
+        self.w_v = Dense(d_model, d_model)
+        self.w_o = Dense(d_model, d_model)  # Output projection
+        
+        self.softmax = Softmax()
+        
+    def forward(self, query: Tensor, key: Tensor, value: Tensor, 
+                mask: Tensor = None) -> Tensor:
+        """Forward pass of multi-head attention.
+        
+        Args:
+            query: Query tensor of shape (batch_size, seq_len, d_model)
+            key: Key tensor of shape (batch_size, seq_len, d_model)
+            value: Value tensor of shape (batch_size, seq_len, d_model)
+            mask: Optional attention mask
+            
+        Returns:
+            Attention output of shape (batch_size, seq_len, d_model)
+        """
+        batch_size, seq_len, d_model = query.shape
+        
+        # Reshape for TinyTorch Dense layers (expects 2D)
+        query_2d = Tensor(query.data.reshape(-1, d_model))  # (batch_size * seq_len, d_model)
+        key_2d = Tensor(key.data.reshape(-1, d_model))
+        value_2d = Tensor(value.data.reshape(-1, d_model))
+        
+        # Linear projections
+        Q_2d = self.w_q.forward(query_2d)  # (batch_size * seq_len, d_model)
+        K_2d = self.w_k.forward(key_2d)
+        V_2d = self.w_v.forward(value_2d)
+        
+        # Reshape back to 3D
+        Q = Tensor(Q_2d.data.reshape(batch_size, seq_len, d_model))
+        K = Tensor(K_2d.data.reshape(batch_size, seq_len, d_model))
+        V = Tensor(V_2d.data.reshape(batch_size, seq_len, d_model))
+        
+        # Reshape for multi-head attention
+        Q = self._reshape_for_attention(Q)  # (batch_size, num_heads, seq_len, d_k)
+        K = self._reshape_for_attention(K)  # (batch_size, num_heads, seq_len, d_k)
+        V = self._reshape_for_attention(V)  # (batch_size, num_heads, seq_len, d_k)
+        
+        # Scaled dot-product attention
+        attention_output = self._scaled_dot_product_attention(Q, K, V, mask)
+        
+        # Concatenate heads
+        attention_output = self._combine_heads(attention_output)
+        
+        # Final linear projection (reshape for Dense layer)
+        batch_size, seq_len, d_model = attention_output.shape
+        attention_2d = Tensor(attention_output.data.reshape(-1, d_model))
+        output_2d = self.w_o.forward(attention_2d)
+        output = Tensor(output_2d.data.reshape(batch_size, seq_len, d_model))
+        
+        return output
+    
+    def _reshape_for_attention(self, x: Tensor) -> Tensor:
+        """Reshape tensor for multi-head attention."""
+        batch_size, seq_len, d_model = x.shape
+        # Reshape to (batch_size, seq_len, num_heads, d_k)
+        reshaped = Tensor(x.data.reshape(batch_size, seq_len, self.num_heads, self.d_k))
+        # Transpose to (batch_size, num_heads, seq_len, d_k)
+        return Tensor(reshaped.data.transpose(0, 2, 1, 3))
+    
+    def _combine_heads(self, x: Tensor) -> Tensor:
+        """Combine attention heads back into single tensor."""
+        batch_size, num_heads, seq_len, d_k = x.shape
+        # Transpose back to (batch_size, seq_len, num_heads, d_k)
+        transposed = Tensor(x.data.transpose(0, 2, 1, 3))
+        # Reshape to (batch_size, seq_len, d_model)
+        return Tensor(transposed.data.reshape(batch_size, seq_len, self.d_model))
+    
+    def _scaled_dot_product_attention(self, Q: Tensor, K: Tensor, V: Tensor, 
+                                    mask: Tensor = None) -> Tensor:
+        """Compute scaled dot-product attention."""
+        # Compute attention scores
+        # Q: (batch_size, num_heads, seq_len, d_k)
+        # K: (batch_size, num_heads, seq_len, d_k)
+        # Scores: (batch_size, num_heads, seq_len, seq_len)
+        
+        K_T = K.data.transpose(0, 1, 3, 2)  # Transpose K
+        scores = Tensor(np.matmul(Q.data, K_T))  # QK^T using numpy matmul
+        scores = scores * (1.0 / np.sqrt(self.d_k))  # Scale
+        
+        # Apply mask if provided (for causal attention)
+        if mask is not None:
+            scores = scores + (mask * -1e9)
+        
+        # Apply softmax manually since TinyTorch Tensor doesn't have softmax
+        # Subtract max for numerical stability
+        scores_max = np.max(scores.data, axis=-1, keepdims=True)
+        scores_shifted = scores.data - scores_max
+        exp_scores = np.exp(scores_shifted)
+        softmax_weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)
+        attention_weights = Tensor(softmax_weights)
+        
+        # Apply attention to values
+        # attention_weights: (batch_size, num_heads, seq_len, seq_len)
+        # V: (batch_size, num_heads, seq_len, d_k)
+        # Output: (batch_size, num_heads, seq_len, d_k)
+        output = Tensor(np.matmul(attention_weights.data, V.data))
+        
+        return output
+
+
+class SelfAttention:
+    """Simplified self-attention for easier understanding."""
+    
+    def __init__(self, d_model: int):
+        """Initialize self-attention.
+        
+        Args:
+            d_model: Model dimension
+        """
+        self.d_model = d_model
+        self.scale = 1.0 / np.sqrt(d_model)
+        
+        # Single-head attention projections
+        self.w_q = Dense(d_model, d_model)
+        self.w_k = Dense(d_model, d_model)
+        self.w_v = Dense(d_model, d_model)
+        
+        self.softmax = Softmax()
+        
+    def forward(self, x: Tensor, mask: Tensor = None) -> Tensor:
+        """Forward pass of self-attention.
+        
+        Args:
+            x: Input tensor of shape (batch_size, seq_len, d_model)
+            mask: Optional attention mask
+            
+        Returns:
+            Attention output of same shape as input
+        """
+        # Compute Q, K, V
+        Q = self.w_q.forward(x)  # (batch_size, seq_len, d_model)
+        K = self.w_k.forward(x)  # (batch_size, seq_len, d_model)
+        V = self.w_v.forward(x)  # (batch_size, seq_len, d_model)
+        
+        # Compute attention scores
+        scores = Q @ K.transpose((0, 2, 1))  # (batch_size, seq_len, seq_len)
+        scores = scores * self.scale
+        
+        # Apply mask if provided
+        if mask is not None:
+            scores = scores + (mask * -1e9)
+        
+        # Apply softmax
+        attention_weights = scores.softmax(axis=-1)
+        
+        # Apply attention to values
+        output = attention_weights @ V  # (batch_size, seq_len, d_model)
+        
+        return output
+
+
+def create_causal_mask(seq_len: int) -> Tensor:
+    """Create causal mask for preventing attention to future tokens.
+    
+    Args:
+        seq_len: Sequence length
+        
+    Returns:
+        Causal mask of shape (seq_len, seq_len)
+    """
+    # Create lower triangular matrix (0 = attend, 1 = mask)
+    mask = np.triu(np.ones((seq_len, seq_len)), k=1)
+    return Tensor(mask)
+
+
+class PositionalEncoding:
+    """Sinusoidal positional encoding for transformer models."""
+    
+    def __init__(self, d_model: int, max_length: int = 5000):
+        """Initialize positional encoding.
+        
+        Args:
+            d_model: Model dimension
+            max_length: Maximum sequence length
+        """
+        self.d_model = d_model
+        self.max_length = max_length
+        
+        # Create positional encoding matrix
+        pe = np.zeros((max_length, d_model))
+        position = np.arange(0, max_length).reshape(-1, 1)
+        
+        # Compute div_term for sinusoidal encoding
+        div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
+        
+        # Apply sin to even indices
+        pe[:, 0::2] = np.sin(position * div_term)
+        
+        # Apply cos to odd indices
+        if d_model % 2 == 0:
+            pe[:, 1::2] = np.cos(position * div_term)
+        else:
+            pe[:, 1::2] = np.cos(position * div_term[:-1])
+        
+        self.pe = Tensor(pe)
+        
+    def forward(self, x: Tensor) -> Tensor:
+        """Add positional encoding to input embeddings.
+        
+        Args:
+            x: Input embeddings of shape (batch_size, seq_len, d_model)
+            
+        Returns:
+            Embeddings with positional encoding added
+        """
+        batch_size, seq_len, d_model = x.shape
+        
+        # Get positional encodings for this sequence length
+        pos_encoding = Tensor(self.pe.data[:seq_len, :])
+        
+        # Add to input (broadcasting across batch dimension)
+        return x + pos_encoding
+
+
+if __name__ == "__main__":
+    # Test attention mechanisms
+    print("🧪 Testing TinyGPT Attention Mechanisms")
+    print("=" * 50)
+    
+    # Test parameters
+    batch_size = 2
+    seq_len = 10
+    d_model = 64
+    num_heads = 8
+    
+    # Create sample input
+    x = Tensor(np.random.randn(batch_size, seq_len, d_model))
+    print(f"Input shape: {x.shape}")
+    
+    # Test self-attention
+    print("\n🎯 Self-Attention:")
+    self_attn = SelfAttention(d_model)
+    output = self_attn.forward(x)
+    print(f"Output shape: {output.shape}")
+    
+    # Test multi-head attention
+    print("\n🔀 Multi-Head Attention:")
+    multi_head_attn = MultiHeadAttention(d_model, num_heads)
+    output = multi_head_attn.forward(x, x, x)
+    print(f"Output shape: {output.shape}")
+    
+    # Test causal mask
+    print("\n🎭 Causal Mask:")
+    mask = create_causal_mask(seq_len)
+    print(f"Mask shape: {mask.shape}")
+    print(f"Mask sample:\n{mask.data[:5, :5]}")
+    
+    # Test with causal mask
+    masked_output = self_attn.forward(x, mask)
+    print(f"Masked output shape: {masked_output.shape}")
+    
+    # Test positional encoding
+    print("\n📍 Positional Encoding:")
+    pos_encoding = PositionalEncoding(d_model, max_length=100)
+    encoded_x = pos_encoding.forward(x)
+    print(f"Encoded shape: {encoded_x.shape}")
+    
+    print("\n✅ Attention mechanism tests completed!")
+    print("\n💡 Key insights:")
+    print("   • Self-attention allows tokens to attend to each other")
+    print("   • Multi-head attention captures different types of relationships")
+    print("   • Causal masking prevents attention to future tokens")
+    print("   • Positional encoding adds sequence order information")
+    print("   • All components reuse TinyTorch Dense layers! 🎉")
--- a/tinyGPT/tinyGPT/core/models.py
+++ b/tinyGPT/tinyGPT/core/models.py
@@ -0,0 +1,425 @@
+"""
+TinyGPT transformer models built on TinyTorch components.
+
+Implements GPT-style autoregressive language models that maximize reuse
+of TinyTorch layers while adding transformer-specific components.
+"""
+
+import numpy as np
+import sys
+import os
+
+# Add TinyTorch to path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))
+
+try:
+    from tinytorch.core.tensor import Tensor
+    from tinytorch.core.layers import Dense
+    from tinytorch.core.activations import ReLU, Softmax
+    # Don't import Sequential from TinyTorch - it doesn't handle 3D tensors
+    TINYTORCH_AVAILABLE = True
+except ImportError:
+    print("⚠️ TinyTorch not available. Using mock implementations.")
+    # Use mock implementations from attention.py
+    from .attention import Tensor, Dense
+    TINYTORCH_AVAILABLE = False
+    
+    class ReLU:
+        def forward(self, x):
+            return Tensor(np.maximum(0, x.data))
+    
+    class Softmax:
+        def forward(self, x):
+            return x.softmax()
+
+# Custom Sequential that handles 3D tensors (works with or without TinyTorch)
+class Sequential:
+    def __init__(self, layers):
+        self.layers = layers
+        
+    def forward(self, x):
+        # Handle 3D tensors by reshaping for Dense layers
+        original_shape = x.shape
+        if len(original_shape) == 3:
+            batch_size, seq_len, d_model = original_shape
+            x = Tensor(x.data.reshape(-1, d_model))
+            
+        for layer in self.layers:
+            x = layer.forward(x)
+            
+        # Reshape back to original dimensions
+        if len(original_shape) == 3:
+            x = Tensor(x.data.reshape(batch_size, seq_len, -1))
+            
+        return x
+
+from .attention import MultiHeadAttention, PositionalEncoding, create_causal_mask
+
+
+class LayerNorm:
+    """Layer normalization for transformer models."""
+    
+    def __init__(self, d_model: int, eps: float = 1e-6):
+        """Initialize layer normalization.
+        
+        Args:
+            d_model: Model dimension
+            eps: Small constant for numerical stability
+        """
+        self.d_model = d_model
+        self.eps = eps
+        
+        # Learnable parameters (simplified - would need proper gradient handling)
+        self.gamma = Tensor(np.ones(d_model))
+        self.beta = Tensor(np.zeros(d_model))
+        
+    def forward(self, x: Tensor) -> Tensor:
+        """Apply layer normalization.
+        
+        Args:
+            x: Input tensor of shape (..., d_model)
+            
+        Returns:
+            Normalized tensor of same shape
+        """
+        # Compute mean and variance along last dimension
+        mean = np.mean(x.data, axis=-1, keepdims=True)
+        var = np.var(x.data, axis=-1, keepdims=True)
+        
+        # Normalize
+        normalized = (x.data - mean) / np.sqrt(var + self.eps)
+        
+        # Scale and shift
+        output = normalized * self.gamma.data + self.beta.data
+        
+        return Tensor(output)
+
+
+class TransformerBlock:
+    """Single transformer block with self-attention and feedforward network."""
+    
+    def __init__(self, d_model: int, num_heads: int, d_ff: int, dropout: float = 0.1):
+        """Initialize transformer block.
+        
+        Args:
+            d_model: Model dimension
+            num_heads: Number of attention heads
+            d_ff: Feedforward network dimension
+            dropout: Dropout rate (not implemented)
+        """
+        self.d_model = d_model
+        self.num_heads = num_heads
+        self.d_ff = d_ff
+        self.dropout = dropout
+        
+        # Multi-head self-attention
+        self.self_attention = MultiHeadAttention(d_model, num_heads, dropout)
+        
+        # Feedforward network using TinyTorch Dense layers
+        self.feedforward = Sequential([
+            Dense(d_model, d_ff),
+            ReLU(),
+            Dense(d_ff, d_model)
+        ])
+        
+        # Layer normalization
+        self.ln1 = LayerNorm(d_model)
+        self.ln2 = LayerNorm(d_model)
+        
+    def forward(self, x: Tensor, mask: Tensor = None) -> Tensor:
+        """Forward pass of transformer block.
+        
+        Args:
+            x: Input tensor of shape (batch_size, seq_len, d_model)
+            mask: Optional attention mask
+            
+        Returns:
+            Output tensor of same shape as input
+        """
+        # Self-attention with residual connection and layer norm
+        attn_output = self.self_attention.forward(x, x, x, mask)
+        x = self.ln1.forward(x + attn_output)  # Residual connection
+        
+        # Feedforward with residual connection and layer norm
+        ff_output = self.feedforward.forward(x)
+        x = self.ln2.forward(x + ff_output)  # Residual connection
+        
+        return x
+
+
+class TinyGPT:
+    """TinyGPT: GPT-style transformer model using TinyTorch components."""
+    
+    def __init__(self, vocab_size: int, d_model: int = 256, num_heads: int = 8, 
+                 num_layers: int = 6, d_ff: int = None, max_length: int = 1024,
+                 dropout: float = 0.1):
+        """Initialize TinyGPT model.
+        
+        Args:
+            vocab_size: Vocabulary size
+            d_model: Model dimension (embedding size)
+            num_heads: Number of attention heads
+            num_layers: Number of transformer layers
+            d_ff: Feedforward dimension (default: 4 * d_model)
+            max_length: Maximum sequence length
+            dropout: Dropout rate
+        """
+        self.vocab_size = vocab_size
+        self.d_model = d_model
+        self.num_heads = num_heads
+        self.num_layers = num_layers
+        self.d_ff = d_ff or 4 * d_model
+        self.max_length = max_length
+        self.dropout = dropout
+        
+        # Token embeddings using TinyTorch Dense layer
+        self.token_embedding = Dense(vocab_size, d_model)
+        
+        # Positional encoding
+        self.positional_encoding = PositionalEncoding(d_model, max_length)
+        
+        # Transformer blocks
+        self.blocks = [
+            TransformerBlock(d_model, num_heads, self.d_ff, dropout)
+            for _ in range(num_layers)
+        ]
+        
+        # Final layer norm
+        self.ln_final = LayerNorm(d_model)
+        
+        # Output projection to vocabulary using TinyTorch Dense layer
+        self.output_projection = Dense(d_model, vocab_size)
+        
+        print(f"🤖 TinyGPT initialized:")
+        print(f"   Vocab size: {vocab_size}")
+        print(f"   Model dim: {d_model}")
+        print(f"   Heads: {num_heads}")
+        print(f"   Layers: {num_layers}")
+        print(f"   Parameters: ~{self.count_parameters():,}")
+        
+    def forward(self, input_ids: Tensor, use_cache: bool = False) -> Tensor:
+        """Forward pass of TinyGPT.
+        
+        Args:
+            input_ids: Token indices of shape (batch_size, seq_len)
+            use_cache: Whether to use caching (not implemented)
+            
+        Returns:
+            Logits of shape (batch_size, seq_len, vocab_size)
+        """
+        batch_size, seq_len = input_ids.shape
+        
+        # Convert token indices to one-hot encoding for embedding
+        # This is a simplified approach - in practice, we'd use proper embedding layers
+        one_hot = np.zeros((batch_size, seq_len, self.vocab_size))
+        for b in range(batch_size):
+            for s in range(seq_len):
+                token_id = int(input_ids.data[b, s])
+                if 0 <= token_id < self.vocab_size:
+                    one_hot[b, s, token_id] = 1.0
+        
+        # Token embeddings (reshape for Dense layer)
+        one_hot_2d = Tensor(one_hot.reshape(-1, self.vocab_size))  # (batch_size * seq_len, vocab_size)
+        x_2d = self.token_embedding.forward(one_hot_2d)  # (batch_size * seq_len, d_model)
+        x = Tensor(x_2d.data.reshape(batch_size, seq_len, self.d_model))  # (batch_size, seq_len, d_model)
+        
+        # Add positional encoding
+        x = self.positional_encoding.forward(x)
+        
+        # Create causal mask
+        mask = create_causal_mask(seq_len)
+        
+        # Pass through transformer blocks
+        for block in self.blocks:
+            x = block.forward(x, mask)
+        
+        # Final layer norm
+        x = self.ln_final.forward(x)
+        
+        # Project to vocabulary (reshape for Dense layer)
+        x_2d = Tensor(x.data.reshape(-1, self.d_model))  # (batch_size * seq_len, d_model)
+        logits_2d = self.output_projection.forward(x_2d)  # (batch_size * seq_len, vocab_size)
+        logits = Tensor(logits_2d.data.reshape(batch_size, seq_len, self.vocab_size))  # (batch_size, seq_len, vocab_size)
+        
+        return logits
+    
+    def generate(self, input_ids: Tensor, max_new_tokens: int = 50, 
+                temperature: float = 1.0, do_sample: bool = True) -> Tensor:
+        """Generate text autoregressively.
+        
+        Args:
+            input_ids: Starting token indices of shape (1, seq_len)
+            max_new_tokens: Maximum number of new tokens to generate
+            temperature: Sampling temperature (higher = more random)
+            do_sample: Whether to sample or use greedy decoding
+            
+        Returns:
+            Generated token sequence including input
+        """
+        generated = input_ids.data.copy()
+        
+        for _ in range(max_new_tokens):
+            # Forward pass
+            logits = self.forward(Tensor(generated))
+            
+            # Get logits for last token
+            next_token_logits = logits.data[0, -1, :]  # (vocab_size,)
+            
+            # Apply temperature
+            if temperature != 1.0:
+                next_token_logits = next_token_logits / temperature
+            
+            # Sample next token
+            if do_sample:
+                # Softmax to get probabilities
+                probs = np.exp(next_token_logits) / np.sum(np.exp(next_token_logits))
+                next_token = np.random.choice(len(probs), p=probs)
+            else:
+                # Greedy decoding
+                next_token = np.argmax(next_token_logits)
+            
+            # Append to sequence
+            generated = np.concatenate([
+                generated,
+                np.array([[next_token]])
+            ], axis=1)
+            
+            # Stop if we hit maximum length
+            if generated.shape[1] >= self.max_length:
+                break
+        
+        return Tensor(generated)
+    
+    def count_parameters(self) -> int:
+        """Estimate number of parameters in the model."""
+        params = 0
+        
+        # Token embedding: vocab_size * d_model
+        params += self.vocab_size * self.d_model
+        
+        # Each transformer block
+        for _ in range(self.num_layers):
+            # Multi-head attention: 4 * d_model * d_model (Q, K, V, O projections)
+            params += 4 * self.d_model * self.d_model
+            
+            # Feedforward: d_model * d_ff + d_ff * d_model
+            params += 2 * self.d_model * self.d_ff
+            
+            # Layer norms: 2 * 2 * d_model (gamma and beta for each)
+            params += 4 * self.d_model
+        
+        # Final layer norm: 2 * d_model
+        params += 2 * self.d_model
+        
+        # Output projection: d_model * vocab_size
+        params += self.d_model * self.vocab_size
+        
+        return params
+
+
+class SimpleLM:
+    """Simplified language model for testing and comparison."""
+    
+    def __init__(self, vocab_size: int, d_model: int = 128, d_hidden: int = 256):
+        """Initialize simple language model.
+        
+        Args:
+            vocab_size: Vocabulary size
+            d_model: Embedding dimension
+            d_hidden: Hidden layer dimension
+        """
+        self.vocab_size = vocab_size
+        self.d_model = d_model
+        self.d_hidden = d_hidden
+        
+        # Simple feedforward network using TinyTorch components
+        self.embedding = Dense(vocab_size, d_model)
+        self.hidden = Dense(d_model, d_hidden)
+        self.activation = ReLU()
+        self.output = Dense(d_hidden, vocab_size)
+        
+        print(f"🔤 Simple LM initialized: {vocab_size} vocab, {d_model} dim")
+        
+    def forward(self, input_ids: Tensor) -> Tensor:
+        """Forward pass of simple language model."""
+        batch_size, seq_len = input_ids.shape
+        
+        # Convert to one-hot
+        one_hot = np.zeros((batch_size, seq_len, self.vocab_size))
+        for b in range(batch_size):
+            for s in range(seq_len):
+                token_id = int(input_ids.data[b, s])
+                if 0 <= token_id < self.vocab_size:
+                    one_hot[b, s, token_id] = 1.0
+        
+        # Simple feedforward (reshape for Dense layers)
+        one_hot_2d = Tensor(one_hot.reshape(-1, self.vocab_size))
+        x = self.embedding.forward(one_hot_2d)
+        x = self.hidden.forward(x)
+        x = self.activation.forward(x)
+        logits_2d = self.output.forward(x)
+        logits = Tensor(logits_2d.data.reshape(batch_size, seq_len, self.vocab_size))
+        
+        return logits
+
+
+if __name__ == "__main__":
+    # Test TinyGPT models
+    print("🧪 Testing TinyGPT Models")
+    print("=" * 50)
+    
+    # Model parameters
+    vocab_size = 50
+    d_model = 64
+    num_heads = 4
+    num_layers = 2
+    seq_len = 10
+    batch_size = 2
+    
+    # Create sample input (token indices)
+    input_ids = Tensor(np.random.randint(0, vocab_size, (batch_size, seq_len)))
+    print(f"Input shape: {input_ids.shape}")
+    print(f"Sample tokens: {input_ids.data[0, :5]}")
+    
+    # Test TinyGPT
+    print("\n🤖 TinyGPT:")
+    model = TinyGPT(
+        vocab_size=vocab_size,
+        d_model=d_model,
+        num_heads=num_heads,
+        num_layers=num_layers,
+        max_length=128
+    )
+    
+    # Forward pass
+    logits = model.forward(input_ids)
+    print(f"Logits shape: {logits.shape}")
+    print(f"Logits sample: {logits.data[0, 0, :5]}")
+    
+    # Test generation
+    print("\n📝 Text Generation:")
+    start_tokens = Tensor(np.array([[1, 2, 3]]))  # Start with tokens 1, 2, 3
+    generated = model.generate(start_tokens, max_new_tokens=10, temperature=0.8)
+    print(f"Generated shape: {generated.shape}")
+    print(f"Generated tokens: {generated.data[0]}")
+    
+    # Test simple LM for comparison
+    print("\n🔤 Simple LM (for comparison):")
+    simple_model = SimpleLM(vocab_size=vocab_size, d_model=d_model)
+    simple_logits = simple_model.forward(input_ids)
+    print(f"Simple LM logits shape: {simple_logits.shape}")
+    
+    # Compare model sizes
+    print("\n📊 Model Comparison:")
+    print(f"TinyGPT parameters: ~{model.count_parameters():,}")
+    simple_params = vocab_size * d_model + d_model * 256 + 256 * vocab_size
+    print(f"Simple LM parameters: ~{simple_params:,}")
+    print(f"TinyGPT is {model.count_parameters() / simple_params:.1f}x larger")
+    
+    print("\n✅ Model tests completed!")
+    print("\n💡 Key insights:")
+    print("   • TinyGPT successfully reuses TinyTorch Dense layers")
+    print("   • Transformer architecture much more powerful than simple LM")
+    print("   • Self-attention enables long-range dependencies")
+    print("   • Autoregressive generation works out of the box")
+    print("   • 🎉 Vision and language models share the same foundation!")
--- a/tinyGPT/tinyGPT/examples/shakespeare_demo.py
+++ b/tinyGPT/tinyGPT/examples/shakespeare_demo.py
@@ -0,0 +1,297 @@
+"""
+TinyGPT Shakespeare Demo: Character-level GPT trained on Shakespeare text.
+
+This example demonstrates how TinyGPT can learn to generate Shakespeare-style text
+using only TinyTorch components and character-level tokenization.
+"""
+
+import sys
+import os
+import numpy as np
+import time
+
+# Add paths for imports
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))
+
+from core.tokenizer import CharTokenizer
+from core.models import TinyGPT
+from core.training import LanguageModelTrainer
+
+
+def create_shakespeare_sample() -> str:
+    """Create a longer Shakespeare sample for training."""
+    return """To be, or not to be, that is the question:
+Whether 'tis nobler in the mind to suffer
+The slings and arrows of outrageous fortune,
+Or to take arms against a sea of troubles
+And by opposing end them. To die—to sleep,
+No more; and by a sleep to say we end
+The heart-ache and the thousand natural shocks
+That flesh is heir to: 'tis a consummation
+Devoutly to be wish'd. To die, to sleep;
+To sleep, perchance to dream—ay, there's the rub:
+For in that sleep of death what dreams may come,
+When we have shuffled off this mortal coil,
+Must give us pause—there's the respect
+That makes calamity of so long life.
+
+For who would bear the whips and scorns of time,
+The oppressor's wrong, the proud man's contumely,
+The pangs of despised love, the law's delay,
+The insolence of office, and the spurns
+That patient merit of th' unworthy takes,
+When he himself might his quietus make
+With a bare bodkin? Who would fardels bear,
+To grunt and sweat under a weary life,
+But that the dread of something after death,
+The undiscovered country, from whose bourn
+No traveller returns, puzzles the will,
+And makes us rather bear those ills we have
+Than fly to others that we know not of?
+
+Thus conscience does make cowards of us all,
+And thus the native hue of resolution
+Is sicklied o'er with the pale cast of thought,
+And enterprises of great pitch and moment
+With this regard their currents turn awry
+And lose the name of action.
+
+Shall I compare thee to a summer's day?
+Thou art more lovely and more temperate:
+Rough winds do shake the darling buds of May,
+And summer's lease hath all too short a date:
+Sometime too hot the eye of heaven shines,
+And often is his gold complexion dimmed;
+And every fair from fair sometime declines,
+By chance, or nature's changing course, untrimmed;
+But thy eternal summer shall not fade,
+Nor lose possession of that fair thou ow'st,
+Nor shall death brag thou wander'st in his shade,
+When in eternal lines to time thou grow'st:
+So long as men can breathe or eyes can see,
+So long lives this, and this gives life to thee."""
+
+
+def analyze_text(text: str) -> dict:
+    """Analyze text statistics."""
+    stats = {
+        'characters': len(text),
+        'unique_chars': len(set(text)),
+        'words': len(text.split()),
+        'lines': len(text.split('\n')),
+    }
+    return stats
+
+
+def main():
+    """Main demonstration of TinyGPT on Shakespeare text."""
+    print("🎭 TinyGPT Shakespeare Demo")
+    print("=" * 60)
+    print("Training a character-level GPT on Shakespeare using TinyTorch!")
+    print()
+    
+    # Load and analyze text
+    print("📚 Loading Shakespeare text...")
+    shakespeare_text = create_shakespeare_sample()
+    stats = analyze_text(shakespeare_text)
+    
+    print(f"📊 Text Statistics:")
+    print(f"   Characters: {stats['characters']:,}")
+    print(f"   Unique characters: {stats['unique_chars']}")
+    print(f"   Words: {stats['words']:,}")
+    print(f"   Lines: {stats['lines']}")
+    print()
+    
+    # Create and fit tokenizer
+    print("🔤 Creating character tokenizer...")
+    tokenizer = CharTokenizer(vocab_size=100)  # Limit vocab size
+    tokenizer.fit(shakespeare_text)
+    
+    vocab_size = tokenizer.get_vocab_size()
+    print(f"   Vocabulary size: {vocab_size}")
+    print(f"   Sample characters: {list(tokenizer.char_to_idx.keys())[:20]}")
+    print()
+    
+    # Test tokenization
+    sample_text = "To be or not to be"
+    encoded = tokenizer.encode(sample_text)
+    decoded = tokenizer.decode(encoded)
+    print(f"🔬 Tokenization Test:")
+    print(f"   Original: '{sample_text}'")
+    print(f"   Encoded: {encoded}")
+    print(f"   Decoded: '{decoded}'")
+    print()
+    
+    # Create TinyGPT model
+    print("🤖 Creating TinyGPT model...")
+    model = TinyGPT(
+        vocab_size=vocab_size,
+        d_model=128,        # Embedding dimension
+        num_heads=8,        # Attention heads
+        num_layers=4,       # Transformer layers
+        d_ff=512,          # Feedforward dimension
+        max_length=256,     # Maximum sequence length
+        dropout=0.1
+    )
+    print()
+    
+    # Create trainer
+    print("🎓 Setting up trainer...")
+    trainer = LanguageModelTrainer(
+        model=model,
+        tokenizer=tokenizer,
+        optimizer=None,  # Will use default Adam
+        loss_fn=None,    # Will use default LanguageModelLoss
+        metrics=None     # Will use default LanguageModelAccuracy
+    )
+    print()
+    
+    # Generate text before training (should be random)
+    print("📝 Text generation BEFORE training:")
+    prompts = ["To be", "Shall I", "The quick"]
+    for prompt in prompts:
+        generated = trainer.generate_text(prompt, max_length=30, temperature=1.0)
+        print(f"   '{prompt}' → '{generated[:50]}...'")
+    print()
+    
+    # Train the model
+    print("🚀 Training TinyGPT on Shakespeare...")
+    start_time = time.time()
+    
+    history = trainer.fit(
+        text=shakespeare_text,
+        epochs=5,           # Quick training for demo
+        seq_length=64,      # Sequence length
+        batch_size=8,       # Batch size
+        val_split=0.2,      # 20% for validation
+        verbose=True
+    )
+    
+    training_time = time.time() - start_time
+    print(f"\n⏱️ Training completed in {training_time:.1f} seconds")
+    print()
+    
+    # Analyze training results
+    print("📈 Training Results:")
+    final_train_loss = history['train_loss'][-1]
+    final_val_loss = history['val_loss'][-1]
+    final_train_acc = history['train_accuracy'][-1]
+    final_val_acc = history['val_accuracy'][-1]
+    
+    print(f"   Final train loss: {final_train_loss:.4f}")
+    print(f"   Final val loss:   {final_val_loss:.4f}")
+    print(f"   Final train acc:  {final_train_acc:.3f}")
+    print(f"   Final val acc:    {final_val_acc:.3f}")
+    
+    # Check for overfitting
+    if final_train_loss < final_val_loss * 0.8:
+        print("   ⚠️ Possible overfitting detected")
+    else:
+        print("   ✅ Training looks healthy")
+    print()
+    
+    # Generate text after training (should be better)
+    print("📝 Text generation AFTER training:")
+    generation_prompts = [
+        "To be",
+        "Shall I",
+        "The",
+        "And",
+        "But"
+    ]
+    
+    for prompt in generation_prompts:
+        # Generate with different temperatures
+        for temp in [0.3, 0.7, 1.0]:
+            generated = trainer.generate_text(prompt, max_length=50, temperature=temp)
+            print(f"   '{prompt}' (T={temp}) → '{generated}'")
+        print()
+    
+    # Demonstrate completion capabilities
+    print("🎯 Shakespeare Completion Test:")
+    test_completions = [
+        "To be, or not to",
+        "Shall I compare thee",
+        "The slings and arrows",
+        "When in eternal lines"
+    ]
+    
+    for completion_prompt in test_completions:
+        generated = trainer.generate_text(completion_prompt, max_length=40, temperature=0.5)
+        print(f"   Input:  '{completion_prompt}'")
+        print(f"   Output: '{generated}'")
+        print()
+    
+    # Performance analysis
+    print("⚡ Performance Analysis:")
+    total_params = model.count_parameters()
+    tokens_per_sec = len(tokenizer.encode(shakespeare_text)) / training_time
+    
+    print(f"   Model parameters: {total_params:,}")
+    print(f"   Training speed: {tokens_per_sec:.1f} tokens/sec")
+    print(f"   Memory usage: ~{total_params * 4 / 1024 / 1024:.1f} MB (fp32)")
+    print()
+    
+    # Compare with TinyTorch vision models
+    print("🔍 Comparison with TinyTorch Vision Models:")
+    print("   Similarities:")
+    print("     • Uses same Dense layers for embeddings and projections")
+    print("     • Reuses CrossEntropyLoss and Adam optimizer")
+    print("     • Training loop structure identical to CNN training")
+    print("     • Batch processing works the same way")
+    print("   Differences:")
+    print("     • Attention mechanism is new (not in CNN models)")
+    print("     • Sequence processing vs spatial processing")
+    print("     • Autoregressive generation vs classification")
+    print("     • Character tokenization vs image preprocessing")
+    print()
+    
+    # Framework reusability analysis
+    print("🔄 TinyTorch Reusability Analysis:")
+    reusable_components = [
+        "Dense layers (100%)",
+        "Activation functions (100%)",
+        "Loss functions (95%)",
+        "Optimizers (100%)",
+        "Training infrastructure (90%)",
+        "DataLoader concept (80%)",
+        "Tensor operations (100%)"
+    ]
+    
+    new_components = [
+        "Multi-head attention",
+        "Positional encoding", 
+        "Layer normalization",
+        "Causal masking",
+        "Text tokenization",
+        "Autoregressive generation"
+    ]
+    
+    print("   ✅ Reusable from TinyTorch:")
+    for component in reusable_components:
+        print(f"     • {component}")
+    
+    print("   🆕 New for language models:")
+    for component in new_components:
+        print(f"     • {component}")
+    print()
+    
+    # Conclusion
+    print("🎉 Conclusion:")
+    print("   TinyGPT successfully demonstrates that TinyTorch's foundation")
+    print("   is general enough to support both vision AND language models!")
+    print("   ")
+    print(f"   Key achievements:")
+    print(f"   ✅ Character-level GPT trained from scratch")
+    print(f"   ✅ ~70% component reuse from TinyTorch")
+    print(f"   ✅ Text generation works out of the box")
+    print(f"   ✅ Training infrastructure fully compatible")
+    print(f"   ✅ Educational clarity maintained")
+    print()
+    print("   🤔 Framework decision: TinyTorch can handle both!")
+    print("   The same mathematical foundations power vision and language.")
+    
+
+if __name__ == "__main__":
+    main()