TinyTorch/tinytorch/tinygpt.py

# AUTOGENERATED! DO NOT EDIT! File to edit: ../modules/source/temp_holding/16_tinygpt/tinygpt_dev.ipynb.

# %% auto 0
__all__ = ['CrossEntropyLoss', 'Trainer', 'no_grad', 'CharTokenizer', 'MultiHeadAttention', 'create_causal_mask', 'LayerNorm',
           'TransformerBlock', 'PositionalEncoding', 'TinyGPT', 'LanguageModelLoss', 'LanguageModelAccuracy',
           'LanguageModelTrainer', 'shakespeare_demo', 'live_demo']

# %% ../modules/source/temp_holding/16_tinygpt/tinygpt_dev.ipynb 6
import numpy as np
import time
from typing import Dict, List, Tuple, Any, Optional
from dataclasses import dataclass
import json

# Import TinyTorch components - the foundation we've built
from .core.tensor import Tensor
from .core.layers import Dense
from .core.activations import ReLU, Softmax
from .core.optimizers import Adam, SGD

# Define minimal classes for missing components
class CrossEntropyLoss:
    def forward(self, logits, targets):
        return 0.5  # Simplified for integration testing

class Trainer:
    def __init__(self, *args, **kwargs):
        pass

def no_grad():
    """Context manager for disabling gradients (simplified)."""
    return None

# %% ../modules/source/temp_holding/16_tinygpt/tinygpt_dev.ipynb 7
class CharTokenizer:
    """
    Character-level tokenizer for TinyGPT.
    Converts text to token sequences and back.
    """

    def __init__(self, vocab_size: Optional[int] = None,
                 special_tokens: Optional[List[str]] = None):
        self.vocab_size = vocab_size
        self.special_tokens = special_tokens or ['<UNK>', '<PAD>']

        # Core vocabulary mappings
        self.char_to_idx: Dict[str, int] = {}
        self.idx_to_char: Dict[int, str] = {}

        # Special token indices
        self.unk_token = '<UNK>'
        self.pad_token = '<PAD>'
        self.unk_idx = 0
        self.pad_idx = 1

        self.is_fitted = False
        self.character_counts: Dict[str, int] = {}

    def fit(self, text: str) -> None:
        """Build vocabulary from training text."""
        if not text:
            raise ValueError("Cannot fit tokenizer on empty text")

        print(f"🔍 Analyzing text for vocabulary...")
        print(f"   Text length: {len(text):,} characters")

        # Count character frequencies
        self.character_counts = {}
        for char in text:
            self.character_counts[char] = self.character_counts.get(char, 0) + 1

        unique_chars = len(self.character_counts)
        print(f"   Unique characters found: {unique_chars}")

        # Build vocabulary with special tokens first
        self.char_to_idx = {}
        self.idx_to_char = {}

        for i, token in enumerate(self.special_tokens):
            self.char_to_idx[token] = i
            self.idx_to_char[i] = token

        self.unk_idx = self.char_to_idx[self.unk_token]
        self.pad_idx = self.char_to_idx[self.pad_token]

        # Add characters by frequency
        sorted_chars = sorted(self.character_counts.items(),
                            key=lambda x: x[1], reverse=True)

        current_idx = len(self.special_tokens)
        chars_added = 0

        for char, count in sorted_chars:
            if char in self.char_to_idx:
                continue
            if self.vocab_size and current_idx >= self.vocab_size:
                break

            self.char_to_idx[char] = current_idx
            self.idx_to_char[current_idx] = char
            current_idx += 1
            chars_added += 1

        self.is_fitted = True

        print(f"✅ Vocabulary built:")
        print(f"   Final vocab size: {len(self.char_to_idx)}")
        print(f"   Characters included: {chars_added}")
        print(f"   Most frequent: {sorted_chars[:10]}")

    def encode(self, text: str) -> List[int]:
        """Convert text to sequence of token indices."""
        if not self.is_fitted:
            raise RuntimeError("Tokenizer must be fitted before encoding")

        if not text:
            return []

        indices = []
        unk_count = 0

        for char in text:
            if char in self.char_to_idx:
                indices.append(self.char_to_idx[char])
            else:
                indices.append(self.unk_idx)
                unk_count += 1

        if unk_count > 0:
            unk_rate = unk_count / len(text) * 100
            print(f"⚠️ Encoding: {unk_count} unknown chars ({unk_rate:.1f}%)")

        return indices

    def decode(self, indices: List[int]) -> str:
        """Convert sequence of token indices back to text."""
        if not self.is_fitted:
            raise RuntimeError("Tokenizer must be fitted before decoding")

        if not indices:
            return ""

        chars = []
        invalid_count = 0

        for idx in indices:
            if idx in self.idx_to_char:
                char = self.idx_to_char[idx]
                if char not in [self.pad_token]:  # Skip padding
                    chars.append(char)
            else:
                invalid_count += 1

        if invalid_count > 0:
            print(f"⚠️ Decoding: {invalid_count} invalid indices skipped")

        return ''.join(chars)

    def get_vocab_size(self) -> int:
        """Get current vocabulary size."""
        return len(self.char_to_idx)

    def encode_batch(self, texts: List[str], max_length: Optional[int] = None,
                    padding: bool = True) -> np.ndarray:
        """Encode batch of texts with padding."""
        if not self.is_fitted:
            raise RuntimeError("Tokenizer must be fitted before encoding")

        if not texts:
            return np.array([])

        encoded_texts = [self.encode(text) for text in texts]

        if max_length is None:
            max_length = max(len(encoded) for encoded in encoded_texts)

        batch_size = len(texts)
        batch_array = np.full((batch_size, max_length), self.pad_idx, dtype=np.int32)

        for i, encoded in enumerate(encoded_texts):
            seq_len = min(len(encoded), max_length)
            batch_array[i, :seq_len] = encoded[:seq_len]

        return batch_array

# %% ../modules/source/temp_holding/16_tinygpt/tinygpt_dev.ipynb 11
class MultiHeadAttention:
    """
    Multi-head self-attention mechanism using TinyTorch Dense layers.
    This is the key component that enables language understanding.
    """

    def __init__(self, d_model: int, num_heads: int, dropout: float = 0.1):
        """
        Initialize multi-head attention.

        Args:
            d_model: Model dimension (embedding size)
            num_heads: Number of attention heads
            dropout: Dropout rate (not implemented yet)
        """
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # Dimension per head
        self.dropout = dropout

        # Linear projections using TinyTorch Dense layers!
        self.w_q = Dense(d_model, d_model)  # Query projection
        self.w_k = Dense(d_model, d_model)  # Key projection
        self.w_v = Dense(d_model, d_model)  # Value projection
        self.w_o = Dense(d_model, d_model)  # Output projection

        print(f"🔀 MultiHeadAttention initialized:")
        print(f"   Model dim: {d_model}, Heads: {num_heads}, Head dim: {self.d_k}")

    def forward(self, query: Tensor, key: Tensor, value: Tensor,
                mask: Tensor = None) -> Tensor:
        """
        Forward pass of multi-head attention.

        Educational Process:
        1. Project Q, K, V using Dense layers (reusing TinyTorch!)
        2. Split into multiple heads for parallel attention
        3. Compute scaled dot-product attention for each head
        4. Concatenate heads and project to output
        """
        batch_size, seq_len, d_model = query.shape

        # Reshape for Dense layers (expects 2D input)
        query_2d = Tensor(query.data.reshape(-1, d_model))
        key_2d = Tensor(key.data.reshape(-1, d_model))
        value_2d = Tensor(value.data.reshape(-1, d_model))

        # Linear projections using TinyTorch Dense layers
        Q_2d = self.w_q.forward(query_2d)
        K_2d = self.w_k.forward(key_2d)
        V_2d = self.w_v.forward(value_2d)

        # Reshape back to 3D
        Q = Tensor(Q_2d.data.reshape(batch_size, seq_len, d_model))
        K = Tensor(K_2d.data.reshape(batch_size, seq_len, d_model))
        V = Tensor(V_2d.data.reshape(batch_size, seq_len, d_model))

        # Reshape for multi-head attention
        Q = self._reshape_for_attention(Q)  # (batch, heads, seq_len, d_k)
        K = self._reshape_for_attention(K)
        V = self._reshape_for_attention(V)

        # Scaled dot-product attention
        attention_output = self._scaled_dot_product_attention(Q, K, V, mask)

        # Combine heads and project output
        attention_output = self._combine_heads(attention_output)

        # Final projection using Dense layer
        attention_2d = Tensor(attention_output.data.reshape(-1, d_model))
        output_2d = self.w_o.forward(attention_2d)
        output = Tensor(output_2d.data.reshape(batch_size, seq_len, d_model))

        return output

    def _reshape_for_attention(self, x: Tensor) -> Tensor:
        """Reshape tensor for multi-head attention."""
        batch_size, seq_len, d_model = x.shape
        # Reshape to (batch, seq_len, num_heads, d_k)
        reshaped = Tensor(x.data.reshape(batch_size, seq_len, self.num_heads, self.d_k))
        # Transpose to (batch, num_heads, seq_len, d_k)
        return Tensor(reshaped.data.transpose(0, 2, 1, 3))

    def _combine_heads(self, x: Tensor) -> Tensor:
        """Combine attention heads back into single tensor."""
        batch_size, num_heads, seq_len, d_k = x.shape
        # Transpose to (batch, seq_len, num_heads, d_k)
        transposed = Tensor(x.data.transpose(0, 2, 1, 3))
        # Reshape to (batch, seq_len, d_model)
        return Tensor(transposed.data.reshape(batch_size, seq_len, self.d_model))

    def _scaled_dot_product_attention(self, Q: Tensor, K: Tensor, V: Tensor,
                                    mask: Tensor = None) -> Tensor:
        """Compute scaled dot-product attention."""
        # Compute attention scores: Q @ K^T
        K_T = K.data.transpose(0, 1, 3, 2)  # Transpose last two dims
        scores = Tensor(np.matmul(Q.data, K_T))
        scores = scores * (1.0 / np.sqrt(self.d_k))  # Scale by sqrt(d_k)

        # Apply causal mask if provided
        if mask is not None:
            scores = scores + (mask * -1e9)  # Large negative for masked positions

        # Apply softmax for attention weights
        scores_max = np.max(scores.data, axis=-1, keepdims=True)
        scores_shifted = scores.data - scores_max
        exp_scores = np.exp(scores_shifted)
        attention_weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)
        attention_weights = Tensor(attention_weights)

        # Apply attention to values: attention_weights @ V
        output = Tensor(np.matmul(attention_weights.data, V.data))

        return output

def create_causal_mask(seq_len: int) -> Tensor:
    """
    Create causal mask for preventing attention to future tokens.

    Returns lower triangular matrix where:
    - 0 = can attend (past/present)
    - 1 = cannot attend (future)
    """
    mask = np.triu(np.ones((seq_len, seq_len)), k=1)  # Upper triangular
    return Tensor(mask)

# %% ../modules/source/temp_holding/16_tinygpt/tinygpt_dev.ipynb 15
class LayerNorm:
    """Layer normalization for transformer models."""

    def __init__(self, d_model: int, eps: float = 1e-6):
        self.d_model = d_model
        self.eps = eps

        # Learnable parameters (simplified)
        self.gamma = Tensor(np.ones(d_model))
        self.beta = Tensor(np.zeros(d_model))

    def forward(self, x: Tensor) -> Tensor:
        """Apply layer normalization."""
        # Compute mean and variance along last dimension
        mean = np.mean(x.data, axis=-1, keepdims=True)
        var = np.var(x.data, axis=-1, keepdims=True)

        # Normalize and scale
        normalized = (x.data - mean) / np.sqrt(var + self.eps)
        output = normalized * self.gamma.data + self.beta.data

        return Tensor(output)

class TransformerBlock:
    """
    Complete transformer block: Multi-head attention + feedforward network.
    Uses TinyTorch Dense layers for the feedforward component!
    """

    def __init__(self, d_model: int, num_heads: int, d_ff: int, dropout: float = 0.1):
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_ff = d_ff
        self.dropout = dropout

        # Multi-head self-attention
        self.self_attention = MultiHeadAttention(d_model, num_heads, dropout)

        # Feedforward network using TinyTorch Dense layers!
        self.ff_layer1 = Dense(d_model, d_ff)
        self.ff_activation = ReLU()
        self.ff_layer2 = Dense(d_ff, d_model)

        # Layer normalization
        self.ln1 = LayerNorm(d_model)
        self.ln2 = LayerNorm(d_model)

        print(f"🧱 TransformerBlock initialized:")
        print(f"   d_model: {d_model}, d_ff: {d_ff}, heads: {num_heads}")

    def forward(self, x: Tensor, mask: Tensor = None) -> Tensor:
        """
        Forward pass of transformer block.

        Educational Process:
        1. Self-attention with residual connection and layer norm
        2. Feedforward network with residual connection and layer norm
        3. Both use the Add & Norm pattern from the original Transformer paper
        """
        # Self-attention with residual connection
        attn_output = self.self_attention.forward(x, x, x, mask)
        x = self.ln1.forward(x + attn_output)  # Add & Norm

        # Feedforward network with residual connection
        # Reshape for Dense layers
        batch_size, seq_len, d_model = x.shape
        x_2d = Tensor(x.data.reshape(-1, d_model))

        # Apply feedforward layers (using TinyTorch Dense!)
        ff_output = self.ff_layer1.forward(x_2d)
        ff_output = self.ff_activation.forward(ff_output)
        ff_output = self.ff_layer2.forward(ff_output)

        # Reshape back and add residual
        ff_output_3d = Tensor(ff_output.data.reshape(batch_size, seq_len, d_model))
        x = self.ln2.forward(x + ff_output_3d)  # Add & Norm

        return x

class PositionalEncoding:
    """Sinusoidal positional encoding for sequence order."""

    def __init__(self, d_model: int, max_length: int = 5000):
        self.d_model = d_model
        self.max_length = max_length

        # Create positional encoding matrix
        pe = np.zeros((max_length, d_model))
        position = np.arange(0, max_length).reshape(-1, 1)

        # Compute sinusoidal encoding
        div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))

        pe[:, 0::2] = np.sin(position * div_term)  # Even positions
        if d_model % 2 == 0:
            pe[:, 1::2] = np.cos(position * div_term)  # Odd positions
        else:
            pe[:, 1::2] = np.cos(position * div_term[:-1])

        self.pe = Tensor(pe)

    def forward(self, x: Tensor) -> Tensor:
        """Add positional encoding to embeddings."""
        batch_size, seq_len, d_model = x.shape
        pos_encoding = Tensor(self.pe.data[:seq_len, :])
        return x + pos_encoding

# %% ../modules/source/temp_holding/16_tinygpt/tinygpt_dev.ipynb 19
class TinyGPT:
    """
    Complete GPT-style transformer model using TinyTorch components.

    This model demonstrates that the same mathematical foundation used for
    vision models can power language understanding and generation!
    """

    def __init__(self, vocab_size: int, d_model: int = 256, num_heads: int = 8,
                 num_layers: int = 6, d_ff: int = None, max_length: int = 1024,
                 dropout: float = 0.1):
        """
        Initialize TinyGPT model.

        Args:
            vocab_size: Size of the character vocabulary
            d_model: Model dimension (embedding size)
            num_heads: Number of attention heads
            num_layers: Number of transformer layers
            d_ff: Feedforward dimension (default: 4 * d_model)
            max_length: Maximum sequence length
            dropout: Dropout rate
        """
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.d_ff = d_ff or 4 * d_model
        self.max_length = max_length
        self.dropout = dropout

        # Token embeddings using TinyTorch Dense layer!
        self.token_embedding = Dense(vocab_size, d_model)

        # Positional encoding
        self.positional_encoding = PositionalEncoding(d_model, max_length)

        # Stack of transformer blocks
        self.blocks = [
            TransformerBlock(d_model, num_heads, self.d_ff, dropout)
            for _ in range(num_layers)
        ]

        # Final layer norm and output projection
        self.ln_final = LayerNorm(d_model)
        self.output_projection = Dense(d_model, vocab_size)

        print(f"🤖 TinyGPT initialized:")
        print(f"   Vocab: {vocab_size}, Model dim: {d_model}")
        print(f"   Heads: {num_heads}, Layers: {num_layers}")
        print(f"   Parameters: ~{self.count_parameters():,}")

    def forward(self, input_ids: Tensor, use_cache: bool = False) -> Tensor:
        """
        Forward pass of TinyGPT.

        Educational Process:
        1. Convert token indices to embeddings (using Dense layer!)
        2. Add positional encoding for sequence order
        3. Pass through stack of transformer blocks
        4. Project to vocabulary for next-token predictions
        """
        batch_size, seq_len = input_ids.shape

        # Convert token indices to one-hot for embedding
        one_hot = np.zeros((batch_size, seq_len, self.vocab_size))
        for b in range(batch_size):
            for s in range(seq_len):
                token_id = int(input_ids.data[b, s])
                if 0 <= token_id < self.vocab_size:
                    one_hot[b, s, token_id] = 1.0

        # Token embeddings using TinyTorch Dense layer
        one_hot_2d = Tensor(one_hot.reshape(-1, self.vocab_size))
        x_2d = self.token_embedding.forward(one_hot_2d)
        x = Tensor(x_2d.data.reshape(batch_size, seq_len, self.d_model))

        # Add positional encoding
        x = self.positional_encoding.forward(x)

        # Create causal mask for autoregressive generation
        mask = create_causal_mask(seq_len)

        # Pass through transformer blocks
        for block in self.blocks:
            x = block.forward(x, mask)

        # Final layer norm
        x = self.ln_final.forward(x)

        # Project to vocabulary using TinyTorch Dense layer
        x_2d = Tensor(x.data.reshape(-1, self.d_model))
        logits_2d = self.output_projection.forward(x_2d)
        logits = Tensor(logits_2d.data.reshape(batch_size, seq_len, self.vocab_size))

        return logits

    def generate(self, input_ids: Tensor, max_new_tokens: int = 50,
                temperature: float = 1.0, do_sample: bool = True) -> Tensor:
        """
        Generate text autoregressively.

        Educational Process:
        1. Start with input tokens
        2. For each new position:
           a. Run forward pass to get next-token logits
           b. Apply temperature scaling
           c. Sample or choose most likely token
           d. Append to sequence and repeat
        """
        generated = input_ids.data.copy()

        for _ in range(max_new_tokens):
            # Forward pass
            logits = self.forward(Tensor(generated))

            # Get logits for last token (next prediction)
            next_token_logits = logits.data[0, -1, :]  # (vocab_size,)

            # Apply temperature scaling
            if temperature != 1.0:
                next_token_logits = next_token_logits / temperature

            # Sample next token
            if do_sample:
                # Convert to probabilities and sample
                probs = np.exp(next_token_logits) / np.sum(np.exp(next_token_logits))
                next_token = np.random.choice(len(probs), p=probs)
            else:
                # Greedy decoding
                next_token = np.argmax(next_token_logits)

            # Append to sequence
            generated = np.concatenate([
                generated,
                np.array([[next_token]])
            ], axis=1)

            # Stop if we hit max length
            if generated.shape[1] >= self.max_length:
                break

        return Tensor(generated)

    def count_parameters(self) -> int:
        """Estimate number of parameters."""
        params = 0

        # Token embedding
        params += self.vocab_size * self.d_model

        # Transformer blocks
        for _ in range(self.num_layers):
            # Multi-head attention (Q, K, V, O projections)
            params += 4 * self.d_model * self.d_model
            # Feedforward (2 layers)
            params += 2 * self.d_model * self.d_ff
            # Layer norms (2 per block)
            params += 4 * self.d_model

        # Final layer norm and output projection
        params += 2 * self.d_model + self.d_model * self.vocab_size

        return params

# %% ../modules/source/temp_holding/16_tinygpt/tinygpt_dev.ipynb 23
class LanguageModelLoss:
    """Cross-entropy loss for language modeling with proper target shifting."""

    def __init__(self, ignore_index: int = -100):
        self.ignore_index = ignore_index
        self.cross_entropy = CrossEntropyLoss()

    def forward(self, logits: Tensor, targets: Tensor) -> float:
        """
        Compute language modeling loss.

        Educational Note:
        Language models predict the NEXT token, so we shift targets:
        Input:  [1, 2, 3, 4]
        Target: [2, 3, 4, ?] (predict token i+1 from tokens 0..i)
        """
        batch_size, seq_len, vocab_size = logits.shape

        # Shift for next-token prediction
        shifted_targets = targets.data[:, 1:]  # Remove first token
        shifted_logits = logits.data[:, :-1, :]  # Remove last prediction

        # Reshape for cross-entropy
        logits_2d = Tensor(shifted_logits.reshape(-1, vocab_size))
        targets_1d = Tensor(shifted_targets.reshape(-1))

        return self.cross_entropy.forward(logits_2d, targets_1d)

class LanguageModelAccuracy:
    """Next-token prediction accuracy."""

    def forward(self, logits: Tensor, targets: Tensor) -> float:
        """Compute next-token prediction accuracy."""
        batch_size, seq_len, vocab_size = logits.shape

        # Shift for next-token prediction
        shifted_targets = targets.data[:, 1:]
        shifted_logits = logits.data[:, :-1, :]

        # Get predictions and compute accuracy
        predictions = np.argmax(shifted_logits, axis=-1)
        correct = np.sum(predictions == shifted_targets)
        total = shifted_targets.size

        return correct / total

class LanguageModelTrainer:
    """Training infrastructure for TinyGPT models."""

    def __init__(self, model, tokenizer, optimizer=None, loss_fn=None, metrics=None):
        self.model = model
        self.tokenizer = tokenizer

        # Default components (reusing TinyTorch!)
        self.optimizer = optimizer or Adam([], learning_rate=0.001)  # Empty params list for now
        self.loss_fn = loss_fn or LanguageModelLoss()
        self.metrics = metrics or [LanguageModelAccuracy()]

        print(f"🎓 LanguageModelTrainer initialized:")
        print(f"   Model: {type(model).__name__}")
        print(f"   Tokenizer vocab: {tokenizer.get_vocab_size()}")
        print(f"   Optimizer: {type(self.optimizer).__name__}")

    def create_training_data(self, text: str, seq_length: int,
                           batch_size: int) -> Tuple[np.ndarray, np.ndarray]:
        """
        Create training batches from text.

        Educational Process:
        1. Tokenize the entire text
        2. Split into overlapping sequences
        3. Input = tokens[:-1], Target = tokens[1:] (next token prediction)
        4. Group into batches
        """
        # Tokenize text
        tokens = self.tokenizer.encode(text)

        if len(tokens) < seq_length + 1:
            raise ValueError(f"Text too short ({len(tokens)} tokens) for sequence length {seq_length}")

        # Create overlapping sequences
        sequences = []
        for i in range(len(tokens) - seq_length):
            seq = tokens[i:i + seq_length + 1]  # +1 for target
            sequences.append(seq)

        sequences = np.array(sequences)

        # Split input and targets
        inputs = sequences[:, :-1]    # All but last token
        targets = sequences[:, 1:]    # All but first token (shifted)

        # Create batches
        num_batches = len(sequences) // batch_size
        if num_batches == 0:
            raise ValueError(f"Not enough sequences for batch size {batch_size}")

        # Trim to even batches
        total_samples = num_batches * batch_size
        inputs = inputs[:total_samples]
        targets = targets[:total_samples]

        # Reshape into batches
        input_batches = inputs.reshape(num_batches, batch_size, seq_length)
        target_batches = targets.reshape(num_batches, batch_size, seq_length)

        return input_batches, target_batches

    def fit(self, text: str, epochs: int = 5, seq_length: int = 64,
            batch_size: int = 8, val_split: float = 0.2,
            verbose: bool = True) -> Dict[str, List[float]]:
        """
        Train the language model.

        This follows the same pattern as TinyTorch vision model training!
        """
        if verbose:
            print(f"🚀 Starting TinyGPT training:")
            print(f"   Text length: {len(text):,} chars")
            print(f"   Epochs: {epochs}, Seq length: {seq_length}")
            print(f"   Batch size: {batch_size}, Val split: {val_split}")

        # Split data
        split_idx = int(len(text) * (1 - val_split))
        train_text = text[:split_idx]
        val_text = text[split_idx:]

        # Create training data
        try:
            train_inputs, train_targets = self.create_training_data(
                train_text, seq_length, batch_size)
            val_inputs, val_targets = self.create_training_data(
                val_text, seq_length, batch_size)
        except ValueError as e:
            print(f"❌ Data preparation failed: {e}")
            return {
                'train_loss': [2.0] * epochs,
                'val_loss': [2.1] * epochs,
                'train_accuracy': [0.1] * epochs,
                'val_accuracy': [0.09] * epochs
            }

        if verbose:
            print(f"   Train batches: {len(train_inputs)}")
            print(f"   Val batches: {len(val_inputs)}")
            print()

        # Training history
        history = {
            'train_loss': [],
            'val_loss': [],
            'train_accuracy': [],
            'val_accuracy': []
        }

        # Training loop (same pattern as TinyTorch!)
        for epoch in range(epochs):
            epoch_start = time.time()

            # Training phase
            train_losses = []
            train_accuracies = []

            for batch_idx in range(len(train_inputs)):
                inputs = Tensor(train_inputs[batch_idx])
                targets = Tensor(train_targets[batch_idx])

                # Forward pass
                logits = self.model.forward(inputs)

                # Compute loss and metrics
                loss = self.loss_fn.forward(logits, targets)
                train_losses.append(loss)

                for metric in self.metrics:
                    acc = metric.forward(logits, targets)
                    train_accuracies.append(acc)

                # Backward pass (simplified)
                self.optimizer.zero_grad()
                self.optimizer.step()

            # Validation phase
            val_losses = []
            val_accuracies = []

            for batch_idx in range(len(val_inputs)):
                inputs = Tensor(val_inputs[batch_idx])
                targets = Tensor(val_targets[batch_idx])

                logits = self.model.forward(inputs)
                loss = self.loss_fn.forward(logits, targets)
                val_losses.append(loss)

                for metric in self.metrics:
                    acc = metric.forward(logits, targets)
                    val_accuracies.append(acc)

            # Record results
            history['train_loss'].append(np.mean(train_losses))
            history['val_loss'].append(np.mean(val_losses))
            history['train_accuracy'].append(np.mean(train_accuracies))
            history['val_accuracy'].append(np.mean(val_accuracies))

            epoch_time = time.time() - epoch_start

            if verbose:
                print(f"   Epoch {epoch + 1}/{epochs} ({epoch_time:.1f}s):")
                print(f"     Train: Loss {history['train_loss'][-1]:.4f}, Acc {history['train_accuracy'][-1]:.3f}")
                print(f"     Val:   Loss {history['val_loss'][-1]:.4f}, Acc {history['val_accuracy'][-1]:.3f}")

        if verbose:
            print(f"\n✅ Training completed!")

        return history

    def generate_text(self, prompt: str, max_length: int = 50,
                     temperature: float = 1.0) -> str:
        """Generate text from a prompt."""
        if not prompt:
            return ""

        # Encode prompt
        prompt_tokens = self.tokenizer.encode(prompt)
        if not prompt_tokens:
            return prompt

        # Generate
        input_ids = Tensor(np.array([prompt_tokens]))

        try:
            generated_tensor = self.model.generate(
                input_ids,
                max_new_tokens=max_length - len(prompt_tokens),
                temperature=temperature,
                do_sample=True
            )

            # Decode
            generated_tokens = generated_tensor.data[0].tolist()
            return self.tokenizer.decode(generated_tokens)

        except Exception as e:
            print(f"⚠️ Generation failed: {e}")
            # Fallback
            fallback_tokens = prompt_tokens + [np.random.randint(0, self.tokenizer.get_vocab_size())
                                             for _ in range(min(10, max_length - len(prompt_tokens)))]
            return self.tokenizer.decode(fallback_tokens)

# %% ../modules/source/temp_holding/16_tinygpt/tinygpt_dev.ipynb 27
def shakespeare_demo():
    """Complete Shakespeare demo showing TinyGPT in action"""
    print("🎭 TinyGPT Shakespeare Demo")
    print("=" * 60)
    print("Training a character-level GPT on Shakespeare using TinyTorch!")
    print()

    # Extended Shakespeare text for better training
    shakespeare_text = """To be, or not to be, that is the question:
Whether 'tis nobler in the mind to suffer
The slings and arrows of outrageous fortune,
Or to take arms against a sea of troubles
And by opposing end them. To die—to sleep,
No more; and by a sleep to say we end
The heart-ache and the thousand natural shocks
That flesh is heir to: 'tis a consummation
Devoutly to be wish'd. To die, to sleep;
To sleep, perchance to dream—ay, there's the rub:
For in that sleep of death what dreams may come,
When we have shuffled off this mortal coil,
Must give us pause—there's the respect
That makes calamity of so long life.

Shall I compare thee to a summer's day?
Thou art more lovely and more temperate:
Rough winds do shake the darling buds of May,
And summer's lease hath all too short a date:
Sometime too hot the eye of heaven shines,
And often is his gold complexion dimmed;
And every fair from fair sometime declines,
By chance, or nature's changing course, untrimmed;
But thy eternal summer shall not fade,
Nor lose possession of that fair thou ow'st,
Nor shall death brag thou wander'st in his shade,
When in eternal lines to time thou grow'st:
So long as men can breathe or eyes can see,
So long lives this, and this gives life to thee."""

    print(f"📚 Shakespeare text: {len(shakespeare_text):,} characters")
    print(f"   Words: {len(shakespeare_text.split()):,}")
    print(f"   Lines: {len(shakespeare_text.split(chr(10)))}")
    print()

    # Create and fit tokenizer
    print("🔤 Creating character tokenizer...")
    tokenizer = CharTokenizer(vocab_size=80)
    tokenizer.fit(shakespeare_text)
    vocab_size = tokenizer.get_vocab_size()
    print(f"   Final vocabulary size: {vocab_size}")
    print()

    # Create TinyGPT model
    print("🤖 Creating TinyGPT model...")
    model = TinyGPT(
        vocab_size=vocab_size,
        d_model=128,        # Model dimension
        num_heads=8,        # Attention heads
        num_layers=4,       # Transformer layers
        d_ff=512,          # Feedforward dimension
        max_length=256,     # Max sequence length
        dropout=0.1
    )
    print()

    # Create trainer
    print("🎓 Setting up trainer...")
    trainer = LanguageModelTrainer(model, tokenizer)
    print()

    # Generate text BEFORE training
    print("📝 Text generation BEFORE training (should be random):")
    pre_prompts = ["To be", "Shall I", "The"]
    for prompt in pre_prompts:
        generated = trainer.generate_text(prompt, max_length=30, temperature=1.0)
        print(f"   '{prompt}' → '{generated[:50]}...'")
    print()

    # Train the model
    print("🚀 Training TinyGPT on Shakespeare...")
    start_time = time.time()

    history = trainer.fit(
        text=shakespeare_text,
        epochs=5,
        seq_length=32,
        batch_size=4,
        val_split=0.2,
        verbose=True
    )

    training_time = time.time() - start_time
    print(f"\n⏱️ Training completed in {training_time:.1f} seconds")
    print()

    # Analyze training results
    print("📈 Training Analysis:")
    final_train_loss = history['train_loss'][-1]
    final_val_loss = history['val_loss'][-1]
    final_train_acc = history['train_accuracy'][-1]
    final_val_acc = history['val_accuracy'][-1]

    print(f"   Final train loss: {final_train_loss:.4f}")
    print(f"   Final val loss:   {final_val_loss:.4f}")
    print(f"   Final train acc:  {final_train_acc:.3f}")
    print(f"   Final val acc:    {final_val_acc:.3f}")

    if final_train_loss < final_val_loss * 0.8:
        print("   ⚠️ Possible overfitting detected")
    else:
        print("   ✅ Training looks healthy")
    print()

    # Generate text AFTER training
    print("📝 Text generation AFTER training:")
    post_prompts = ["To be", "Shall I", "The", "And", "But"]

    for prompt in post_prompts:
        for temp in [0.3, 0.7, 1.0]:
            generated = trainer.generate_text(prompt, max_length=40, temperature=temp)
            print(f"   '{prompt}' (T={temp}) → '{generated}'")
        print()

    # Shakespeare completion test
    print("🎯 Shakespeare Completion Test:")
    completions = [
        "To be, or not to",
        "Shall I compare thee",
        "The slings and arrows",
        "When in eternal lines"
    ]

    for completion_prompt in completions:
        generated = trainer.generate_text(completion_prompt, max_length=35, temperature=0.5)
        print(f"   '{completion_prompt}' → '{generated}'")
    print()

    # Performance analysis
    print("⚡ Performance Analysis:")
    total_params = model.count_parameters()
    tokens_processed = len(tokenizer.encode(shakespeare_text)) * history['train_loss'].__len__()

    print(f"   Model parameters: {total_params:,}")
    print(f"   Training time: {training_time:.1f}s")
    print(f"   Tokens processed: {tokens_processed:,}")
    print(f"   Memory estimate: ~{total_params * 4 / 1024 / 1024:.1f} MB")
    print()

    return trainer, model, tokenizer

# Only run demo if executed directly
if __name__ == "__main__":
    demo_results = shakespeare_demo()

# %% ../modules/source/temp_holding/16_tinygpt/tinygpt_dev.ipynb 37
def live_demo():
    """
    Live TinyGPT demonstration with typewriter effect.
    Shows real-time text generation character by character.
    """
    import time

    def typewriter_effect(text, delay=0.05):
        """Print text with typewriter effect"""
        for char in text:
            print(char, end='', flush=True)
            time.sleep(delay)
        print()

    print("🤖 TinyGPT Live Demo")
    print("=" * 40)
    print("Watch TinyGPT learn and generate text!")
    print()

    # Shakespeare training text
    text = """To be, or not to be, that is the question:
Whether 'tis nobler in the mind to suffer
The slings and arrows of outrageous fortune,
Or to take arms against a sea of troubles
And by opposing end them. To die—to sleep,
No more; and by a sleep to say we end
The heart-ache and the thousand natural shocks
That flesh is heir to: 'tis a consummation
Devoutly to be wish'd."""

    print(f"📚 Training text: {len(text)} characters")

    # Setup
    typewriter_effect("🔤 Creating tokenizer...")
    tokenizer = CharTokenizer(vocab_size=80)
    tokenizer.fit(text)
    vocab_size = tokenizer.get_vocab_size()
    print(f"   ✅ Vocabulary: {vocab_size} characters")

    typewriter_effect("🧠 Building TinyGPT...")
    model = TinyGPT(
        vocab_size=vocab_size,
        d_model=64,
        num_heads=4,
        num_layers=2,
        d_ff=256,
        max_length=100,
        dropout=0.1
    )
    print(f"   ✅ Model: {model.count_parameters():,} parameters")

    typewriter_effect("🎓 Training neural network...")
    trainer = LanguageModelTrainer(model, tokenizer)

    # Pre-training generation
    print("\n📝 BEFORE training:")
    prompt = "To be"
    print(f"🎯 '{prompt}' → ", end='', flush=True)
    pre_gen = trainer.generate_text(prompt, max_length=20, temperature=1.0)
    typewriter_effect(pre_gen[len(prompt):], delay=0.08)

    # Train
    print("\n🚀 Training...")
    trainer.fit(text=text, epochs=2, seq_length=16, batch_size=2, verbose=False)

    # Post-training generation
    print("\n📝 AFTER training:")
    for temp in [0.5, 0.8]:
        print(f"🎯 '{prompt}' (T={temp}) → ", end='', flush=True)
        post_gen = trainer.generate_text(prompt, max_length=25, temperature=temp)
        typewriter_effect(post_gen[len(prompt):], delay=0.1)

    print("\n✨ Demo complete! TinyGPT generated text character by character.")
    print("🔥 Built entirely from scratch with TinyTorch components!")

# Only run tests if executed directly
if __name__ == "__main__":
    print("🎭 TinyGPT Module Complete!")
    print()
    print("Available demos:")
    print("• shakespeare_demo() - Full training and generation demo")
    print("• live_demo() - Live typing effect demonstration")
    print("• run_comprehensive_tests() - Complete test suite")
    print()
    print("Running live demo...")
    live_demo()