# TinyTorch/tinytorch/applications/tinygpt.py

# ╔═══════════════════════════════════════════════════════════════════════════════╗
# ║                        🚨 CRITICAL WARNING 🚨                                ║
# ║                     AUTOGENERATED! DO NOT EDIT!                              ║
# ║                                                                               ║
# ║  This file is AUTOMATICALLY GENERATED from source modules.                   ║
# ║  ANY CHANGES MADE HERE WILL BE LOST when modules are re-exported!            ║
# ║                                                                               ║
# ║  ✅ TO EDIT: src/XX_tinygpt/XX_tinygpt.py                           ║
# ║  ✅ TO EXPORT: Run 'tito module complete <module_name>'                      ║
# ║                                                                               ║
# ║  🛡️ STUDENT PROTECTION: This file contains optimized implementations.        ║
# ║     Editing it directly may break module functionality and training.         ║
# ║                                                                               ║
# ║  🎓 LEARNING TIP: Work in src/ (developers) or modules/ (learners)    ║
# ║     The tinytorch/ directory is generated code - edit source files instead!  ║
# ╚═══════════════════════════════════════════════════════════════════════════════╝
# %% auto 0
# Public names picked up by a star-import of this module: the three classes
# plus the unit-test function that accompanies each of them.
__all__ = ['TinyGPT', 'test_unit_tinygpt_init', 'TinyGPTTrainer', 'test_unit_training_pipeline', 'CompleteTinyGPTPipeline',
           'test_unit_complete_pipeline']

# %% ../../modules/20_capstone/20_capstone.ipynb 2
#| default_exp applications.tinygpt
#| export

# %% ../../modules/20_capstone/20_capstone.ipynb 7
class TinyGPT:
    """
    Complete GPT implementation integrating all TinyTorch modules.

    This class demonstrates how framework components compose into real applications.
    Built using modules 01,02,03,11,12,13 as core architecture.

    Architecture:
    - Token Embeddings (Module 11)
    - Positional Encoding (Module 11)
    - Transformer Blocks (Module 13)
    - Output Linear Layer (Module 03)
    - Language Modeling Head (Module 04)

    NOTE(review): other code in this file calls ``model.forward(...)`` and
    ``model.generate(...)``; neither is defined in this class body, so they must
    be attached elsewhere before training or generation can run — confirm.
    """

    def __init__(self, vocab_size: int, embed_dim: int = 128, num_layers: int = 4,
                 num_heads: int = 4, max_seq_len: int = 256, dropout: float = 0.1):
        """
        Initialize TinyGPT with production-inspired architecture.

        TODO: Build a complete GPT model using TinyTorch components

        APPROACH:
        1. Create token embeddings (vocab_size × embed_dim)
        2. Create positional encoding (max_seq_len × embed_dim)
        3. Build transformer layers using TransformerBlock
        4. Add output projection layer
        5. Calculate and report parameter count

        ARCHITECTURE DECISIONS:
        - embed_dim=128: Small enough for fast training, large enough for learning
        - num_layers=4: Sufficient depth without excessive memory
        - num_heads=4: Multi-head attention without head_dim being too small
        - max_seq_len=256: Reasonable context length for character-level modeling

        EXAMPLE (the printed number is illustrative):
        >>> model = TinyGPT(vocab_size=50, embed_dim=128, num_layers=4)
        >>> print(f"Parameters: {model.count_parameters():,}")
        Parameters: 1,234,567

        HINTS:
        - Use Embedding class for token embeddings
        - Use PositionalEncoding for position information
        - Stack TransformerBlock instances in a list
        - Final Linear layer maps embed_dim → vocab_size
        """
        ### BEGIN SOLUTION
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.max_seq_len = max_seq_len
        self.dropout = dropout

        # Token embeddings: convert token IDs to dense vectors
        self.token_embedding = Embedding(vocab_size, embed_dim)

        # Positional encoding: add position information
        self.positional_encoding = PositionalEncoding(max_seq_len, embed_dim)

        # Transformer layers: core processing
        self.transformer_blocks = []
        for _ in range(num_layers):
            block = TransformerBlock(embed_dim, num_heads, mlp_ratio=4.0)
            self.transformer_blocks.append(block)

        # Output projection: map back to vocabulary
        self.output_projection = Linear(embed_dim, vocab_size)

        # Dropout for regularization
        self.dropout_layer = Dropout(dropout)

        # Calculate parameter count for systems analysis
        self._param_count = self.count_parameters()
        print(f"🏗️ TinyGPT initialized: {self._param_count:,} parameters")
        print(f"📐 Architecture: {num_layers}L/{num_heads}H/{embed_dim}D")
        print(f"💾 Estimated memory: {self._param_count * BYTES_PER_FLOAT32 / MB_TO_BYTES:.1f}MB")
        ### END SOLUTION

    def parameters(self) -> list:
        """
        Collect the parameter tensors of every sub-module into one flat list.

        Returns:
            A new list holding the parameters of the token embedding, the
            positional encoding (when it exposes any), each transformer block
            and the output projection, in that order.
        """
        params = list(self.token_embedding.parameters())

        # The positional encoding may be fixed rather than learned, so only ask
        # it for parameters when it actually offers a parameters() method.
        positional = getattr(self.positional_encoding, "parameters", None)
        if callable(positional):
            params.extend(positional())

        for block in self.transformer_blocks:
            params.extend(block.parameters())
        params.extend(self.output_projection.parameters())
        return params

    def count_parameters(self) -> int:
        """
        Return the total number of scalar values across all model parameters.

        This is the method ``__init__`` relies on for its summary printout.
        """
        # assumes each parameter exposes a NumPy array as `.data`, the way the
        # Tensor objects used elsewhere in this file do — TODO confirm
        return sum(int(param.data.size) for param in self.parameters())

def test_unit_tinygpt_init():
    """🔬 Test TinyGPT initialization and parameter counting."""
    print("🔬 Unit Test: TinyGPT Initialization...")

    # A deliberately tiny configuration keeps construction fast
    config = dict(vocab_size=50, embed_dim=64, num_layers=2, num_heads=2, max_seq_len=128)
    model = TinyGPT(**config)

    # Every architectural component must have been attached by __init__
    expected_components = (
        'token_embedding',
        'positional_encoding',
        'transformer_blocks',
        'output_projection',
    )
    for component in expected_components:
        assert hasattr(model, component)
    assert len(model.transformer_blocks) == 2

    # The parameter count must be positive yet small for this configuration
    param_count = model.count_parameters()
    assert 0 < param_count < 1000000

    print(f"✅ Model created with {param_count:,} parameters")
    print("✅ TinyGPT initialization works correctly!")

# Run immediate test when developing this module.
# The guard keeps the test from firing when the file is imported as a library;
# it only runs when the file is executed directly as a script.
if __name__ == "__main__":
    test_unit_tinygpt_init()

# %% ../../modules/20_capstone/20_capstone.ipynb 10
class TinyGPTTrainer:
    """
    Complete training pipeline integrating optimizers, schedulers, and monitoring.

    Uses modules 05 (autograd), 06 (optimizers), 07 (training) for end-to-end training.
    """

    # Token ID used both to pad short sequences and as the target of positions
    # that have no real "next token".
    PAD_TOKEN_ID = 0

    def __init__(self, model: "TinyGPT", tokenizer: "CharTokenizer",
                 learning_rate: float = 3e-4, weight_decay: float = 0.01):
        """
        Initialize trainer with model and optimization components.

        TODO: Set up complete training infrastructure

        APPROACH:
        1. Store model and tokenizer references
        2. Initialize AdamW optimizer (standard for transformers)
        3. Initialize loss function (CrossEntropyLoss for language modeling)
        4. Set up learning rate scheduler (cosine schedule)
        5. Initialize training metrics tracking

        PRODUCTION CHOICES:
        - AdamW: Better generalization than Adam (weight decay)
        - learning_rate=3e-4: Standard for small transformers
        - Cosine schedule: Smooth learning rate decay
        - CrossEntropy: Standard for classification/language modeling

        EXAMPLE:
        >>> model = TinyGPT(vocab_size=100)
        >>> tokenizer = CharTokenizer(['a', 'b', 'c'])
        >>> trainer = TinyGPTTrainer(model, tokenizer)
        >>> print("Trainer ready for training")
        Trainer ready for training

        HINTS:
        - Gather parameters from every model component via its parameters() method
        - Use AdamW with weight_decay for better generalization
        - CrossEntropyLoss handles the language modeling objective

        Project types in the signatures of this class are written as string
        forward references, so defining the class does not require those names
        to be bound yet.
        """
        ### BEGIN SOLUTION
        self.model = model
        self.tokenizer = tokenizer

        # Collect all trainable parameters
        all_params = []
        all_params.extend(model.token_embedding.parameters())

        # Positional encodings were previously left out of the optimizer, so a
        # learned encoding would never be updated. Include them when the layer
        # exposes parameters().
        # NOTE(review): presumes those parameters are meant to be trained — confirm.
        positional = getattr(getattr(model, "positional_encoding", None), "parameters", None)
        if callable(positional):
            all_params.extend(positional())

        for block in model.transformer_blocks:
            all_params.extend(block.parameters())
        all_params.extend(model.output_projection.parameters())

        # Initialize optimizer (AdamW for transformers)
        self.optimizer = AdamW(
            params=all_params,
            lr=learning_rate,
            weight_decay=weight_decay,
            betas=(0.9, 0.95)  # Standard for language models
        )

        # Loss function for next token prediction
        self.loss_fn = CrossEntropyLoss()

        # Learning rate scheduler
        self.scheduler = CosineSchedule(
            optimizer=self.optimizer,
            max_epochs=100,  # Will adjust based on actual training
            min_lr=learning_rate * 0.1
        )

        # Training metrics
        self.training_history = {
            'losses': [],
            'perplexities': [],
            'learning_rates': [],
            'epoch': 0
        }

        print(f"🚀 Trainer initialized:")
        print(f"   Optimizer: AdamW (lr={learning_rate}, wd={weight_decay})")
        print(f"   Parameters: {len(all_params):,} tensors")
        print(f"   Loss: CrossEntropyLoss")
        ### END SOLUTION

    @staticmethod
    def _shifted_pair(tokens, max_length, pad_id):
        """Build one (inputs, targets) pair of plain lists, each exactly max_length long.

        ``targets[i]`` is the token that follows ``inputs[i]``. One extra token
        beyond ``max_length`` is kept so a truncated sequence still has a real
        target in its last position. Positions without a successor get ``pad_id``.
        """
        # Copying also protects the tokenizer's own list from in-place edits.
        window = list(tokens[:max_length + 1])
        inputs = window[:max_length]
        targets = window[1:]
        inputs += [pad_id] * (max_length - len(inputs))
        targets += [pad_id] * (max_length - len(targets))
        return inputs, targets

    def prepare_batch(self, text_batch: List[str], max_length: int = 128) -> "Tuple[Tensor, Tensor]":
        """
        Convert text batch to input/target tensors for language modeling.

        TODO: Implement text-to-tensor conversion with proper targets

        APPROACH:
        1. Tokenize each text in the batch
        2. Pad/truncate to consistent length
        3. Create input_ids (text) and target_ids (text shifted by 1)
        4. Convert to Tensor format

        LANGUAGE MODELING OBJECTIVE:
        - Input: [token1, token2, token3, token4]
        - Target: [token2, token3, token4, token5]
        - Model predicts next token at each position

        EXAMPLE:
        >>> trainer = TinyGPTTrainer(model, tokenizer)
        >>> texts = ["hello world", "ai is fun"]
        >>> inputs, targets = trainer.prepare_batch(texts)
        >>> print(inputs.shape, targets.shape)
        (2, 128) (2, 128)

        HINTS:
        - Use tokenizer.encode() for text → token conversion
        - Pad shorter sequences with the pad token (PAD_TOKEN_ID)
        - Target sequence is the input sequence shifted LEFT by 1; the final
          position is filled with the real next token or the pad token, never
          with a token wrapped around from the start of the sequence

        Raises:
            ValueError: if max_length is smaller than 1.
        """
        ### BEGIN SOLUTION
        if max_length < 1:
            raise ValueError(f"max_length must be at least 1, got {max_length}")

        input_rows = []
        target_rows = []
        for text in text_batch:
            inputs, targets = self._shifted_pair(
                self.tokenizer.encode(text), max_length, self.PAD_TOKEN_ID
            )
            input_rows.append(inputs)
            target_rows.append(targets)

        # reshape(-1, max_length) keeps the result two-dimensional even when the
        # batch is empty.
        input_ids = Tensor(np.array(input_rows, dtype=np.int64).reshape(-1, max_length))
        target_ids = Tensor(np.array(target_rows, dtype=np.int64).reshape(-1, max_length))

        return input_ids, target_ids
        ### END SOLUTION

    def train_step(self, input_ids: "Tensor", target_ids: "Tensor") -> float:
        """
        Single training step with forward, backward, and optimization.

        TODO: Implement complete training step

        APPROACH:
        1. Zero gradients from previous step
        2. Forward pass to get logits
        3. Compute loss between logits and targets
        4. Backward pass to compute gradients
        5. Optimizer step to update parameters
        6. Return loss value for monitoring

        MEMORY MANAGEMENT:
        During training, memory usage is roughly 4× model size (plus activations):
        - 1× for parameters
        - 1× for gradients
        - 2× for optimizer states (Adam keeps a first and a second moment)

        EXAMPLE:
        >>> loss = trainer.train_step(input_ids, target_ids)
        >>> print(f"Training loss: {loss:.4f}")
        Training loss: 2.3456

        HINTS:
        - Always zero_grad() before forward pass
        - Loss should be computed on flattened logits and targets
        - Call backward() on the loss tensor

        Raises:
            ValueError: if target_ids does not hold exactly one target per
                (batch, position) pair of the logits.
        """
        ### BEGIN SOLUTION
        # Zero gradients from previous step
        self.optimizer.zero_grad()

        # Forward pass
        logits = self.model.forward(input_ids)  # (batch, seq_len, vocab_size)

        # Reshape for loss computation
        batch_size, seq_len, vocab_size = logits.shape
        num_positions = batch_size * seq_len

        # Fail early with a readable message instead of an opaque reshape error.
        num_targets = int(np.size(target_ids.data))
        if num_targets != num_positions:
            raise ValueError(
                f"target_ids holds {num_targets} values but the logits cover "
                f"{batch_size}×{seq_len}={num_positions} positions"
            )

        logits_flat = logits.reshape(num_positions, vocab_size)
        targets_flat = target_ids.reshape(num_positions)

        # Compute loss
        loss = self.loss_fn.forward(logits_flat, targets_flat)

        # Backward pass
        loss.backward()

        # Optimizer step
        self.optimizer.step()

        # Return scalar loss for monitoring.
        # .item() handles 0-d and single-element arrays alike, whereas float()
        # on a 1-element array of higher rank is deprecated in recent NumPy.
        return float(np.asarray(loss.data).item())
        ### END SOLUTION

def test_unit_training_pipeline():
    """🔬 Test training pipeline components."""
    print("🔬 Unit Test: Training Pipeline...")

    # Small model, six-symbol tokenizer and a trainer wired to both
    model = TinyGPT(vocab_size=50, embed_dim=32, num_layers=2, num_heads=2)
    tokenizer = CharTokenizer(['a', 'b', 'c', 'd', 'e', ' '])
    trainer = TinyGPTTrainer(model, tokenizer, learning_rate=1e-3)

    # Batch preparation must yield two tensors of identical (batch, length) shape
    input_ids, target_ids = trainer.prepare_batch(["hello", "world"], max_length=8)
    for tensor in (input_ids, target_ids):
        assert tensor.shape == (2, 8), f"Expected (2, 8), got {tensor.shape}"

    # First optimisation step
    initial_loss = trainer.train_step(input_ids, target_ids)
    assert initial_loss > 0, "Loss should be positive"

    # A second step proves gradients were computed, applied and reset
    second_loss = trainer.train_step(input_ids, target_ids)
    assert second_loss > 0, "Second loss should also be positive"

    print(f"✅ Batch preparation shape: {input_ids.shape}")
    print(f"✅ Initial loss: {initial_loss:.4f}")
    print(f"✅ Second loss: {second_loss:.4f}")
    print("✅ Training pipeline works correctly!")

# Run immediate test when developing this module.
# The guard keeps the test from firing when the file is imported as a library;
# it only runs when the file is executed directly as a script.
if __name__ == "__main__":
    test_unit_training_pipeline()

# %% ../../modules/20_capstone/20_capstone.ipynb 14
class CompleteTinyGPTPipeline:
    """
    End-to-end ML pipeline demonstrating integration of all 19 modules.

    Pipeline stages:
    1. Data preparation (Module 10: Tokenization)
    2. Model creation (Modules 01-04, 11-13: Architecture)
    3. Training setup (Modules 05-07: Optimization)
    4. Training loop (Module 08: DataLoader)
    5. Optimization (Modules 17-18: Quantization, Pruning)
    6. Evaluation (Module 19: Benchmarking)
    7. Generation (Module 14: KV Caching)
    """

    def __init__(self, vocab_size: int = 100, embed_dim: int = 128,
                 num_layers: int = 4, num_heads: int = 4):
        """
        Initialize complete end-to-end TinyGPT pipeline integrating all 19 modules.

        TODO: Set up a complete ML pipeline with tokenization, model, training,
        profiling, and benchmarking components

        APPROACH:
        1. Initialize tokenizer using CharTokenizer from Module 10 with printable ASCII (32-126)
        2. Make sure vocab_size is large enough for every ID the tokenizer can emit
        3. Store model architecture parameters (vocab_size, embed_dim, num_layers, num_heads)
        4. Create TinyGPT model instance with stored parameters and max_seq_len=256
        5. Setup TinyGPTTrainer for training orchestration with learning_rate=3e-4
        6. Initialize Profiler (Module 15) and Benchmark (Module 19) for performance analysis
        7. Initialize pipeline state tracking (is_trained flag, training_history)
        8. Print pipeline initialization summary with parameter count and memory usage

        EXAMPLE (printed numbers are illustrative):
        >>> pipeline = CompleteTinyGPTPipeline(vocab_size=100, embed_dim=128,
        ...                                     num_layers=4, num_heads=4)
        🏗️ Complete TinyGPT Pipeline Initialized
           Model: 419,300 parameters
           Memory: 1.6MB
        >>> pipeline.is_trained
        False
        >>> len(pipeline.training_history)
        0

        HINTS:
        - CharTokenizer needs list of characters: [chr(i) for i in range(32, 127)]
        - TinyGPT requires vocab_size, embed_dim, num_layers, num_heads, max_seq_len
        - TinyGPTTrainer takes model, tokenizer, and learning_rate as arguments
        - Benchmark expects (models_list, datasets_list, metrics_list) format
        - Memory calculation: parameters * 4 bytes / 1024 / 1024 for MB
        """

        ### BEGIN SOLUTION
        # Stage 1: Initialize tokenizer (Module 10)
        charset = [chr(i) for i in range(32, 127)]  # Printable ASCII
        self.tokenizer = CharTokenizer(charset)

        # A tokenizer over len(charset) distinct characters needs at least that
        # many distinct IDs; a smaller embedding table would be indexed out of
        # range on the first unseen-by-the-table character.
        # presumably the tokenizer reports its true size (incl. special tokens)
        # as `vocab_size`; fall back to the character count otherwise — confirm
        required_vocab = getattr(self.tokenizer, "vocab_size", len(charset))
        if not isinstance(required_vocab, int):
            required_vocab = len(charset)
        if vocab_size < required_vocab:
            print(f"⚠️ vocab_size={vocab_size} is smaller than the tokenizer vocabulary "
                  f"({required_vocab}); using {required_vocab} instead.")
            vocab_size = required_vocab

        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.num_heads = num_heads

        # Stage 2: Create model (Modules 01-04, 11-13)
        self.model = TinyGPT(
            vocab_size=vocab_size,
            embed_dim=embed_dim,
            num_layers=num_layers,
            num_heads=num_heads,
            max_seq_len=256
        )

        # Stage 3: Setup training (Modules 05-07)
        self.trainer = TinyGPTTrainer(self.model, self.tokenizer, learning_rate=3e-4)

        # Stage 4: Initialize profiler and benchmark (Modules 15, 19)
        self.profiler = Profiler()
        self.benchmark = Benchmark([self.model], [], ["perplexity", "latency"])

        # Pipeline state. training_history starts as an empty list and is
        # replaced by the history dict returned from train().
        self.is_trained = False
        self.training_history = []

        param_count = self.model.count_parameters()
        print("🏗️ Complete TinyGPT Pipeline Initialized")
        print(f"   Model: {param_count:,} parameters")
        print(f"   Memory: {param_count * 4 / 1024 / 1024:.1f}MB")
        ### END SOLUTION

    @staticmethod
    def _next_token_windows(tokens, max_len, pad_id=0):
        """Split a token list into right-padded (inputs, targets) windows of max_len.

        Within each window ``targets[i]`` is the token following ``inputs[i]``.
        Windows do not overlap, and the final token of the sequence only ever
        appears as a target because nothing follows it.
        """
        tokens = list(tokens)
        sources, successors = tokens[:-1], tokens[1:]
        windows = []
        for start in range(0, len(sources), max_len):
            inputs = sources[start:start + max_len]
            targets = successors[start:start + max_len]
            padding = [pad_id] * (max_len - len(inputs))
            windows.append((inputs + padding, targets + padding))
        return windows

    def prepare_training_data(self, text_corpus: List[str], batch_size: int = 8,
                              max_len: int = 32) -> "DataLoader":
        """
        Prepare training data using DataLoader (Module 08).

        TODO: Create DataLoader for training text data

        APPROACH:
        1. Tokenize all texts in corpus
        2. Cut each text into windows of at most max_len tokens and pair every
           window with the same window shifted left by one (next-token targets)
        3. Package into TensorDataset
        4. Create DataLoader with batching and shuffling

        Inputs and targets both have shape (num_windows, max_len), which is the
        per-position target layout that TinyGPTTrainer.train_step flattens.
        Texts that encode to fewer than two tokens are skipped.

        EXAMPLE:
        >>> pipeline = CompleteTinyGPTPipeline()
        >>> corpus = ["hello world", "ai is amazing"]
        >>> dataloader = pipeline.prepare_training_data(corpus, batch_size=2)
        >>> print(f"Batches: {len(dataloader)}")
        Batches: 1

        Raises:
            ValueError: if max_len is smaller than 1.
        """
        ### BEGIN SOLUTION
        if max_len < 1:
            raise ValueError(f"max_len must be at least 1, got {max_len}")

        input_sequences = []
        target_sequences = []

        for text in text_corpus:
            tokens = list(self.tokenizer.encode(text))
            if len(tokens) < 2:
                continue  # Skip very short texts: no (token, next token) pair exists

            for input_seq, target_seq in self._next_token_windows(tokens, max_len):
                input_sequences.append(input_seq)
                target_sequences.append(target_seq)

        # Convert to tensors; reshape keeps both arrays 2-D even with no examples
        inputs = Tensor(np.array(input_sequences, dtype=np.int64).reshape(-1, max_len))
        targets = Tensor(np.array(target_sequences, dtype=np.int64).reshape(-1, max_len))

        # Create dataset and dataloader
        dataset = TensorDataset(inputs, targets)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        print(f"📚 Training data prepared: {len(dataset)} examples, {len(dataloader)} batches")
        return dataloader
        ### END SOLUTION

    def train(self, dataloader: "DataLoader", epochs: int = 10) -> Dict[str, List[float]]:
        """
        Complete training loop with monitoring.

        TODO: Implement full training with progress tracking

        APPROACH:
        1. Loop through epochs
        2. For each batch: forward, backward, optimize
        3. Track loss and perplexity
        4. Update learning rate schedule
        5. Return training history

        EXAMPLE:
        >>> history = pipeline.train(dataloader, epochs=5)
        >>> print(f"Final loss: {history['losses'][-1]:.4f}")
        Final loss: 1.2345

        Returns:
            Dict with per-epoch 'losses', 'perplexities' and 'epochs' lists.

        Raises:
            ValueError: if epochs is smaller than 1 or the dataloader yields no
                batches (an average over zero losses would silently be NaN).
        """
        ### BEGIN SOLUTION
        if epochs < 1:
            raise ValueError(f"epochs must be at least 1, got {epochs}")

        history = {'losses': [], 'perplexities': [], 'epochs': []}

        print(f"🚀 Starting training for {epochs} epochs...")

        for epoch in range(epochs):
            epoch_losses = []

            for batch_idx, (inputs, targets) in enumerate(dataloader):
                # Training step
                loss = self.trainer.train_step(inputs, targets)
                epoch_losses.append(loss)

                # Log progress
                if batch_idx % 10 == 0:
                    perplexity = np.exp(loss)
                    print(f"   Epoch {epoch+1}/{epochs}, Batch {batch_idx}: "
                          f"Loss={loss:.4f}, PPL={perplexity:.2f}")

            if not epoch_losses:
                raise ValueError("dataloader produced no batches; nothing to train on")

            # Epoch summary (plain floats keep the history easy to serialise)
            avg_loss = float(np.mean(epoch_losses))
            avg_perplexity = float(np.exp(avg_loss))

            history['losses'].append(avg_loss)
            history['perplexities'].append(avg_perplexity)
            history['epochs'].append(epoch + 1)

            # Update learning rate
            self.trainer.scheduler.step()

            print(f"✅ Epoch {epoch+1} complete: Loss={avg_loss:.4f}, PPL={avg_perplexity:.2f}")

        self.is_trained = True
        self.training_history = history
        print(f"🎉 Training complete! Final perplexity: {history['perplexities'][-1]:.2f}")

        return history
        ### END SOLUTION

    def optimize_model(self, quantize: bool = True, prune_sparsity: float = 0.0):
        """
        Estimate the effect of optimization techniques (Modules 17-18).

        TODO: Apply quantization and pruning optimizations

        The optimizations are SIMULATED: the model's weights are not modified,
        only the expected memory footprint is computed and reported.

        APPROACH:
        1. Optionally account for INT8 quantization (4× smaller than FP32)
        2. Optionally account for pruning a fraction of the weights
        3. Report original size, optimized size and reduction factor

        EXAMPLE:
        >>> summary = pipeline.optimize_model(quantize=True, prune_sparsity=0.5)
        >>> summary["reduction_factor"]
        8.0

        Returns:
            Dict with 'original_mb', 'optimized_mb', 'reduction_factor' and
            'applied' (list of human-readable descriptions).

        Raises:
            ValueError: if prune_sparsity is outside the range [0, 1].
        """
        ### BEGIN SOLUTION
        if not 0.0 <= prune_sparsity <= 1.0:
            raise ValueError(f"prune_sparsity must be within [0, 1], got {prune_sparsity}")

        original_params = self.model.count_parameters()
        original_memory = original_params * 4 / (1024 * 1024)

        optimizations_applied = []
        size_reduction = 1.0

        if quantize:
            # Apply quantization (simulated)
            # In real implementation, would use quantize_model()
            size_reduction *= 0.25  # INT8 vs FP32: 4× smaller
            optimizations_applied.append("INT8 quantization (4× memory reduction)")
            print("   Applied INT8 quantization")

        if prune_sparsity > 0:
            # Apply pruning (simulated)
            # In real implementation, would use magnitude_prune()
            remaining_weights = 1 - prune_sparsity
            size_reduction *= remaining_weights
            optimizations_applied.append(f"{prune_sparsity:.0%} pruning ({remaining_weights:.0%} weights remain)")
            print(f"   Applied {prune_sparsity:.0%} magnitude pruning")

        final_memory = original_memory * size_reduction

        # Pruning everything (or an empty model) leaves nothing to divide by.
        if final_memory > 0:
            reduction_factor = original_memory / final_memory
        elif original_memory > 0:
            reduction_factor = float("inf")
        else:
            reduction_factor = 1.0

        print("🔧 Model optimization complete:")
        print(f"   Original: {original_memory:.1f}MB")
        print(f"   Optimized: {final_memory:.1f}MB")
        print(f"   Reduction: {reduction_factor:.1f}× smaller")
        print(f"   Applied: {', '.join(optimizations_applied) or 'none'}")

        return {
            'original_mb': original_memory,
            'optimized_mb': final_memory,
            'reduction_factor': reduction_factor,
            'applied': optimizations_applied,
        }
        ### END SOLUTION

    def generate_text(self, prompt: str, max_tokens: int = 50, temperature: float = 0.8) -> str:
        """
        Generate text using the trained model.

        TODO: Implement text generation with proper encoding/decoding

        APPROACH:
        1. Encode prompt to token IDs
        2. Use model.generate() for autoregressive generation
        3. Decode generated tokens back to text
        4. Return generated text

        EXAMPLE:
        >>> text = pipeline.generate_text("Hello", max_tokens=10)
        >>> print(f"Generated: {text}")
        Generated: Hello world this is AI

        Args:
            prompt: Text the generation starts from; must not be empty.
            max_tokens: Number of new tokens requested from the model.
            temperature: Sampling temperature forwarded to model.generate().

        Raises:
            ValueError: if the prompt encodes to zero tokens.
        """
        ### BEGIN SOLUTION
        if not self.is_trained:
            print("⚠️ Model not trained yet. Generating with random weights.")

        # Encode prompt
        prompt_tokens = list(self.tokenizer.encode(prompt))
        if not prompt_tokens:
            raise ValueError("prompt must encode to at least one token")
        prompt_tensor = Tensor([prompt_tokens])

        # Generate tokens
        generated_tokens = self.model.generate(
            prompt_tensor,
            max_new_tokens=max_tokens,
            temperature=temperature,
            use_cache=True
        )

        # Decode to text. IDs are cast to int because the tensor may store them
        # as floating-point values, which cannot be used as vocabulary indices.
        first_sequence = np.asarray(generated_tokens.data)[0].tolist()
        all_tokens = [int(token) for token in first_sequence]
        generated_text = self.tokenizer.decode(all_tokens)

        return generated_text
        ### END SOLUTION

def test_unit_complete_pipeline():
    """🔬 Test complete pipeline integration."""
    print("🔬 Unit Test: Complete Pipeline Integration...")

    # Create pipeline.
    # The pipeline's tokenizer covers 95 printable ASCII characters, so the
    # model vocabulary must be at least that large; the previous value of 50
    # was too small for IDs of characters such as 'h' or 'w' in the corpus.
    pipeline = CompleteTinyGPTPipeline(vocab_size=100, embed_dim=32, num_layers=2)

    # Test data preparation
    corpus = ["hello world", "ai is fun", "machine learning"]
    dataloader = pipeline.prepare_training_data(corpus, batch_size=2)
    assert len(dataloader) > 0, "DataLoader should have batches"

    # Test training (minimal)
    history = pipeline.train(dataloader, epochs=1)
    assert 'losses' in history, "History should contain losses"
    assert len(history['losses']) == 1, "Should have one epoch of losses"

    # Test optimization
    pipeline.optimize_model(quantize=True, prune_sparsity=0.5)

    # Test generation
    generated = pipeline.generate_text("hello", max_tokens=5)
    assert isinstance(generated, str), "Generated output should be string"
    assert len(generated) > 0, "Generated text should not be empty"

    print(f"✅ Pipeline stages completed successfully")
    print(f"✅ Training history: {len(history['losses'])} epochs")
    print(f"✅ Generated text: '{generated[:20]}...'")
    print("✅ Complete pipeline integration works!")

# Run immediate test when developing this module.
# The guard keeps the test from firing when the file is imported as a library;
# it only runs when the file is executed directly as a script.
if __name__ == "__main__":
    test_unit_complete_pipeline()