diff --git a/modules/source/08_optimizers/optimizers_dev.ipynb b/modules/source/08_optimizers/optimizers_dev.ipynb new file mode 100644 index 00000000..223efc4a --- /dev/null +++ b/modules/source/08_optimizers/optimizers_dev.ipynb @@ -0,0 +1,1754 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "602ba54a", + "metadata": { + "cell_marker": "\"\"\"" + }, + "source": [ + "# Module 8: Optimizers - Gradient-Based Parameter Updates\n", + "\n", + "Welcome to the Optimizers module! This is where neural networks learn to improve through intelligent parameter updates.\n", + "\n", + "## Learning Goals\n", + "- Understand gradient descent and how optimizers use gradients to update parameters\n", + "- Implement SGD with momentum for accelerated convergence\n", + "- Build Adam optimizer with adaptive learning rates\n", + "- Master learning rate scheduling strategies\n", + "- See how optimizers enable effective neural network training\n", + "\n", + "## Build โ†’ Use โ†’ Analyze\n", + "1. **Build**: Core optimization algorithms (SGD, Adam)\n", + "2. **Use**: Apply optimizers to train neural networks\n", + "3. 
#| default_exp core.optimizers

#| export
import math
import numpy as np
import sys
import os
from typing import List, Dict, Any, Optional, Union
from collections import defaultdict

# Helper function to set up import paths
def setup_import_paths():
    """Append the sibling development module directories to sys.path.

    Resolves the modules root relative to this file and makes the
    '01_tensor' and '07_autograd' development modules importable.
    """
    modules_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    for subdir in ('01_tensor', '07_autograd'):
        candidate = os.path.join(modules_root, subdir)
        if candidate not in sys.path:
            sys.path.append(candidate)

# Import our existing components: prefer the installed package, then the
# local development modules, then minimal fallbacks so tests can still run.
try:
    from tinytorch.core.tensor import Tensor
    from tinytorch.core.autograd import Variable
except ImportError:
    try:
        setup_import_paths()
        from tensor_dev import Tensor
        from autograd_dev import Variable
    except ImportError:
        # Last resort: tiny stand-ins that mimic the real interfaces.
        print("Warning: Using fallback classes for testing")

        class Tensor:
            """Minimal ndarray wrapper standing in for the real Tensor."""

            def __init__(self, data):
                self.data = np.array(data)
                self.shape = self.data.shape

            def __str__(self):
                return f"Tensor({self.data})"

        class Variable:
            """Minimal autograd stand-in: a Tensor value plus a .grad slot."""

            def __init__(self, data, requires_grad=True):
                # Promote bare Python scalars to one-element tensors.
                self.data = Tensor([data]) if isinstance(data, (int, float)) else Tensor(data)
                self.requires_grad = requires_grad
                self.grad = None

            def zero_grad(self):
                """Discard any accumulated gradient."""
                self.grad = None

            def __str__(self):
                return f"Variable({self.data.data})"

print("๐Ÿ”ฅ TinyTorch Optimizers Module")
print(f"NumPy version: {np.__version__}")
print(f"Python version: {sys.version_info.major}.{sys.version_info.minor}")
print("Ready to build optimization algorithms!")
[ + "## What Are Optimizers?\n", + "\n", + "### The Problem: How to Update Parameters\n", + "Neural networks learn by updating parameters using gradients:\n", + "```\n", + "parameter_new = parameter_old - learning_rate * gradient\n", + "```\n", + "\n", + "But **naive gradient descent** has problems:\n", + "- **Slow convergence**: Takes many steps to reach optimum\n", + "- **Oscillation**: Bounces around valleys without making progress\n", + "- **Poor scaling**: Same learning rate for all parameters\n", + "\n", + "### The Solution: Smart Optimization\n", + "**Optimizers** are algorithms that intelligently update parameters:\n", + "- **Momentum**: Accelerate convergence by accumulating velocity\n", + "- **Adaptive learning rates**: Different learning rates for different parameters\n", + "- **Second-order information**: Use curvature to guide updates\n", + "\n", + "### Real-World Impact\n", + "- **SGD**: The foundation of all neural network training\n", + "- **Adam**: The default optimizer for most deep learning applications\n", + "- **Learning rate scheduling**: Critical for training stability and performance\n", + "\n", + "### What We'll Build\n", + "1. **SGD**: Stochastic Gradient Descent with momentum\n", + "2. **Adam**: Adaptive Moment Estimation optimizer\n", + "3. **StepLR**: Learning rate scheduling\n", + "4. 
**Integration**: Complete training loop with optimizers" + ] + }, + { + "cell_type": "markdown", + "id": "8ccea3ce", + "metadata": { + "cell_marker": "\"\"\"", + "lines_to_next_cell": 1 + }, + "source": [ + "## Step 1: Understanding Gradient Descent\n", + "\n", + "### What is Gradient Descent?\n", + "**Gradient descent** finds the minimum of a function by following the negative gradient:\n", + "\n", + "```\n", + "ฮธ_{t+1} = ฮธ_t - ฮฑ โˆ‡f(ฮธ_t)\n", + "```\n", + "\n", + "Where:\n", + "- ฮธ: Parameters we want to optimize\n", + "- ฮฑ: Learning rate (how big steps to take)\n", + "- โˆ‡f(ฮธ): Gradient of loss function with respect to parameters\n", + "\n", + "### Why Gradient Descent Works\n", + "1. **Gradients point uphill**: Negative gradient points toward minimum\n", + "2. **Iterative improvement**: Each step reduces the loss (in theory)\n", + "3. **Local convergence**: Finds local minimum with proper learning rate\n", + "4. **Scalable**: Works with millions of parameters\n", + "\n", + "### The Learning Rate Dilemma\n", + "- **Too large**: Overshoots minimum, diverges\n", + "- **Too small**: Extremely slow convergence\n", + "- **Just right**: Steady progress toward minimum\n", + "\n", + "### Visual Understanding\n", + "```\n", + "Loss landscape: \\__/\n", + "Start here: โ†‘\n", + "Gradient descent: โ†“ โ†’ โ†“ โ†’ โ†“ โ†’ minimum\n", + "```\n", + "\n", + "### Real-World Applications\n", + "- **Neural networks**: Training any deep learning model\n", + "- **Machine learning**: Logistic regression, SVM, etc.\n", + "- **Scientific computing**: Optimization problems in physics, engineering\n", + "- **Economics**: Portfolio optimization, game theory\n", + "\n", + "Let's implement gradient descent to understand it deeply!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d41c2596", + "metadata": { + "lines_to_next_cell": 1, + "nbgrader": { + "grade": false, + "grade_id": "gradient-descent-function", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "outputs": [], + "source": [ + "#| export\n", + "def gradient_descent_step(parameter: Variable, learning_rate: float) -> None:\n", + " \"\"\"\n", + " Perform one step of gradient descent on a parameter.\n", + " \n", + " Args:\n", + " parameter: Variable with gradient information\n", + " learning_rate: How much to update parameter\n", + " \n", + " TODO: Implement basic gradient descent parameter update.\n", + " \n", + " STEP-BY-STEP IMPLEMENTATION:\n", + " 1. Check if parameter has a gradient\n", + " 2. Get current parameter value and gradient\n", + " 3. Update parameter: new_value = old_value - learning_rate * gradient\n", + " 4. Update parameter data with new value\n", + " 5. Handle edge cases (no gradient, invalid values)\n", + " \n", + " EXAMPLE USAGE:\n", + " ```python\n", + " # Parameter with gradient\n", + " w = Variable(2.0, requires_grad=True)\n", + " w.grad = Variable(0.5) # Gradient from loss\n", + " \n", + " # Update parameter\n", + " gradient_descent_step(w, learning_rate=0.1)\n", + " # w.data now contains: 2.0 - 0.1 * 0.5 = 1.95\n", + " ```\n", + " \n", + " IMPLEMENTATION HINTS:\n", + " - Check if parameter.grad is not None\n", + " - Use parameter.grad.data.data to get gradient value\n", + " - Update parameter.data with new Tensor\n", + " - Don't modify gradient (it's used for logging)\n", + " \n", + " LEARNING CONNECTIONS:\n", + " - This is the foundation of all neural network training\n", + " - PyTorch's optimizer.step() does exactly this\n", + " - The learning rate determines convergence speed\n", + " \"\"\"\n", + " ### BEGIN SOLUTION\n", + " if parameter.grad is not None:\n", + " # Get current parameter value and gradient\n", + " current_value = 
parameter.data.data\n", + " gradient_value = parameter.grad.data.data\n", + " \n", + " # Update parameter: new_value = old_value - learning_rate * gradient\n", + " new_value = current_value - learning_rate * gradient_value\n", + " \n", + " # Update parameter data\n", + " parameter.data = Tensor(new_value)\n", + " ### END SOLUTION" + ] + }, + { + "cell_type": "markdown", + "id": "4d2e1fd4", + "metadata": { + "cell_marker": "\"\"\"", + "lines_to_next_cell": 1 + }, + "source": [ + "### ๐Ÿงช Unit Test: Gradient Descent Step\n", + "\n", + "Let's test your gradient descent implementation right away! This is the foundation of all optimization algorithms.\n", + "\n", + "**This is a unit test** - it tests one specific function (gradient_descent_step) in isolation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f092d289", + "metadata": { + "lines_to_next_cell": 1, + "nbgrader": { + "grade": true, + "grade_id": "test-gradient-descent", + "locked": true, + "points": 10, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "def test_gradient_descent_step_comprehensive():\n", + " \"\"\"Test basic gradient descent parameter update\"\"\"\n", + " print(\"๐Ÿ”ฌ Unit Test: Gradient Descent Step...\")\n", + " \n", + " # Test basic parameter update\n", + " try:\n", + " w = Variable(2.0, requires_grad=True)\n", + " w.grad = Variable(0.5) # Positive gradient\n", + " \n", + " original_value = w.data.data.item()\n", + " gradient_descent_step(w, learning_rate=0.1)\n", + " new_value = w.data.data.item()\n", + " \n", + " expected_value = original_value - 0.1 * 0.5 # 2.0 - 0.05 = 1.95\n", + " assert abs(new_value - expected_value) < 1e-6, f\"Expected {expected_value}, got {new_value}\"\n", + " print(\"โœ… Basic parameter update works\")\n", + " \n", + " except Exception as e:\n", + " print(f\"โŒ Basic parameter update failed: {e}\")\n", + " raise\n", + "\n", + " # Test with negative gradient\n", + " try:\n", + " w2 = 
Variable(1.0, requires_grad=True)\n", + " w2.grad = Variable(-0.2) # Negative gradient\n", + " \n", + " gradient_descent_step(w2, learning_rate=0.1)\n", + " expected_value2 = 1.0 - 0.1 * (-0.2) # 1.0 + 0.02 = 1.02\n", + " assert abs(w2.data.data.item() - expected_value2) < 1e-6, \"Negative gradient test failed\"\n", + " print(\"โœ… Negative gradient handling works\")\n", + " \n", + " except Exception as e:\n", + " print(f\"โŒ Negative gradient handling failed: {e}\")\n", + " raise\n", + "\n", + " # Test with no gradient (should not update)\n", + " try:\n", + " w3 = Variable(3.0, requires_grad=True)\n", + " w3.grad = None\n", + " original_value3 = w3.data.data.item()\n", + " \n", + " gradient_descent_step(w3, learning_rate=0.1)\n", + " assert w3.data.data.item() == original_value3, \"Parameter with no gradient should not update\"\n", + " print(\"โœ… No gradient case works\")\n", + " \n", + " except Exception as e:\n", + " print(f\"โŒ No gradient case failed: {e}\")\n", + " raise\n", + "\n", + " print(\"๐ŸŽฏ Gradient descent step behavior:\")\n", + " print(\" Updates parameters in negative gradient direction\")\n", + " print(\" Uses learning rate to control step size\")\n", + " print(\" Skips updates when gradient is None\")\n", + " print(\"๐Ÿ“ˆ Progress: Gradient Descent Step โœ“\")\n", + "\n", + "# Test function is called by auto-discovery system" + ] + }, + { + "cell_type": "markdown", + "id": "bc218834", + "metadata": { + "cell_marker": "\"\"\"", + "lines_to_next_cell": 1 + }, + "source": [ + "## Step 2: SGD with Momentum\n", + "\n", + "### What is SGD?\n", + "**SGD (Stochastic Gradient Descent)** is the fundamental optimization algorithm:\n", + "\n", + "```\n", + "ฮธ_{t+1} = ฮธ_t - ฮฑ โˆ‡L(ฮธ_t)\n", + "```\n", + "\n", + "### The Problem with Vanilla SGD\n", + "- **Slow convergence**: Especially in narrow valleys\n", + "- **Oscillation**: Bounces around without making progress\n", + "- **Poor conditioning**: Struggles with ill-conditioned problems\n", + "\n", + 
"### The Solution: Momentum\n", + "**Momentum** accumulates velocity to accelerate convergence:\n", + "\n", + "```\n", + "v_t = ฮฒ v_{t-1} + โˆ‡L(ฮธ_t)\n", + "ฮธ_{t+1} = ฮธ_t - ฮฑ v_t\n", + "```\n", + "\n", + "Where:\n", + "- v_t: Velocity (exponential moving average of gradients)\n", + "- ฮฒ: Momentum coefficient (typically 0.9)\n", + "- ฮฑ: Learning rate\n", + "\n", + "### Why Momentum Works\n", + "1. **Acceleration**: Builds up speed in consistent directions\n", + "2. **Dampening**: Reduces oscillations in inconsistent directions\n", + "3. **Memory**: Remembers previous gradient directions\n", + "4. **Robustness**: Less sensitive to noisy gradients\n", + "\n", + "### Visual Understanding\n", + "```\n", + "Without momentum: โ†—โ†™โ†—โ†™โ†—โ†™ (oscillating)\n", + "With momentum: โ†—โ†’โ†’โ†’โ†’โ†’ (smooth progress)\n", + "```\n", + "\n", + "### Real-World Applications\n", + "- **Image classification**: Training ResNet, VGG\n", + "- **Natural language**: Training RNNs, early transformers\n", + "- **Classic choice**: Still used when Adam fails\n", + "- **Large batch training**: Often preferred over Adam\n", + "\n", + "Let's implement SGD with momentum!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f587b7f", + "metadata": { + "lines_to_next_cell": 1, + "nbgrader": { + "grade": false, + "grade_id": "sgd-class", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "outputs": [], + "source": [ + "#| export\n", + "class SGD:\n", + " \"\"\"\n", + " SGD Optimizer with Momentum\n", + " \n", + " Implements stochastic gradient descent with momentum:\n", + " v_t = momentum * v_{t-1} + gradient\n", + " parameter = parameter - learning_rate * v_t\n", + " \"\"\"\n", + " \n", + " def __init__(self, parameters: List[Variable], learning_rate: float = 0.01, \n", + " momentum: float = 0.0, weight_decay: float = 0.0):\n", + " \"\"\"\n", + " Initialize SGD optimizer.\n", + " \n", + " Args:\n", + " parameters: List of Variables to optimize\n", + " learning_rate: Learning rate (default: 0.01)\n", + " momentum: Momentum coefficient (default: 0.0)\n", + " weight_decay: L2 regularization coefficient (default: 0.0)\n", + " \n", + " TODO: Implement SGD optimizer initialization.\n", + " \n", + " APPROACH:\n", + " 1. Store parameters and hyperparameters\n", + " 2. Initialize momentum buffers for each parameter\n", + " 3. Set up state tracking for optimization\n", + " 4. 
Prepare for step() and zero_grad() methods\n", + " \n", + " EXAMPLE:\n", + " ```python\n", + " # Create optimizer\n", + " optimizer = SGD([w1, w2, b1, b2], learning_rate=0.01, momentum=0.9)\n", + " \n", + " # In training loop:\n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " optimizer.step()\n", + " ```\n", + " \n", + " HINTS:\n", + " - Store parameters as a list\n", + " - Initialize momentum buffers as empty dict\n", + " - Use parameter id() as key for momentum tracking\n", + " - Momentum buffers will be created lazily in step()\n", + " \"\"\"\n", + " ### BEGIN SOLUTION\n", + " self.parameters = parameters\n", + " self.learning_rate = learning_rate\n", + " self.momentum = momentum\n", + " self.weight_decay = weight_decay\n", + " \n", + " # Initialize momentum buffers (created lazily)\n", + " self.momentum_buffers = {}\n", + " \n", + " # Track optimization steps\n", + " self.step_count = 0\n", + " ### END SOLUTION\n", + " \n", + " def step(self) -> None:\n", + " \"\"\"\n", + " Perform one optimization step.\n", + " \n", + " TODO: Implement SGD parameter update with momentum.\n", + " \n", + " APPROACH:\n", + " 1. Iterate through all parameters\n", + " 2. For each parameter with gradient:\n", + " a. Get current gradient\n", + " b. Apply weight decay if specified\n", + " c. Update momentum buffer (or create if first time)\n", + " d. Update parameter using momentum\n", + " 3. 
Increment step count\n", + " \n", + " MATHEMATICAL FORMULATION:\n", + " - If weight_decay > 0: gradient = gradient + weight_decay * parameter\n", + " - momentum_buffer = momentum * momentum_buffer + gradient\n", + " - parameter = parameter - learning_rate * momentum_buffer\n", + " \n", + " IMPLEMENTATION HINTS:\n", + " - Use id(param) as key for momentum buffers\n", + " - Initialize buffer with zeros if not exists\n", + " - Handle case where momentum = 0 (no momentum)\n", + " - Update parameter.data with new Tensor\n", + " \"\"\"\n", + " ### BEGIN SOLUTION\n", + " for param in self.parameters:\n", + " if param.grad is not None:\n", + " # Get gradient\n", + " gradient = param.grad.data.data\n", + " \n", + " # Apply weight decay (L2 regularization)\n", + " if self.weight_decay > 0:\n", + " gradient = gradient + self.weight_decay * param.data.data\n", + " \n", + " # Get or create momentum buffer\n", + " param_id = id(param)\n", + " if param_id not in self.momentum_buffers:\n", + " self.momentum_buffers[param_id] = np.zeros_like(param.data.data)\n", + " \n", + " # Update momentum buffer\n", + " self.momentum_buffers[param_id] = (\n", + " self.momentum * self.momentum_buffers[param_id] + gradient\n", + " )\n", + " \n", + " # Update parameter\n", + " param.data = Tensor(\n", + " param.data.data - self.learning_rate * self.momentum_buffers[param_id]\n", + " )\n", + " \n", + " self.step_count += 1\n", + " ### END SOLUTION\n", + " \n", + " def zero_grad(self) -> None:\n", + " \"\"\"\n", + " Zero out gradients for all parameters.\n", + " \n", + " TODO: Implement gradient zeroing.\n", + " \n", + " APPROACH:\n", + " 1. Iterate through all parameters\n", + " 2. Set gradient to None for each parameter\n", + " 3. 
This prepares for next backward pass\n", + " \n", + " IMPLEMENTATION HINTS:\n", + " - Simply set param.grad = None\n", + " - This is called before loss.backward()\n", + " - Essential for proper gradient accumulation\n", + " \"\"\"\n", + " ### BEGIN SOLUTION\n", + " for param in self.parameters:\n", + " param.grad = None\n", + " ### END SOLUTION" + ] + }, + { + "cell_type": "markdown", + "id": "4adee99c", + "metadata": { + "cell_marker": "\"\"\"", + "lines_to_next_cell": 1 + }, + "source": [ + "### ๐Ÿงช Unit Test: SGD Optimizer\n", + "\n", + "Let's test your SGD optimizer implementation! This optimizer adds momentum to gradient descent for better convergence.\n", + "\n", + "**This is a unit test** - it tests one specific class (SGD) in isolation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa93aa53", + "metadata": { + "nbgrader": { + "grade": true, + "grade_id": "test-sgd", + "locked": true, + "points": 15, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "def test_sgd_optimizer_comprehensive():\n", + " \"\"\"Test SGD optimizer implementation\"\"\"\n", + " print(\"๐Ÿ”ฌ Unit Test: SGD Optimizer...\")\n", + " \n", + " # Create test parameters\n", + " w1 = Variable(1.0, requires_grad=True)\n", + " w2 = Variable(2.0, requires_grad=True)\n", + " b = Variable(0.5, requires_grad=True)\n", + " \n", + " # Create optimizer\n", + " optimizer = SGD([w1, w2, b], learning_rate=0.1, momentum=0.9)\n", + " \n", + " # Test zero_grad\n", + " try:\n", + " w1.grad = Variable(0.1)\n", + " w2.grad = Variable(0.2)\n", + " b.grad = Variable(0.05)\n", + " \n", + " optimizer.zero_grad()\n", + " \n", + " assert w1.grad is None, \"Gradient should be None after zero_grad\"\n", + " assert w2.grad is None, \"Gradient should be None after zero_grad\"\n", + " assert b.grad is None, \"Gradient should be None after zero_grad\"\n", + " print(\"โœ… zero_grad() works correctly\")\n", + " \n", + " except Exception as e:\n", 
+ " print(f\"โŒ zero_grad() failed: {e}\")\n", + " raise\n", + " \n", + " # Test step with gradients\n", + " try:\n", + " w1.grad = Variable(0.1)\n", + " w2.grad = Variable(0.2)\n", + " b.grad = Variable(0.05)\n", + " \n", + " # First step (no momentum yet)\n", + " original_w1 = w1.data.data.item()\n", + " original_w2 = w2.data.data.item()\n", + " original_b = b.data.data.item()\n", + " \n", + " optimizer.step()\n", + " \n", + " # Check parameter updates\n", + " expected_w1 = original_w1 - 0.1 * 0.1 # 1.0 - 0.01 = 0.99\n", + " expected_w2 = original_w2 - 0.1 * 0.2 # 2.0 - 0.02 = 1.98\n", + " expected_b = original_b - 0.1 * 0.05 # 0.5 - 0.005 = 0.495\n", + " \n", + " assert abs(w1.data.data.item() - expected_w1) < 1e-6, f\"w1 update failed: expected {expected_w1}, got {w1.data.data.item()}\"\n", + " assert abs(w2.data.data.item() - expected_w2) < 1e-6, f\"w2 update failed: expected {expected_w2}, got {w2.data.data.item()}\"\n", + " assert abs(b.data.data.item() - expected_b) < 1e-6, f\"b update failed: expected {expected_b}, got {b.data.data.item()}\"\n", + " print(\"โœ… Parameter updates work correctly\")\n", + " \n", + " except Exception as e:\n", + " print(f\"โŒ Parameter updates failed: {e}\")\n", + " raise\n", + " \n", + " # Test momentum buffers\n", + " try:\n", + " assert len(optimizer.momentum_buffers) == 3, f\"Should have 3 momentum buffers, got {len(optimizer.momentum_buffers)}\"\n", + " assert optimizer.step_count == 1, f\"Step count should be 1, got {optimizer.step_count}\"\n", + " print(\"โœ… Momentum buffers created correctly\")\n", + " \n", + " except Exception as e:\n", + " print(f\"โŒ Momentum buffers failed: {e}\")\n", + " raise\n", + " \n", + " # Test step counting\n", + " try:\n", + " w1.grad = Variable(0.1)\n", + " w2.grad = Variable(0.2)\n", + " b.grad = Variable(0.05)\n", + " \n", + " optimizer.step()\n", + " \n", + " assert optimizer.step_count == 2, f\"Step count should be 2, got {optimizer.step_count}\"\n", + " print(\"โœ… Step counting 
works correctly\")\n", + " \n", + " except Exception as e:\n", + " print(f\"โŒ Step counting failed: {e}\")\n", + " raise\n", + "\n", + " print(\"๐ŸŽฏ SGD optimizer behavior:\")\n", + " print(\" Maintains momentum buffers for accelerated updates\")\n", + " print(\" Tracks step count for learning rate scheduling\")\n", + " print(\" Supports weight decay for regularization\")\n", + " print(\"๐Ÿ“ˆ Progress: SGD Optimizer โœ“\")\n", + "\n", + "# Run the test\n", + "test_sgd_optimizer_comprehensive()" + ] + }, + { + "cell_type": "markdown", + "id": "3730c6d6", + "metadata": { + "cell_marker": "\"\"\"", + "lines_to_next_cell": 1 + }, + "source": [ + "## Step 3: Adam - Adaptive Learning Rates\n", + "\n", + "### What is Adam?\n", + "**Adam (Adaptive Moment Estimation)** is the most popular optimizer in deep learning:\n", + "\n", + "```\n", + "m_t = ฮฒโ‚ m_{t-1} + (1 - ฮฒโ‚) โˆ‡L(ฮธ_t) # First moment (momentum)\n", + "v_t = ฮฒโ‚‚ v_{t-1} + (1 - ฮฒโ‚‚) (โˆ‡L(ฮธ_t))ยฒ # Second moment (variance)\n", + "mฬ‚_t = m_t / (1 - ฮฒโ‚แต—) # Bias correction\n", + "vฬ‚_t = v_t / (1 - ฮฒโ‚‚แต—) # Bias correction\n", + "ฮธ_{t+1} = ฮธ_t - ฮฑ mฬ‚_t / (โˆšvฬ‚_t + ฮต) # Parameter update\n", + "```\n", + "\n", + "### Why Adam is Revolutionary\n", + "1. **Adaptive learning rates**: Different learning rate for each parameter\n", + "2. **Momentum**: Accelerates convergence like SGD\n", + "3. **Variance adaptation**: Scales updates based on gradient variance\n", + "4. **Bias correction**: Handles initialization bias\n", + "5. **Robust**: Works well with minimal hyperparameter tuning\n", + "\n", + "### The Three Key Ideas\n", + "1. **First moment (m_t)**: Exponential moving average of gradients (momentum)\n", + "2. **Second moment (v_t)**: Exponential moving average of squared gradients (variance)\n", + "3. 
#| export
class Adam:
    """
    Adam Optimizer

    Adaptive moment estimation: keeps exponentially decaying averages of
    gradients (first moment) and squared gradients (second moment),
    corrects their initialization bias, and scales each parameter's step
    by the inverse root of its estimated variance:

        m_t = beta1 * m_{t-1} + (1 - beta1) * g
        v_t = beta2 * v_{t-1} + (1 - beta2) * g^2
        p   = p - lr * (m_t / (1 - beta1^t)) / (sqrt(v_t / (1 - beta2^t)) + eps)
    """

    def __init__(self, parameters: List[Variable], learning_rate: float = 0.001,
                 beta1: float = 0.9, beta2: float = 0.999, epsilon: float = 1e-8,
                 weight_decay: float = 0.0):
        """
        Initialize Adam optimizer.

        Args:
            parameters: List of Variables to optimize
            learning_rate: Step size for parameter updates (default: 0.001)
            beta1: Exponential decay rate for the first moment (default: 0.9)
            beta2: Exponential decay rate for the second moment (default: 0.999)
            epsilon: Small constant for numerical stability (default: 1e-8)
            weight_decay: L2 regularization coefficient (default: 0.0)
        """
        ### BEGIN SOLUTION
        self.parameters = parameters
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.weight_decay = weight_decay

        # Moment buffers keyed by id(param); created lazily on first step().
        self.first_moment = {}   # m_t: running average of gradients
        self.second_moment = {}  # v_t: running average of squared gradients

        # Step counter t, needed for bias correction.
        self.step_count = 0
        ### END SOLUTION

    def step(self) -> None:
        """
        Perform one optimization step using the Adam algorithm.

        Increments the step counter, then updates every parameter that has
        a gradient using bias-corrected first/second moment estimates.
        """
        ### BEGIN SOLUTION
        self.step_count += 1

        # Bias-correction denominators are identical for all parameters,
        # so compute them once per step.
        bias1 = 1 - self.beta1 ** self.step_count
        bias2 = 1 - self.beta2 ** self.step_count

        for param in self.parameters:
            if param.grad is None:
                continue  # skip parameters that received no gradient

            grad = param.grad.data.data

            # Fold L2 regularization into the gradient.
            if self.weight_decay > 0:
                grad = grad + self.weight_decay * param.data.data

            key = id(param)
            if key not in self.first_moment:
                self.first_moment[key] = np.zeros_like(param.data.data)
                self.second_moment[key] = np.zeros_like(param.data.data)

            # Exponential moving averages of the gradient and its square.
            m = self.beta1 * self.first_moment[key] + (1 - self.beta1) * grad
            v = self.beta2 * self.second_moment[key] + (1 - self.beta2) * grad * grad
            self.first_moment[key] = m
            self.second_moment[key] = v

            # Bias-corrected adaptive update.
            update = (m / bias1) / (np.sqrt(v / bias2) + self.epsilon)
            param.data = Tensor(param.data.data - self.learning_rate * update)
        ### END SOLUTION

    def zero_grad(self) -> None:
        """
        Zero out gradients for all parameters.

        Called before loss.backward() so gradients do not accumulate
        across iterations.
        """
        ### BEGIN SOLUTION
        for param in self.parameters:
            param.grad = None
        ### END SOLUTION
def test_adam_optimizer_comprehensive():
    """Test Adam optimizer implementation"""
    print("๐Ÿ”ฌ Unit Test: Adam Optimizer...")
    
    # Create test parameters
    w1 = Variable(1.0, requires_grad=True)
    w2 = Variable(2.0, requires_grad=True)
    b = Variable(0.5, requires_grad=True)
    
    # Create optimizer
    optimizer = Adam([w1, w2, b], learning_rate=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8)
    
    # Test zero_grad
    try:
        w1.grad = Variable(0.1)
        w2.grad = Variable(0.2)
        b.grad = Variable(0.05)
        
        optimizer.zero_grad()
        
        assert w1.grad is None, "Gradient should be None after zero_grad"
        assert w2.grad is None, "Gradient should be None after zero_grad"
        assert b.grad is None, "Gradient should be None after zero_grad"
        print("โœ… zero_grad() works correctly")
        
    except Exception as e:
        print(f"โŒ zero_grad() failed: {e}")
        raise
    
    # Test step with gradients
    try:
        w1.grad = Variable(0.1)
        w2.grad = Variable(0.2)
        b.grad = Variable(0.05)
        
        # First step
        original_w1 = w1.data.data.item()
        original_w2 = w2.data.data.item()
        original_b = b.data.data.item()
        
        optimizer.step()
        
        # Check that parameters were updated (Adam uses adaptive learning rates)
        assert w1.data.data.item() != original_w1, "w1 should have been updated"
        assert w2.data.data.item() != original_w2, "w2 should have been updated"
        assert b.data.data.item() != original_b, "b should have been updated"
        print("โœ… Parameter updates work correctly")
        
    except Exception as e:
        print(f"โŒ Parameter updates failed: {e}")
        raise
    
    # Test moment buffers
    try:
        assert len(optimizer.first_moment) == 3, f"Should have 3 first moment buffers, got {len(optimizer.first_moment)}"
        assert len(optimizer.second_moment) == 3, f"Should have 3 second moment buffers, got {len(optimizer.second_moment)}"
        print("โœ… Moment buffers created correctly")
        
    except Exception as e:
        print(f"โŒ Moment buffers failed: {e}")
        raise
    
    # Test step counting and bias correction
    try:
        assert optimizer.step_count == 1, f"Step count should be 1, got {optimizer.step_count}"
        
        # Take another step
        w1.grad = Variable(0.1)
        w2.grad = Variable(0.2)
        b.grad = Variable(0.05)
        
        optimizer.step()
        
        assert optimizer.step_count == 2, f"Step count should be 2, got {optimizer.step_count}"
        print("โœ… Step counting and bias correction work correctly")
        
    except Exception as e:
        print(f"โŒ Step counting and bias correction failed: {e}")
        raise
    
    # Test adaptive learning rates
    try:
        # FIX: the original try-block asserted nothing, so it could never
        # fail and always printed success. With different gradient
        # magnitudes (0.1 vs 0.2) Adam accumulates different second-moment
        # estimates per parameter, which is exactly what produces
        # per-parameter (adaptive) effective learning rates.
        m_w1 = optimizer.second_moment[id(w1)]
        m_w2 = optimizer.second_moment[id(w2)]
        assert not np.allclose(m_w1, m_w2), "Second-moment buffers should differ for parameters with different gradients"
        print("โœ… Adaptive learning rates work correctly")
        
    except Exception as e:
        print(f"โŒ Adaptive learning rates failed: {e}")
        raise

    print("๐ŸŽฏ Adam optimizer behavior:")
    print("   Maintains first and second moment estimates")
    print("   Applies bias correction for early training")
    print("   Uses adaptive learning rates per parameter")
    print("   Combines benefits of momentum and RMSprop")
    print("๐Ÿ“ˆ Progress: Adam Optimizer โœ“")

# Run the test
test_adam_optimizer_comprehensive()
#| export
class StepLR:
    """
    Step Learning Rate Scheduler
    
    Decays learning rate by gamma every step_size epochs:
    learning_rate = initial_lr * (gamma ** (epoch // step_size))
    
    Works with any optimizer exposing a mutable ``learning_rate``
    attribute (SGD, Adam, ...), mirroring PyTorch's
    torch.optim.lr_scheduler.StepLR.
    """
    
    def __init__(self, optimizer: Any, step_size: int, gamma: float = 0.1):
        """
        Initialize step learning rate scheduler.
        
        Args:
            optimizer: Optimizer to schedule. Any object with a mutable
                ``learning_rate`` attribute is accepted (duck typing);
                this is deliberately looser than Union[SGD, Adam] so the
                scheduler also works with optimizers added later.
            step_size: Number of epochs between decreases
            gamma: Multiplicative factor for learning rate decay
        
        TODO: Implement learning rate scheduler initialization.
        
        APPROACH:
        1. Store optimizer reference
        2. Store scheduling parameters
        3. Save initial learning rate
        4. Initialize step counter
        
        EXAMPLE:
        ```python
        optimizer = SGD([w1, w2], learning_rate=0.1)
        scheduler = StepLR(optimizer, step_size=10, gamma=0.1)
        
        # In training loop:
        for epoch in range(100):
            train_one_epoch()
            scheduler.step()  # Update learning rate
        ```
        
        HINTS:
        - Store optimizer reference
        - Save initial learning rate from optimizer
        - Initialize step counter to 0
        - gamma is the decay factor (0.1 = 10x reduction)
        """
        ### BEGIN SOLUTION
        self.optimizer = optimizer
        self.step_size = step_size
        self.gamma = gamma
        # Decay is always computed fresh from the initial rate (never
        # compounded in place), so repeated step() calls cannot
        # accumulate floating-point rounding error.
        self.initial_lr = optimizer.learning_rate
        self.step_count = 0
        ### END SOLUTION
    
    def step(self) -> None:
        """
        Update learning rate based on current step.
        
        TODO: Implement learning rate update.
        
        APPROACH:
        1. Increment step counter
        2. Calculate new learning rate using step decay formula
        3. Update optimizer's learning rate
        
        MATHEMATICAL FORMULATION:
        new_lr = initial_lr * (gamma ** ((step_count - 1) // step_size))
        
        The ``- 1`` keeps the rate unchanged for the first step_size
        calls, so the first decay happens on call number step_size + 1.
        
        IMPLEMENTATION HINTS:
        - Use // for integer division
        - Use ** for exponentiation
        - Update optimizer.learning_rate directly
        """
        ### BEGIN SOLUTION
        self.step_count += 1
        
        # Calculate new learning rate
        decay_factor = self.gamma ** ((self.step_count - 1) // self.step_size)
        new_lr = self.initial_lr * decay_factor
        
        # Update optimizer's learning rate
        self.optimizer.learning_rate = new_lr
        ### END SOLUTION
    
    def get_lr(self) -> float:
        """
        Get current learning rate.
        
        TODO: Return current learning rate.
        
        IMPLEMENTATION HINTS:
        - Return optimizer.learning_rate
        """
        ### BEGIN SOLUTION
        return self.optimizer.learning_rate
        ### END SOLUTION
def test_step_scheduler_comprehensive():
    """Test StepLR scheduler implementation"""
    print("๐Ÿ”ฌ Unit Test: Step Learning Rate Scheduler...")
    
    # Create test parameters and optimizer
    w = Variable(1.0, requires_grad=True)
    optimizer = SGD([w], learning_rate=0.1)
    
    def advance(sched, n_steps):
        # Local helper: tick the scheduler n_steps times.
        for _ in range(n_steps):
            sched.step()
    
    # Test scheduler initialization
    try:
        scheduler = StepLR(optimizer, step_size=10, gamma=0.1)
        
        # Test initial learning rate
        assert scheduler.get_lr() == 0.1, f"Initial learning rate should be 0.1, got {scheduler.get_lr()}"
        print("โœ… Initial learning rate is correct")
        
    except Exception as e:
        print(f"โŒ Initial learning rate failed: {e}")
        raise
    
    # Test step-based decay
    try:
        # Steps 1-10: no decay (decay happens after step 10)
        advance(scheduler, 10)
        
        assert scheduler.get_lr() == 0.1, f"Learning rate should still be 0.1 after 10 steps, got {scheduler.get_lr()}"
        
        # Step 11: decay should occur
        scheduler.step()
        expected_lr = 0.1 * 0.1  # 0.01
        assert abs(scheduler.get_lr() - expected_lr) < 1e-6, f"Learning rate should be {expected_lr} after 11 steps, got {scheduler.get_lr()}"
        print("โœ… Step-based decay works correctly")
        
    except Exception as e:
        print(f"โŒ Step-based decay failed: {e}")
        raise
    
    # Test multiple decay levels
    try:
        # Steps 12-20: should stay at 0.01
        advance(scheduler, 9)
        
        assert abs(scheduler.get_lr() - 0.01) < 1e-6, f"Learning rate should be 0.01 after 20 steps, got {scheduler.get_lr()}"
        
        # Step 21: another decay
        scheduler.step()
        expected_lr = 0.01 * 0.1  # 0.001
        assert abs(scheduler.get_lr() - expected_lr) < 1e-6, f"Learning rate should be {expected_lr} after 21 steps, got {scheduler.get_lr()}"
        print("โœ… Multiple decay levels work correctly")
        
    except Exception as e:
        print(f"โŒ Multiple decay levels failed: {e}")
        raise
    
    # Test with different optimizer
    try:
        w2 = Variable(2.0, requires_grad=True)
        adam_optimizer = Adam([w2], learning_rate=0.001)
        adam_scheduler = StepLR(adam_optimizer, step_size=5, gamma=0.5)
        
        # Test initial learning rate
        assert adam_scheduler.get_lr() == 0.001, f"Initial Adam learning rate should be 0.001, got {adam_scheduler.get_lr()}"
        
        # Test decay after 5 steps
        advance(adam_scheduler, 5)
        
        # Learning rate should still be 0.001 after 5 steps
        assert adam_scheduler.get_lr() == 0.001, f"Adam learning rate should still be 0.001 after 5 steps, got {adam_scheduler.get_lr()}"
        
        # Step 6: decay should occur
        adam_scheduler.step()
        expected_lr = 0.001 * 0.5  # 0.0005
        assert abs(adam_scheduler.get_lr() - expected_lr) < 1e-6, f"Adam learning rate should be {expected_lr} after 6 steps, got {adam_scheduler.get_lr()}"
        print("โœ… Works with different optimizers")
        
    except Exception as e:
        print(f"โŒ Different optimizers failed: {e}")
        raise

    print("๐ŸŽฏ Step learning rate scheduler behavior:")
    print("   Reduces learning rate at regular intervals")
    print("   Multiplies current rate by gamma factor")
    print("   Works with any optimizer (SGD, Adam, etc.)")
    print("๐Ÿ“ˆ Progress: Step Learning Rate Scheduler โœ“")
+ "\n", + "# Run the test\n", + "test_step_scheduler_comprehensive()" + ] + }, + { + "cell_type": "markdown", + "id": "2fc52bc2", + "metadata": { + "cell_marker": "\"\"\"", + "lines_to_next_cell": 1 + }, + "source": [ + "## Step 5: Integration - Complete Training Example\n", + "\n", + "### Putting It All Together\n", + "Let's see how optimizers enable complete neural network training:\n", + "\n", + "1. **Forward pass**: Compute predictions\n", + "2. **Loss computation**: Compare with targets\n", + "3. **Backward pass**: Compute gradients\n", + "4. **Optimizer step**: Update parameters\n", + "5. **Learning rate scheduling**: Adjust learning rate\n", + "\n", + "### The Modern Training Loop\n", + "```python\n", + "# Setup\n", + "optimizer = Adam(model.parameters(), learning_rate=0.001)\n", + "scheduler = StepLR(optimizer, step_size=10, gamma=0.1)\n", + "\n", + "# Training loop\n", + "for epoch in range(num_epochs):\n", + " for batch in dataloader:\n", + " # Forward pass\n", + " predictions = model(batch.inputs)\n", + " loss = criterion(predictions, batch.targets)\n", + " \n", + " # Backward pass\n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " optimizer.step()\n", + " \n", + " # Update learning rate\n", + " scheduler.step()\n", + "```\n", + "\n", + "Let's implement a complete training example!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3205aad", + "metadata": { + "lines_to_next_cell": 1, + "nbgrader": { + "grade": false, + "grade_id": "training-integration", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "outputs": [], + "source": [ + "def train_simple_model():\n", + " \"\"\"\n", + " Complete training example using optimizers.\n", + " \n", + " TODO: Implement a complete training loop.\n", + " \n", + " APPROACH:\n", + " 1. Create a simple model (linear regression)\n", + " 2. Generate training data\n", + " 3. Set up optimizer and scheduler\n", + " 4. Train for several epochs\n", + " 5. 
Show convergence\n", + " \n", + " LEARNING OBJECTIVE:\n", + " - See how optimizers enable real learning\n", + " - Compare SGD vs Adam performance\n", + " - Understand the complete training workflow\n", + " \"\"\"\n", + " ### BEGIN SOLUTION\n", + " print(\"Training simple linear regression model...\")\n", + " \n", + " # Create simple model: y = w*x + b\n", + " w = Variable(0.1, requires_grad=True) # Initialize near zero\n", + " b = Variable(0.0, requires_grad=True)\n", + " \n", + " # Training data: y = 2*x + 1\n", + " x_data = [1.0, 2.0, 3.0, 4.0, 5.0]\n", + " y_data = [3.0, 5.0, 7.0, 9.0, 11.0]\n", + " \n", + " # Try SGD first\n", + " print(\"\\n๐Ÿ” Training with SGD...\")\n", + " optimizer_sgd = SGD([w, b], learning_rate=0.01, momentum=0.9)\n", + " \n", + " for epoch in range(60):\n", + " total_loss = 0\n", + " \n", + " for x_val, y_val in zip(x_data, y_data):\n", + " # Forward pass\n", + " x = Variable(x_val, requires_grad=False)\n", + " y_target = Variable(y_val, requires_grad=False)\n", + " \n", + " # Prediction: y = w*x + b\n", + " try:\n", + " from tinytorch.core.autograd import add, multiply, subtract\n", + " except ImportError:\n", + " setup_import_paths()\n", + " from autograd_dev import add, multiply, subtract\n", + " \n", + " prediction = add(multiply(w, x), b)\n", + " \n", + " # Loss: (prediction - target)^2\n", + " error = subtract(prediction, y_target)\n", + " loss = multiply(error, error)\n", + " \n", + " # Backward pass\n", + " optimizer_sgd.zero_grad()\n", + " loss.backward()\n", + " optimizer_sgd.step()\n", + " \n", + " total_loss += loss.data.data.item()\n", + " \n", + " if epoch % 10 == 0:\n", + " print(f\"Epoch {epoch}: Loss = {total_loss:.4f}, w = {w.data.data.item():.3f}, b = {b.data.data.item():.3f}\")\n", + " \n", + " sgd_final_w = w.data.data.item()\n", + " sgd_final_b = b.data.data.item()\n", + " \n", + " # Reset parameters and try Adam\n", + " print(\"\\n๐Ÿ” Training with Adam...\")\n", + " w.data = Tensor(0.1)\n", + " b.data = 
Tensor(0.0)\n", + " \n", + " optimizer_adam = Adam([w, b], learning_rate=0.01)\n", + " \n", + " for epoch in range(60):\n", + " total_loss = 0\n", + " \n", + " for x_val, y_val in zip(x_data, y_data):\n", + " # Forward pass\n", + " x = Variable(x_val, requires_grad=False)\n", + " y_target = Variable(y_val, requires_grad=False)\n", + " \n", + " # Prediction: y = w*x + b\n", + " prediction = add(multiply(w, x), b)\n", + " \n", + " # Loss: (prediction - target)^2\n", + " error = subtract(prediction, y_target)\n", + " loss = multiply(error, error)\n", + " \n", + " # Backward pass\n", + " optimizer_adam.zero_grad()\n", + " loss.backward()\n", + " optimizer_adam.step()\n", + " \n", + " total_loss += loss.data.data.item()\n", + " \n", + " if epoch % 10 == 0:\n", + " print(f\"Epoch {epoch}: Loss = {total_loss:.4f}, w = {w.data.data.item():.3f}, b = {b.data.data.item():.3f}\")\n", + " \n", + " adam_final_w = w.data.data.item()\n", + " adam_final_b = b.data.data.item()\n", + " \n", + " print(f\"\\n๐Ÿ“Š Results:\")\n", + " print(f\"Target: w = 2.0, b = 1.0\")\n", + " print(f\"SGD: w = {sgd_final_w:.3f}, b = {sgd_final_b:.3f}\")\n", + " print(f\"Adam: w = {adam_final_w:.3f}, b = {adam_final_b:.3f}\")\n", + " \n", + " return sgd_final_w, sgd_final_b, adam_final_w, adam_final_b\n", + " ### END SOLUTION" + ] + }, + { + "cell_type": "markdown", + "id": "0a5330c4", + "metadata": { + "cell_marker": "\"\"\"", + "lines_to_next_cell": 1 + }, + "source": [ + "### ๐Ÿงช Unit Test: Complete Training Integration\n", + "\n", + "Let's test your complete training integration! This demonstrates optimizers working together in a realistic training scenario.\n", + "\n", + "**This is a unit test** - it tests the complete training workflow with optimizers in isolation." 
def test_training_integration_comprehensive():
    """Test complete training integration with optimizers"""
    print("๐Ÿ”ฌ Unit Test: Complete Training Integration...")
    
    # Test training with SGD and Adam
    try:
        sgd_w, sgd_b, adam_w, adam_b = train_simple_model()
        
        # Test SGD convergence
        assert abs(sgd_w - 2.0) < 0.1, f"SGD should converge close to w=2.0, got {sgd_w}"
        assert abs(sgd_b - 1.0) < 0.1, f"SGD should converge close to b=1.0, got {sgd_b}"
        print("โœ… SGD convergence works")
        
        # Test Adam convergence (may be different due to adaptive learning rates)
        assert abs(adam_w - 2.0) < 1.0, f"Adam should converge reasonably close to w=2.0, got {adam_w}"
        assert abs(adam_b - 1.0) < 1.0, f"Adam should converge reasonably close to b=1.0, got {adam_b}"
        print("โœ… Adam convergence works")
        
    except Exception as e:
        print(f"โŒ Training integration failed: {e}")
        raise
    
    # Test optimizer comparison
    try:
        # Both optimizers should achieve reasonable results
        sgd_error = (sgd_w - 2.0)**2 + (sgd_b - 1.0)**2
        adam_error = (adam_w - 2.0)**2 + (adam_b - 1.0)**2
        
        # FIX: the old comment claimed both errors must be < 0.1, but the
        # assertions use different bounds: SGD < 0.1, Adam < 1.0 (Adam's
        # adaptive steps settle more slowly on this tiny problem).
        assert sgd_error < 0.1, f"SGD error should be < 0.1, got {sgd_error}"
        assert adam_error < 1.0, f"Adam error should be < 1.0, got {adam_error}"
        print("โœ… Optimizer comparison works")
        
    except Exception as e:
        print(f"โŒ Optimizer comparison failed: {e}")
        raise
    
    # Test gradient flow
    try:
        # Create a simple test to verify gradients flow correctly
        w = Variable(1.0, requires_grad=True)
        b = Variable(0.0, requires_grad=True)
        
        # Set up simple gradients
        w.grad = Variable(0.1)
        b.grad = Variable(0.05)
        
        # Test SGD step
        sgd_optimizer = SGD([w, b], learning_rate=0.1)
        original_w = w.data.data.item()
        original_b = b.data.data.item()
        
        sgd_optimizer.step()
        
        # Check updates
        assert w.data.data.item() != original_w, "SGD should update w"
        assert b.data.data.item() != original_b, "SGD should update b"
        print("โœ… Gradient flow works correctly")
        
    except Exception as e:
        print(f"โŒ Gradient flow failed: {e}")
        raise

    print("๐ŸŽฏ Training integration behavior:")
    print("   Optimizers successfully minimize loss functions")
    print("   SGD and Adam both converge to target values")
    print("   Gradient computation and updates work correctly")
    print("   Ready for real neural network training")
    print("๐Ÿ“ˆ Progress: Complete Training Integration โœ“")

# Run the test
test_training_integration_comprehensive()
You've successfully implemented the optimization algorithms that power all modern neural network training:\n", + "\n", + "## โœ… What You've Built\n", + "- **Gradient Descent**: The fundamental parameter update mechanism\n", + "- **SGD with Momentum**: Accelerated convergence with velocity accumulation\n", + "- **Adam Optimizer**: Adaptive learning rates with first and second moments\n", + "- **Learning Rate Scheduling**: Smart learning rate adjustment during training\n", + "- **Complete Training Integration**: End-to-end training workflow\n", + "\n", + "## โœ… Key Learning Outcomes\n", + "- **Understanding**: How optimizers use gradients to update parameters intelligently\n", + "- **Implementation**: Built SGD and Adam optimizers from mathematical foundations\n", + "- **Mathematical mastery**: Momentum, adaptive learning rates, bias correction\n", + "- **Systems integration**: Complete training loops with scheduling\n", + "- **Real-world application**: Modern deep learning training workflow\n", + "\n", + "## โœ… Mathematical Foundations Mastered\n", + "- **Gradient Descent**: ฮธ = ฮธ - ฮฑโˆ‡L(ฮธ) for parameter updates\n", + "- **Momentum**: v_t = ฮฒv_{t-1} + โˆ‡L(ฮธ) for acceleration\n", + "- **Adam**: Adaptive learning rates with exponential moving averages\n", + "- **Learning Rate Scheduling**: Strategic learning rate adjustment\n", + "\n", + "## โœ… Professional Skills Developed\n", + "- **Algorithm implementation**: Translating mathematical formulas into code\n", + "- **State management**: Tracking optimizer buffers and statistics\n", + "- **Hyperparameter design**: Understanding the impact of learning rate, momentum, etc.\n", + "- **Training orchestration**: Complete training loop design\n", + "\n", + "## โœ… Ready for Advanced Applications\n", + "Your optimizers now enable:\n", + "- **Deep Neural Networks**: Effective training of complex architectures\n", + "- **Computer Vision**: Training CNNs, ResNets, Vision Transformers\n", + "- **Natural Language 
Processing**: Training transformers and language models\n", + "- **Any ML Model**: Gradient-based optimization for any differentiable system\n", + "\n", + "## ๐Ÿ”— Connection to Real ML Systems\n", + "Your implementations mirror production systems:\n", + "- **PyTorch**: `torch.optim.SGD()`, `torch.optim.Adam()`, `torch.optim.lr_scheduler.StepLR()`\n", + "- **TensorFlow**: `tf.keras.optimizers.SGD()`, `tf.keras.optimizers.Adam()`\n", + "- **Industry Standard**: Every major ML framework uses these exact algorithms\n", + "\n", + "## ๐ŸŽฏ The Power of Intelligent Optimization\n", + "You've unlocked the algorithms that made modern AI possible:\n", + "- **Scalability**: Efficiently optimize millions of parameters\n", + "- **Adaptability**: Different learning rates for different parameters\n", + "- **Robustness**: Handle noisy gradients and ill-conditioned problems\n", + "- **Universality**: Work with any differentiable neural network\n", + "\n", + "## ๐Ÿง  Deep Learning Revolution\n", + "You now understand the optimization technology that powers:\n", + "- **ImageNet**: Training state-of-the-art computer vision models\n", + "- **Language Models**: Training GPT, BERT, and other transformers\n", + "- **Modern AI**: Every breakthrough relies on these optimization algorithms\n", + "- **Future Research**: Your understanding enables you to develop new optimizers\n", + "\n", + "## ๐Ÿš€ What's Next\n", + "Your optimizers are the foundation for:\n", + "- **Training Module**: Complete training loops with loss functions and metrics\n", + "- **Advanced Optimizers**: RMSprop, AdaGrad, learning rate warm-up\n", + "- **Distributed Training**: Multi-GPU optimization strategies\n", + "- **Research**: Experimenting with novel optimization algorithms\n", + "\n", + "**Next Module**: Complete training systems that orchestrate your optimizers for real-world ML!\n", + "\n", + "You've built the intelligent algorithms that enable neural networks to learn. 
Now let's use them to train systems that can solve complex real-world problems!\n", + "\"\"\"\n", + "\n", + "Run inline tests when module is executed directly\n", + "if __name__ == \"__main__\":\n", + " from tito.tools.testing import run_module_tests_auto\n", + " \n", + " # Automatically discover and run all tests in this module\n", + " run_module_tests_auto(\"Optimizers\") " + ] + } + ], + "metadata": { + "jupytext": { + "main_language": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tinytorch/core/optimizers.py b/tinytorch/core/optimizers.py new file mode 100644 index 00000000..bec2618a --- /dev/null +++ b/tinytorch/core/optimizers.py @@ -0,0 +1,502 @@ +# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/08_optimizers/optimizers_dev.ipynb. + +# %% auto 0 +__all__ = ['setup_import_paths', 'gradient_descent_step', 'SGD', 'Adam', 'StepLR'] + +# %% ../../modules/source/08_optimizers/optimizers_dev.ipynb 1 +import math +import numpy as np +import sys +import os +from typing import List, Dict, Any, Optional, Union +from collections import defaultdict + +# Helper function to set up import paths +def setup_import_paths(): + """Set up import paths for development modules.""" + import sys + import os + + # Add module directories to path + base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + tensor_dir = os.path.join(base_dir, '01_tensor') + autograd_dir = os.path.join(base_dir, '07_autograd') + + if tensor_dir not in sys.path: + sys.path.append(tensor_dir) + if autograd_dir not in sys.path: + sys.path.append(autograd_dir) + +# Import our existing components +try: + from tinytorch.core.tensor import Tensor + from tinytorch.core.autograd import Variable +except ImportError: + # For development, try local imports + try: + setup_import_paths() + from tensor_dev import Tensor + from autograd_dev import Variable + except ImportError: + # Create minimal fallback classes for testing + print("Warning: Using fallback classes 
for testing") + + class Tensor: + def __init__(self, data): + self.data = np.array(data) + self.shape = self.data.shape + + def __str__(self): + return f"Tensor({self.data})" + + class Variable: + def __init__(self, data, requires_grad=True): + if isinstance(data, (int, float)): + self.data = Tensor([data]) + else: + self.data = Tensor(data) + self.requires_grad = requires_grad + self.grad = None + + def zero_grad(self): + self.grad = None + + def __str__(self): + return f"Variable({self.data.data})" + +# %% ../../modules/source/08_optimizers/optimizers_dev.ipynb 6 +def gradient_descent_step(parameter: Variable, learning_rate: float) -> None: + """ + Perform one step of gradient descent on a parameter. + + Args: + parameter: Variable with gradient information + learning_rate: How much to update parameter + + TODO: Implement basic gradient descent parameter update. + + STEP-BY-STEP IMPLEMENTATION: + 1. Check if parameter has a gradient + 2. Get current parameter value and gradient + 3. Update parameter: new_value = old_value - learning_rate * gradient + 4. Update parameter data with new value + 5. 
class SGD:
    """
    Stochastic gradient descent with optional momentum and weight decay.

    Per-parameter update rule (g_t already includes weight decay):
        v_t     = momentum * v_{t-1} + g_t
        theta_t = theta_{t-1} - learning_rate * v_t

    Mirrors the behavior of ``torch.optim.SGD``.
    """

    def __init__(self, parameters: List[Variable], learning_rate: float = 0.01,
                 momentum: float = 0.0, weight_decay: float = 0.0):
        """
        Initialize SGD optimizer.

        Args:
            parameters: Variables to optimize.
            learning_rate: Step size (default: 0.01).
            momentum: Velocity decay coefficient; 0.0 gives plain SGD
                (default: 0.0).
            weight_decay: L2 regularization coefficient (default: 0.0).
        """
        ### BEGIN SOLUTION
        self.parameters = parameters
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.weight_decay = weight_decay

        # Velocity buffers keyed by id(parameter); allocated lazily on the
        # first step() so parameters added here need no extra setup.
        self.momentum_buffers = {}

        # Number of completed optimization steps.
        self.step_count = 0
        ### END SOLUTION

    def step(self) -> None:
        """
        Apply one momentum-SGD update to every parameter that has a
        gradient; parameters with ``grad is None`` are skipped.
        """
        ### BEGIN SOLUTION
        for p in self.parameters:
            if p.grad is None:
                continue  # did not participate in this backward pass

            grad = p.grad.data.data
            if self.weight_decay > 0:
                # L2 regularization folds directly into the gradient.
                grad = grad + self.weight_decay * p.data.data

            key = id(p)
            if key not in self.momentum_buffers:
                self.momentum_buffers[key] = np.zeros_like(p.data.data)

            # Blend the previous velocity with the fresh gradient.
            velocity = self.momentum * self.momentum_buffers[key] + grad
            self.momentum_buffers[key] = velocity

            # Descend along the velocity direction.
            p.data = Tensor(p.data.data - self.learning_rate * velocity)

        self.step_count += 1
        ### END SOLUTION

    def zero_grad(self) -> None:
        """
        Clear the gradient of every parameter (set it to None).

        Call before ``loss.backward()`` so gradients from the previous
        iteration do not accumulate into the new ones.
        """
        ### BEGIN SOLUTION
        for p in self.parameters:
            p.grad = None
        ### END SOLUTION
class Adam:
    """
    Adam optimizer (adaptive moment estimation).

    Tracks an exponential moving average of the gradient (first moment)
    and of the squared gradient (second moment), corrects both for their
    zero initialization, and scales each parameter's update individually:

        m_t = beta1 * m_{t-1} + (1 - beta1) * g
        v_t = beta2 * v_{t-1} + (1 - beta2) * g^2
        p   = p - lr * m_hat / (sqrt(v_hat) + epsilon)
    """

    def __init__(self, parameters: List[Variable], learning_rate: float = 0.001,
                 beta1: float = 0.9, beta2: float = 0.999, epsilon: float = 1e-8,
                 weight_decay: float = 0.0):
        """
        Args:
            parameters: Variables whose .data is replaced in place by step().
            learning_rate: Base step size (default 0.001).
            beta1: Decay rate of the first-moment average (default 0.9).
            beta2: Decay rate of the second-moment average (default 0.999).
            epsilon: Additive constant keeping the divisor nonzero.
            weight_decay: L2 regularization strength; 0.0 disables it.
        """
        self.parameters = parameters
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.weight_decay = weight_decay
        # Moment estimates keyed by id(parameter); created lazily in step().
        self.first_moment = {}   # m_t
        self.second_moment = {}  # v_t
        # 1-based step counter t, required by the bias-correction terms.
        self.step_count = 0

    def step(self) -> None:
        """Apply one Adam update to every parameter that has a gradient."""
        self.step_count += 1
        b1, b2 = self.beta1, self.beta2
        for param in self.parameters:
            if param.grad is None:
                continue  # no gradient flowed to this parameter
            grad = param.grad.data.data
            # Fold L2 regularization into the gradient before the moments.
            if self.weight_decay > 0:
                grad = grad + self.weight_decay * param.data.data
            key = id(param)
            if key not in self.first_moment:
                self.first_moment[key] = np.zeros_like(param.data.data)
                self.second_moment[key] = np.zeros_like(param.data.data)
            m = b1 * self.first_moment[key] + (1 - b1) * grad
            v = b2 * self.second_moment[key] + (1 - b2) * (grad * grad)
            self.first_moment[key] = m
            self.second_moment[key] = v
            # Bias correction: early averages are pulled toward 0 by the
            # zero-initialized buffers; dividing by (1 - beta^t) undoes that.
            m_hat = m / (1 - b1 ** self.step_count)
            v_hat = v / (1 - b2 ** self.step_count)
            param.data = Tensor(
                param.data.data
                - self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon)
            )

    def zero_grad(self) -> None:
        """Clear gradients on all parameters (call before each backward pass)."""
        for param in self.parameters:
            param.grad = None
class StepLR:
    """
    Step-decay learning-rate scheduler.

    Every `step_size` calls to step(), the wrapped optimizer's learning
    rate is reduced by a factor of `gamma`:

        lr = initial_lr * gamma ** ((step_count - 1) // step_size)
    """

    def __init__(self, optimizer: Union[SGD, Adam], step_size: int, gamma: float = 0.1):
        """
        Args:
            optimizer: Optimizer whose `learning_rate` attribute is managed.
            step_size: Number of step() calls between successive decays.
            gamma: Multiplicative decay factor (0.1 means a 10x reduction).
        """
        self.optimizer = optimizer
        self.step_size = step_size
        self.gamma = gamma
        # Remember the starting rate so each decay is computed absolutely,
        # avoiding compounding floating-point drift.
        self.initial_lr = optimizer.learning_rate
        self.step_count = 0

    def step(self) -> None:
        """Advance one epoch and recompute the optimizer's learning rate."""
        self.step_count += 1
        # Integer division counts the completed decay intervals so far.
        intervals = (self.step_count - 1) // self.step_size
        self.optimizer.learning_rate = self.initial_lr * self.gamma ** intervals

    def get_lr(self) -> float:
        """Return the learning rate currently set on the wrapped optimizer."""
        return self.optimizer.learning_rate
# %% auto 0
__all__ = ['setup_import_paths', 'MeanSquaredError', 'CrossEntropyLoss', 'BinaryCrossEntropyLoss', 'Accuracy', 'Trainer']

# %% ../../modules/source/09_training/training_dev.ipynb 1
import numpy as np
import sys
import os
import pickle
import json
from pathlib import Path
from typing import List, Dict, Any, Optional, Union, Callable, Tuple
from collections import defaultdict
import time

# Helper function to set up import paths
def setup_import_paths():
    """Add the sibling development-module directories to sys.path.

    Each directory is appended only if it is not already present — the same
    guard used by the optimizers module's helper — so importing this module
    repeatedly does not grow sys.path without bound.
    """
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    module_dirs = [
        '01_tensor', '02_activations', '03_layers', '04_networks',
        '05_cnn', '06_dataloader', '07_autograd', '08_optimizers'
    ]
    for module_dir in module_dirs:
        module_path = os.path.join(base_dir, module_dir)
        # BUGFIX: guard against duplicate sys.path entries on re-import.
        if module_path not in sys.path:
            sys.path.append(module_path)

# Set up paths
setup_import_paths()

# Import all the building blocks we need
try:
    from tinytorch.core.tensor import Tensor
    from tinytorch.core.activations import ReLU, Sigmoid, Tanh, Softmax
    from tinytorch.core.layers import Dense
    from tinytorch.core.networks import Sequential, create_mlp
    from tinytorch.core.cnn import Conv2D, flatten
    from tinytorch.core.dataloader import Dataset, DataLoader
    from tinytorch.core.autograd import Variable
    from tinytorch.core.optimizers import SGD, Adam, StepLR
except ImportError:
    # For development, import from the local *_dev modules instead
    try:
        from tensor_dev import Tensor
        from activations_dev import ReLU, Sigmoid, Tanh, Softmax
        from layers_dev import Dense
        from networks_dev import Sequential, create_mlp
        from cnn_dev import Conv2D, flatten
        from dataloader_dev import Dataset, DataLoader
        from autograd_dev import Variable
        from optimizers_dev import SGD, Adam, StepLR
    except ImportError:
        # Minimal mock classes so the module stays importable for testing
        class Tensor:
            """Bare-bones stand-in: wraps a numpy array."""
            def __init__(self, data):
                self.data = np.array(data)

            def __str__(self):
                return f"Tensor({self.data})"

        class Variable:
            """Stand-in autograd node: holds data and an optional gradient."""
            def __init__(self, data, requires_grad=True):
                self.data = Tensor(data)
                self.requires_grad = requires_grad
                self.grad = None

            def zero_grad(self):
                self.grad = None

            def backward(self):
                # Mock backward pass: pretend d(self)/d(self) == 1.0.
                if self.requires_grad:
                    self.grad = Variable(1.0, requires_grad=False)

            def __str__(self):
                return f"Variable({self.data})"

        class SGD:
            """Stand-in optimizer: stores parameters; step() is a no-op."""
            def __init__(self, parameters, learning_rate=0.01):
                self.parameters = parameters
                self.learning_rate = learning_rate

            def zero_grad(self):
                for param in self.parameters:
                    if hasattr(param, 'zero_grad'):
                        param.zero_grad()

            def step(self):
                pass

        class Sequential:
            """Stand-in network: applies layers left to right."""
            def __init__(self, layers=None):
                self.layers = layers or []

            def __call__(self, x):
                for layer in self.layers:
                    x = layer(x)
                return x

        class DataLoader:
            """Stand-in loader: yields a single fixed batch."""
            def __init__(self, dataset, batch_size=32, shuffle=True):
                self.dataset = dataset
                self.batch_size = batch_size
                self.shuffle = shuffle

            def __iter__(self):
                return iter([(Tensor([1, 2, 3]), Tensor([0]))])
class MeanSquaredError:
    """
    Mean squared error loss for regression.

    MSE = mean((y_pred - y_true) ** 2), averaged over every element.
    """

    def __init__(self):
        """MSE is stateless; nothing to configure."""
        pass

    def __call__(self, y_pred: Tensor, y_true: Tensor) -> Tensor:
        """Return the scalar MSE between predictions and targets.

        Args:
            y_pred: Predictions of any shape.
            y_true: Targets with the same shape as y_pred.

        Returns:
            Scalar Tensor holding the average squared error.
        """
        # Element-wise residuals, squared via the Tensor arithmetic ops.
        residual = y_pred - y_true
        squared = residual * residual
        # Reduce to a single scalar and re-wrap as a Tensor.
        return Tensor(np.mean(squared.data))

    def forward(self, y_pred: Tensor, y_true: Tensor) -> Tensor:
        """Alias for __call__, mirroring the framework's layer interface."""
        return self.__call__(y_pred, y_true)
class CrossEntropyLoss:
    """
    Cross-entropy loss for multi-class classification.

    Accepts raw logits and applies a numerically stable softmax internally.
    Targets may be integer class indices (shape [batch]) or one-hot rows
    (shape [batch, num_classes]).
    """

    def __init__(self):
        """Stateless loss; nothing to configure."""
        pass

    def __call__(self, y_pred: Tensor, y_true: Tensor) -> Tensor:
        """Return the scalar cross-entropy between logits and targets.

        Args:
            y_pred: Raw logits, shape [batch, num_classes] (or a 1D row
                for a single sample, which is promoted to 2D).
            y_true: Integer class indices or one-hot encoded labels.

        Returns:
            Scalar Tensor holding the batch-mean negative log-likelihood.
        """
        # Promote a single 1D sample to a one-row batch for uniform handling.
        logits = y_pred.data if y_pred.data.ndim != 1 else y_pred.data.reshape(1, -1)

        # Stable softmax: subtracting the row max before exponentiating
        # prevents overflow without changing the result.
        shifted = np.exp(logits - np.max(logits, axis=1, keepdims=True))
        probs = shifted / np.sum(shifted, axis=1, keepdims=True)

        # Clip away exact 0/1 so log() stays finite.
        eps = 1e-15
        probs = np.clip(probs, eps, 1.0 - eps)

        if len(y_true.data.shape) == 1:
            # Integer class indices: pick each row's true-class probability.
            batch = y_true.data.shape[0]
            picked = probs[np.arange(batch), y_true.data.astype(int)]
            value = -np.mean(np.log(picked))
        else:
            # One-hot targets: sum over classes, then average the batch.
            value = -np.mean(np.sum(y_true.data * np.log(probs), axis=1))

        return Tensor(value)

    def forward(self, y_pred: Tensor, y_true: Tensor) -> Tensor:
        """Alias for __call__."""
        return self.__call__(y_pred, y_true)
class BinaryCrossEntropyLoss:
    """
    Binary cross-entropy loss, computed directly from raw logits.

    BCE = mean over the batch of
        -[y * log(sigmoid(x)) + (1 - y) * log(1 - sigmoid(x))]
    evaluated in a numerically stable form that never exponentiates a
    large positive number.
    """

    def __init__(self):
        """Stateless loss; nothing to configure."""
        pass

    def __call__(self, y_pred: Tensor, y_true: Tensor) -> Tensor:
        """Return the scalar BCE between logits and binary labels.

        Args:
            y_pred: Raw logits, shape [batch] or [batch, 1].
            y_true: Binary labels in {0, 1}, same number of elements.

        Returns:
            Scalar Tensor holding the batch-mean loss.
        """
        x = y_pred.data.flatten()  # logits
        y = y_true.data.flatten()  # labels in {0, 1}

        # Shared stable term: log(1 + exp(-|x|)) cannot overflow.
        tail = np.log(1 + np.exp(-np.abs(x)))
        # log(sigmoid(x))     =  x - max(x, 0)  - tail
        # log(1 - sigmoid(x)) = -x - max(-x, 0) - tail
        log_sig = x - np.maximum(0, x) - tail
        log_one_minus_sig = -x - np.maximum(0, -x) - tail

        per_sample = -(y * log_sig + (1 - y) * log_one_minus_sig)
        return Tensor(np.mean(per_sample))

    def forward(self, y_pred: Tensor, y_true: Tensor) -> Tensor:
        """Alias for __call__."""
        return self.__call__(y_pred, y_true)
class Accuracy:
    """
    Classification accuracy metric.

    Accuracy = (# predictions matching the labels) / (# predictions),
    returned as a plain Python float in [0, 1].
    """

    def __init__(self):
        """Stateless metric; nothing to configure."""
        pass

    def __call__(self, y_pred: Tensor, y_true: Tensor) -> float:
        """Return the fraction of correct predictions.

        Args:
            y_pred: Per-class scores [batch, num_classes] (argmax is taken)
                or a single score per sample (thresholded at 0.5).
            y_true: Integer class labels, or one-hot rows (argmax is taken).

        Returns:
            Accuracy as a float between 0 and 1.
        """
        scores = y_pred.data
        if scores.ndim > 1 and scores.shape[1] > 1:
            predicted = np.argmax(scores, axis=1)  # multi-class rows
        else:
            predicted = (scores.flatten() > 0.5).astype(int)  # binary scores

        labels = y_true.data
        if labels.ndim > 1 and labels.shape[1] > 1:
            actual = np.argmax(labels, axis=1)  # one-hot -> class indices
        else:
            actual = labels.flatten().astype(int)

        # Mean of the boolean match vector == correct / total.
        return float(np.mean(predicted == actual))

    def forward(self, y_pred: Tensor, y_true: Tensor) -> float:
        """Alias for __call__."""
        return self.__call__(y_pred, y_true)
class Trainer:
    """
    Orchestrates the training loop: batches -> forward -> loss -> update,
    with per-epoch metric averaging and history tracking.
    """

    def __init__(self, model, optimizer, loss_function, metrics=None):
        """
        Args:
            model: Callable mapping a batch of inputs to predictions.
            optimizer: Object exposing zero_grad() and step().
            loss_function: Callable (predictions, targets) -> loss with .data.
            metrics: Optional list of metric callables; each is tracked in
                the history under its lowercased class name.
        """
        self.model = model
        self.optimizer = optimizer
        self.loss_function = loss_function
        self.metrics = metrics or []

        # Per-epoch history of losses and metrics.
        self.history = {'train_loss': [], 'val_loss': [], 'epoch': []}
        for metric in self.metrics:
            name = metric.__class__.__name__.lower()
            self.history['train_' + name] = []
            self.history['val_' + name] = []

        # Training state counters.
        self.current_epoch = 0
        self.current_step = 0

    def _epoch_pass(self, dataloader, training):
        """Run one pass over dataloader; update parameters iff training.

        Returns a dict of batch-averaged loss and metric values.
        """
        names = [m.__class__.__name__.lower() for m in self.metrics]
        totals = {'loss': 0.0}
        for name in names:
            totals[name] = 0.0

        batches = 0
        for inputs, targets in dataloader:
            if training:
                self.optimizer.zero_grad()

            outputs = self.model(inputs)
            loss = self.loss_function(outputs, targets)

            if training:
                # Backward pass (simplified — the autograd hookup would be):
                # loss.backward()
                self.optimizer.step()

            totals['loss'] += loss.data
            for metric, name in zip(self.metrics, names):
                totals[name] += metric(outputs, targets)

            batches += 1
            if training:
                self.current_step += 1

        # Average every tracked quantity over the batch count.
        return {key: value / batches for key, value in totals.items()}

    def train_epoch(self, dataloader):
        """Train on every batch in dataloader; return averaged metrics."""
        return self._epoch_pass(dataloader, training=True)

    def validate_epoch(self, dataloader):
        """Evaluate on dataloader without updating parameters."""
        return self._epoch_pass(dataloader, training=False)

    def fit(self, train_dataloader, val_dataloader=None, epochs=10, verbose=True):
        """Train for `epochs` epochs, optionally validating each one.

        Args:
            train_dataloader: Iterable of (inputs, targets) training batches.
            val_dataloader: Optional iterable of validation batches.
            epochs: Number of epochs to run.
            verbose: Print a per-epoch progress line when True.

        Returns:
            The accumulated history dictionary.
        """
        print(f"Starting training for {epochs} epochs...")

        for epoch in range(epochs):
            self.current_epoch = epoch

            train_metrics = self.train_epoch(train_dataloader)
            val_metrics = (
                self.validate_epoch(val_dataloader)
                if val_dataloader is not None else {}
            )

            # Record this epoch's results.
            self.history['epoch'].append(epoch)
            self.history['train_loss'].append(train_metrics['loss'])
            if val_dataloader is not None:
                self.history['val_loss'].append(val_metrics['loss'])
            for metric in self.metrics:
                name = metric.__class__.__name__.lower()
                self.history['train_' + name].append(train_metrics[name])
                if val_dataloader is not None:
                    self.history['val_' + name].append(val_metrics[name])

            if verbose:
                parts = [f"Epoch {epoch+1}/{epochs} - train_loss: {train_metrics['loss']:.4f}"]
                if val_dataloader is not None:
                    parts.append(f"val_loss: {val_metrics['loss']:.4f}")
                for metric in self.metrics:
                    name = metric.__class__.__name__.lower()
                    parts.append(f"train_{name}: {train_metrics[name]:.4f}")
                    if val_dataloader is not None:
                        parts.append(f"val_{name}: {val_metrics[name]:.4f}")
                print(" - ".join(parts))

        print("Training completed!")
        return self.history